diff --git a/weed/filer/mysql/mysql_collation.go b/weed/filer/mysql/mysql_collation.go
new file mode 100644
index 000000000..4cb362a07
--- /dev/null
+++ b/weed/filer/mysql/mysql_collation.go
@@ -0,0 +1,43 @@
+package mysql
+
+import (
+	"database/sql"
+	"strings"
+
+	"github.com/seaweedfs/seaweedfs/weed/filer/abstract_sql"
+	"github.com/seaweedfs/seaweedfs/weed/glog"
+)
+
+// ConfigureListOrdering switches the list queries to BINARY byte ordering when
+// the live filemeta.name collation is not binary, so S3 ListObjectsV2 stays
+// lexicographic. The fallback costs a filesort, so warn the operator to fix it.
+func ConfigureListOrdering(db *sql.DB, gen *SqlGenMysql) {
+	collation, isBinary, err := nameColumnCollation(db)
+	if err != nil {
+		glog.V(1).Infof("mysql: skip filemeta.name collation check: %v", err)
+		return
+	}
+	if isBinary {
+		return
+	}
+	gen.ForceBinaryCollation = true
+	glog.Warningf("mysql: filemeta.name collation %q is not binary, so S3 list order is not byte-lexicographic and clients that merge sorted listings may report spurious diffs. Falling back to a slower BINARY sort; convert the name column to a *_bin collation (e.g. utf8mb4_bin) for correct, indexed ordering.", collation)
+}
+
+// nameColumnCollation reads the collation of the default filer table's name
+// column in the current database and reports whether it is binary.
+func nameColumnCollation(db *sql.DB) (collation string, isBinary bool, err error) {
+	row := db.QueryRow("SELECT COLLATION_NAME FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ? AND COLUMN_NAME = 'name'", abstract_sql.DEFAULT_TABLE)
+	var c sql.NullString
+	if err = row.Scan(&c); err != nil {
+		return "", false, err
+	}
+	return c.String, isBinaryCollation(c.String), nil
+}
+
+// isBinaryCollation reports whether a collation orders by byte value: a NULL
+// collation (BINARY column) or any *_bin collation.
+func isBinaryCollation(collation string) bool {
+	c := strings.ToLower(strings.TrimSpace(collation))
+	return c == "" || c == "binary" || strings.HasSuffix(c, "_bin")
+}
diff --git a/weed/filer/mysql/mysql_sql_gen.go b/weed/filer/mysql/mysql_sql_gen.go
index 8e77bab8a..30f470d0d 100644
--- a/weed/filer/mysql/mysql_sql_gen.go
+++ b/weed/filer/mysql/mysql_sql_gen.go
@@ -11,6 +11,8 @@ type SqlGenMysql struct {
 	CreateTableSqlTemplate string
 	DropTableSqlTemplate   string
 	UpsertQueryTemplate    string
+	// Force byte ordering on a non-binary name column; see ConfigureListOrdering.
+	ForceBinaryCollation   bool
 }
 
 // DefaultUpsertQuery keeps INSERTs idempotent so the inode-index KvPut
@@ -48,12 +50,22 @@ func (gen *SqlGenMysql) GetSqlDeleteFolderChildren(tableName string) string {
 	return fmt.Sprintf("DELETE FROM `%s` WHERE `dirhash` = ? AND `directory` = ?", tableName)
 }
 
+// nameExpr forces byte ordering on a non-binary column at the cost of a filesort.
+func (gen *SqlGenMysql) nameExpr() string {
+	if gen.ForceBinaryCollation {
+		return "BINARY `name`"
+	}
+	return "`name`"
+}
+
 func (gen *SqlGenMysql) GetSqlListExclusive(tableName string) string {
-	return fmt.Sprintf("SELECT `name`, `meta` FROM `%s` WHERE `dirhash` = ? AND `name` > ? AND `directory` = ? AND `name` LIKE ? ORDER BY `name` ASC LIMIT ?", tableName)
+	name := gen.nameExpr()
+	return fmt.Sprintf("SELECT `name`, `meta` FROM `%s` WHERE `dirhash` = ? AND %s > ? AND `directory` = ? AND %s LIKE ? ORDER BY %s ASC LIMIT ?", tableName, name, name, name)
 }
 
 func (gen *SqlGenMysql) GetSqlListInclusive(tableName string) string {
-	return fmt.Sprintf("SELECT `name`, `meta` FROM `%s` WHERE `dirhash` = ? AND `name` >= ? AND `directory` = ? AND `name` LIKE ? ORDER BY `name` ASC LIMIT ?", tableName)
+	name := gen.nameExpr()
+	return fmt.Sprintf("SELECT `name`, `meta` FROM `%s` WHERE `dirhash` = ? AND %s >= ? AND `directory` = ? AND %s LIKE ? ORDER BY %s ASC LIMIT ?", tableName, name, name, name)
 }
 
 func (gen *SqlGenMysql) GetSqlCreateTable(tableName string) string {
diff --git a/weed/filer/mysql/mysql_sql_gen_test.go b/weed/filer/mysql/mysql_sql_gen_test.go
index 74303b764..f72607a32 100644
--- a/weed/filer/mysql/mysql_sql_gen_test.go
+++ b/weed/filer/mysql/mysql_sql_gen_test.go
@@ -26,3 +26,45 @@ func TestEmptyUpsertTemplateFallsBackToPlainInsert(t *testing.T) {
 		t.Fatalf("plain INSERT path should not contain ON DUPLICATE KEY UPDATE, got: %s", got)
 	}
 }
+
+func TestListSqlDefaultOrderingFollowsColumn(t *testing.T) {
+	gen := &SqlGenMysql{}
+	for _, got := range []string{gen.GetSqlListExclusive("filemeta"), gen.GetSqlListInclusive("filemeta")} {
+		if strings.Contains(got, "BINARY") {
+			t.Fatalf("default list query should not force BINARY, got: %s", got)
+		}
+		if !strings.Contains(got, "ORDER BY `name` ASC") {
+			t.Fatalf("expected plain name ordering, got: %s", got)
+		}
+	}
+}
+
+func TestListSqlBinaryOrderingOnNonBinaryColumn(t *testing.T) {
+	gen := &SqlGenMysql{ForceBinaryCollation: true}
+	for _, got := range []string{gen.GetSqlListExclusive("filemeta"), gen.GetSqlListInclusive("filemeta")} {
+		if !strings.Contains(got, "ORDER BY BINARY `name` ASC") {
+			t.Fatalf("expected BINARY ordering, got: %s", got)
+		}
+		if !strings.Contains(got, "BINARY `name` LIKE ?") {
+			t.Fatalf("expected BINARY prefix filter, got: %s", got)
+		}
+		if strings.Contains(got, "AND `name` > ?") || strings.Contains(got, "AND `name` >= ?") {
+			t.Fatalf("pagination comparison must also be BINARY, got: %s", got)
+		}
+	}
+}
+
+func TestIsBinaryCollation(t *testing.T) {
+	binary := []string{"", "binary", "utf8mb4_bin", "utf8mb3_bin", "latin1_bin", "UTF8MB4_BIN"}
+	for _, c := range binary {
+		if !isBinaryCollation(c) {
+			t.Fatalf("expected %q to be treated as binary", c)
+		}
+	}
+	ci := []string{"utf8mb4_general_ci", "utf8mb3_general_ci", "utf8mb4_0900_ai_ci", "latin1_swedish_ci"}
+	for _, c := range ci {
+		if isBinaryCollation(c) {
+			t.Fatalf("expected %q to be treated as non-binary", c)
+		}
+	}
+}
diff --git a/weed/filer/mysql/mysql_store.go b/weed/filer/mysql/mysql_store.go
index b5b5ef025..3513b5523 100644
--- a/weed/filer/mysql/mysql_store.go
+++ b/weed/filer/mysql/mysql_store.go
@@ -70,11 +70,12 @@ func (store *MysqlStore) initialize(dsn string, upsertQuery string, enableUpsert
 	} else if upsertQuery == "" {
 		upsertQuery = DefaultUpsertQuery
 	}
-	store.SqlGenerator = &SqlGenMysql{
+	gen := &SqlGenMysql{
 		CreateTableSqlTemplate: "",
 		DropTableSqlTemplate:   "DROP TABLE `%s`",
 		UpsertQueryTemplate:    upsertQuery,
 	}
+	store.SqlGenerator = gen
 
 	store.RetryableErrorCallback = func(err error) bool {
 		var mysqlError *mysql.MySQLError
@@ -155,6 +156,8 @@ func (store *MysqlStore) initialize(dsn string, upsertQuery string, enableUpsert
 		return fmt.Errorf("connect to %s error:%v", maskedDSN(cfg), err)
 	}
 
+	ConfigureListOrdering(store.DB, gen)
+
 	return nil
 }
 
diff --git a/weed/filer/mysql2/mysql2_store.go b/weed/filer/mysql2/mysql2_store.go
index 3dde672b0..d1f13b9ae 100644
--- a/weed/filer/mysql2/mysql2_store.go
+++ b/weed/filer/mysql2/mysql2_store.go
@@ -63,11 +63,12 @@ func (store *MysqlStore2) initialize(createTable, upsertQuery string, enableUpse
 	} else if upsertQuery == "" {
 		upsertQuery = mysql.DefaultUpsertQuery
 	}
-	store.SqlGenerator = &mysql.SqlGenMysql{
+	gen := &mysql.SqlGenMysql{
 		CreateTableSqlTemplate: createTable,
 		DropTableSqlTemplate:   "DROP TABLE `%s`",
 		UpsertQueryTemplate:    upsertQuery,
 	}
+	store.SqlGenerator = gen
 
 	sqlUrl := fmt.Sprintf(CONNECTION_URL_PATTERN, user, password, hostname, port, database)
 	adaptedSqlUrl := fmt.Sprintf(CONNECTION_URL_PATTERN, user, "", hostname, port, database)
@@ -96,5 +97,7 @@ func (store *MysqlStore2) initialize(createTable, upsertQuery string, enableUpse
 		return fmt.Errorf("init table %s: %v", abstract_sql.DEFAULT_TABLE, err)
 	}
 
+	mysql.ConfigureListOrdering(store.DB, gen)
+
 	return nil
 }
diff --git a/weed/filer/postgres/postgres_collation.go b/weed/filer/postgres/postgres_collation.go
new file mode 100644
index 000000000..02240fe96
--- /dev/null
+++ b/weed/filer/postgres/postgres_collation.go
@@ -0,0 +1,56 @@
+package postgres
+
+import (
+	"database/sql"
+	"strings"
+
+	"github.com/seaweedfs/seaweedfs/weed/filer/abstract_sql"
+	"github.com/seaweedfs/seaweedfs/weed/glog"
+)
+
+// ConfigureListOrdering switches the list queries to COLLATE "C" when the live
+// filemeta.name collation is locale-aware, so S3 ListObjectsV2 stays
+// lexicographic. The fallback costs a sort, so warn the operator to fix it.
+func ConfigureListOrdering(db *sql.DB, gen *SqlGenPostgres) {
+	collation, isBinary, err := nameColumnCollation(db)
+	if err != nil {
+		glog.V(1).Infof("postgres: skip filemeta.name collation check: %v", err)
+		return
+	}
+	if isBinary {
+		return
+	}
+	gen.ForceBinaryCollation = true
+	glog.Warningf(`postgres: filemeta.name collation %q is not byte-ordered, so S3 list order is not byte-lexicographic and clients that merge sorted listings may report spurious diffs. Falling back to a slower COLLATE "C" sort; declare the name column COLLATE "C" (or use a C/C.UTF-8 database collation) for correct, indexed ordering.`, collation)
+}
+
+// nameColumnCollation returns the collation that governs ordering of the
+// default filer table's name column and whether it sorts by byte value.
+func nameColumnCollation(db *sql.DB) (collation string, isBinary bool, err error) {
+	// information_schema reports NULL for a column on the database default, so
+	// fall back to datcollate for the effective ordering.
+	row := db.QueryRow(`SELECT
+		(SELECT collation_name FROM information_schema.columns
+			WHERE table_schema = current_schema() AND table_name = $1 AND column_name = 'name'),
+		(SELECT datcollate FROM pg_database WHERE datname = current_database())`, abstract_sql.DEFAULT_TABLE)
+	var columnCollation, dbCollate sql.NullString
+	if err = row.Scan(&columnCollation, &dbCollate); err != nil {
+		return "", false, err
+	}
+	effective := columnCollation.String
+	if !columnCollation.Valid || effective == "" {
+		effective = dbCollate.String
+	}
+	return effective, isByteOrderedCollation(effective), nil
+}
+
+// isByteOrderedCollation reports whether a Postgres collation sorts by byte
+// value (C, POSIX, C.UTF-8, and the code-point-ordered ucs_basic and
+// pg_c_utf8) rather than locale order (en_US.UTF-8, ICU, ...).
+func isByteOrderedCollation(name string) bool {
+	switch strings.ToLower(strings.TrimSpace(name)) {
+	case "c", "posix", "c.utf-8", "c.utf8", "ucs_basic", "pg_c_utf8":
+		return true
+	}
+	return false
+}
diff --git a/weed/filer/postgres/postgres_sql_gen.go b/weed/filer/postgres/postgres_sql_gen.go
index 89696d810..c16b0d9d0 100644
--- a/weed/filer/postgres/postgres_sql_gen.go
+++ b/weed/filer/postgres/postgres_sql_gen.go
@@ -11,6 +11,8 @@ type SqlGenPostgres struct {
 	CreateTableSqlTemplate string
 	DropTableSqlTemplate   string
 	UpsertQueryTemplate    string
+	// Force byte ordering on a locale-aware name column; see ConfigureListOrdering.
+	ForceBinaryCollation   bool
 }
 
 // DefaultUpsertQuery keeps INSERTs idempotent so a duplicate-key failure
@@ -46,12 +48,22 @@ func (gen *SqlGenPostgres) GetSqlDeleteFolderChildren(tableName string) string {
 	return fmt.Sprintf(`DELETE FROM "%s" WHERE dirhash=$1 AND directory=$2`, tableName)
 }
 
+// nameExpr forces byte ordering on a locale-aware column via COLLATE "C".
+func (gen *SqlGenPostgres) nameExpr() string {
+	if gen.ForceBinaryCollation {
+		return `name COLLATE "C"`
+	}
+	return "name"
+}
+
 func (gen *SqlGenPostgres) GetSqlListExclusive(tableName string) string {
-	return fmt.Sprintf(`SELECT NAME, meta FROM "%s" WHERE dirhash=$1 AND name>$2 AND directory=$3 AND name like $4 ORDER BY NAME ASC LIMIT $5`, tableName)
+	name := gen.nameExpr()
+	return fmt.Sprintf(`SELECT NAME, meta FROM "%s" WHERE dirhash=$1 AND %s>$2 AND directory=$3 AND %s like $4 ORDER BY %s ASC LIMIT $5`, tableName, name, name, name)
 }
 
 func (gen *SqlGenPostgres) GetSqlListInclusive(tableName string) string {
-	return fmt.Sprintf(`SELECT NAME, meta FROM "%s" WHERE dirhash=$1 AND name>=$2 AND directory=$3 AND name like $4 ORDER BY NAME ASC LIMIT $5`, tableName)
+	name := gen.nameExpr()
+	return fmt.Sprintf(`SELECT NAME, meta FROM "%s" WHERE dirhash=$1 AND %s>=$2 AND directory=$3 AND %s like $4 ORDER BY %s ASC LIMIT $5`, tableName, name, name, name)
 }
 
 func (gen *SqlGenPostgres) GetSqlCreateTable(tableName string) string {
diff --git a/weed/filer/postgres/postgres_sql_gen_test.go b/weed/filer/postgres/postgres_sql_gen_test.go
index 16039803c..f539bfb89 100644
--- a/weed/filer/postgres/postgres_sql_gen_test.go
+++ b/weed/filer/postgres/postgres_sql_gen_test.go
@@ -23,3 +23,45 @@ func TestEmptyUpsertTemplateFallsBackToPlainInsert(t *testing.T) {
 		t.Fatalf("plain INSERT path should not contain ON CONFLICT, got: %s", got)
 	}
 }
+
+func TestListSqlDefaultOrderingFollowsColumn(t *testing.T) {
+	gen := &SqlGenPostgres{}
+	for _, got := range []string{gen.GetSqlListExclusive("filemeta"), gen.GetSqlListInclusive("filemeta")} {
+		if strings.Contains(got, "COLLATE") {
+			t.Fatalf("default list query should not force a collation, got: %s", got)
+		}
+		if !strings.Contains(got, "ORDER BY name ASC") {
+			t.Fatalf("expected plain name ordering, got: %s", got)
+		}
+	}
+}
+
+func TestListSqlBinaryOrderingOnNonBinaryColumn(t *testing.T) {
+	gen := &SqlGenPostgres{ForceBinaryCollation: true}
+	for _, got := range []string{gen.GetSqlListExclusive("filemeta"), gen.GetSqlListInclusive("filemeta")} {
+		if !strings.Contains(got, `ORDER BY name COLLATE "C" ASC`) {
+			t.Fatalf(`expected COLLATE "C" ordering, got: %s`, got)
+		}
+		if !strings.Contains(got, `name COLLATE "C" like $4`) {
+			t.Fatalf(`expected COLLATE "C" prefix filter, got: %s`, got)
+		}
+		if strings.Contains(got, "AND name>$2") || strings.Contains(got, "AND name>=$2") {
+			t.Fatalf("pagination comparison must also be byte-ordered, got: %s", got)
+		}
+	}
+}
+
+func TestIsByteOrderedCollation(t *testing.T) {
+	byteOrdered := []string{"C", "POSIX", "c", "C.UTF-8", "C.utf8", "ucs_basic", "pg_c_utf8"}
+	for _, c := range byteOrdered {
+		if !isByteOrderedCollation(c) {
+			t.Fatalf("expected %q to be treated as byte-ordered", c)
+		}
+	}
+	locale := []string{"en_US.UTF-8", "en_US.utf8", "en-US-x-icu", "und-x-icu", ""}
+	for _, c := range locale {
+		if isByteOrderedCollation(c) {
+			t.Fatalf("expected %q to be treated as locale-aware", c)
+		}
+	}
+}
diff --git a/weed/filer/postgres/postgres_store.go b/weed/filer/postgres/postgres_store.go
index 97f90b83f..a2388624b 100644
--- a/weed/filer/postgres/postgres_store.go
+++ b/weed/filer/postgres/postgres_store.go
@@ -62,11 +62,12 @@ func (store *PostgresStore) initialize(upsertQuery string, enableUpsert bool, us
 	} else if upsertQuery == "" {
 		upsertQuery = DefaultUpsertQuery
 	}
-	store.SqlGenerator = &SqlGenPostgres{
+	gen := &SqlGenPostgres{
 		CreateTableSqlTemplate: "",
 		DropTableSqlTemplate:   `drop table "%s"`,
 		UpsertQueryTemplate:    upsertQuery,
 	}
+	store.SqlGenerator = gen
 
 	// pgx-optimized connection string with better timeouts and connection handling
 	sqlUrl := "connect_timeout=30"
@@ -116,5 +117,7 @@ func (store *PostgresStore) initialize(upsertQuery string, enableUpsert bool, us
 	}
 	store.DB = db
 
+	ConfigureListOrdering(store.DB, gen)
+
 	return nil
 }
diff --git a/weed/filer/postgres2/postgres2_store.go b/weed/filer/postgres2/postgres2_store.go
index 3861e0689..b818e550d 100644
--- a/weed/filer/postgres2/postgres2_store.go
+++ b/weed/filer/postgres2/postgres2_store.go
@@ -68,11 +68,12 @@ func (store *PostgresStore2) initialize(createTable, upsertQuery string, enableU
 	} else if upsertQuery == "" {
 		upsertQuery = postgres.DefaultUpsertQuery
 	}
-	store.SqlGenerator = &postgres.SqlGenPostgres{
+	gen := &postgres.SqlGenPostgres{
 		CreateTableSqlTemplate: createTable,
 		DropTableSqlTemplate:   `drop table "%s"`,
 		UpsertQueryTemplate:    upsertQuery,
 	}
+	store.SqlGenerator = gen
 
 	// pgx-optimized connection string with better timeouts and connection handling
 	sqlUrl := "connect_timeout=30"
@@ -126,5 +127,7 @@ func (store *PostgresStore2) initialize(createTable, upsertQuery string, enableU
 		return fmt.Errorf("init table %s: %v", abstract_sql.DEFAULT_TABLE, err)
 	}
 
+	postgres.ConfigureListOrdering(store.DB, gen)
+
 	return nil
 }