filer: keep S3 list order byte-lexicographic regardless of SQL name column collation (#9824)

* mysql: keep S3 list order byte-lexicographic regardless of name column collation

ORDER BY name and the name > ? pagination predicate follow the column
collation, so a case-insensitive filemeta.name (e.g. utf8mb3_general_ci)
returns S3 keys out of byte order and breaks clients that merge two sorted
listings.

Detect the live name collation at startup; only when it isn't binary, wrap
the list comparison, prefix, and ORDER BY in BINARY name so order and
pagination stay consistent. Correctly configured utf8mb4_bin tables keep
their indexed range scan unchanged, and the operator gets a warning to
convert the column.

* postgres: keep S3 list order byte-lexicographic regardless of name column collation

ORDER BY name and the name > $n pagination predicate follow the column or
database collation, so a locale-aware filemeta.name (e.g. the en_US.UTF-8
database default) returns S3 keys out of byte order and breaks clients that
merge two sorted listings.

Detect the live name collation at startup; only when it isn't byte-ordered,
wrap the list comparison, prefix, and ORDER BY in COLLATE "C" so order and
pagination stay consistent. A byte-ordered (C/POSIX/C.UTF-8) column keeps its
indexed range scan unchanged, and the operator gets a warning to declare the
column COLLATE "C".
This commit is contained in:
Chris Lu
2026-06-04 14:33:41 -07:00
committed by GitHub
parent 8c2d9f466f
commit a24f4844d3
10 changed files with 222 additions and 8 deletions
+41
View File
@@ -0,0 +1,41 @@
package mysql
import (
"database/sql"
"strings"
"github.com/seaweedfs/seaweedfs/weed/filer/abstract_sql"
"github.com/seaweedfs/seaweedfs/weed/glog"
)
// ConfigureListOrdering probes the collation of the name column on the live
// database and, if it is not a binary collation, sets gen.ForceBinaryCollation
// so the list queries compare and sort with BINARY. That keeps S3
// ListObjectsV2 output lexicographic by byte value.
//
// When the probe fails, gen is left untouched and the error is logged at
// verbosity 1 only. When the fallback is enabled, a warning tells the operator
// to convert the column, because the BINARY form costs a filesort.
func ConfigureListOrdering(db *sql.DB, gen *SqlGenMysql) {
	found, byteOrdered, probeErr := nameColumnCollation(db)
	switch {
	case probeErr != nil:
		glog.V(1).Infof("mysql: skip filemeta.name collation check: %v", probeErr)
	case !byteOrdered:
		// Flip the generator first, then tell the operator why.
		gen.ForceBinaryCollation = true
		glog.Warningf("mysql: filemeta.name collation %q is not binary, so S3 list order is not byte-lexicographic and clients that merge sorted listings may report spurious diffs. Falling back to a slower BINARY sort; convert the name column to a *_bin collation (e.g. utf8mb4_bin) for correct, indexed ordering.", found)
	}
}
// nameColumnCollation looks up COLLATION_NAME for the 'name' column of the
// default filer table in the currently selected database.
//
// It returns the collation (empty when the column reports NULL), whether
// isBinaryCollation considers it byte-ordered, and any error from scanning the
// row. With database/sql, a missing row surfaces here as a Scan error.
func nameColumnCollation(db *sql.DB) (collation string, isBinary bool, err error) {
	const query = "SELECT COLLATION_NAME FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ? AND COLUMN_NAME = 'name'"

	var found sql.NullString
	if scanErr := db.QueryRow(query, abstract_sql.DEFAULT_TABLE).Scan(&found); scanErr != nil {
		return "", false, scanErr
	}

	// A NULL collation leaves found.String empty, which counts as binary.
	collation = found.String
	return collation, isBinaryCollation(collation), nil
}
// isBinaryCollation reports whether the given MySQL collation name orders by
// byte value. Matching ignores case and surrounding whitespace. An empty name
// (the NULL collation of a BINARY column), the literal "binary", and every
// name ending in "_bin" qualify.
func isBinaryCollation(collation string) bool {
	switch normalized := strings.ToLower(strings.TrimSpace(collation)); normalized {
	case "", "binary":
		return true
	default:
		return strings.HasSuffix(normalized, "_bin")
	}
}
+14 -2
View File
@@ -11,6 +11,8 @@ type SqlGenMysql struct {
CreateTableSqlTemplate string
DropTableSqlTemplate string
UpsertQueryTemplate string
// Force byte ordering on a non-binary name column; see ConfigureListOrdering.
ForceBinaryCollation bool
}
// DefaultUpsertQuery keeps INSERTs idempotent so the inode-index KvPut
@@ -48,12 +50,22 @@ func (gen *SqlGenMysql) GetSqlDeleteFolderChildren(tableName string) string {
return fmt.Sprintf("DELETE FROM `%s` WHERE `dirhash` = ? AND `directory` = ?", tableName)
}
// nameExpr returns the SQL expression the list queries use for the name
// column: the bare quoted column by default, or the column prefixed with
// BINARY when ForceBinaryCollation is set, which forces byte ordering on a
// non-binary column at the cost of a filesort.
func (gen *SqlGenMysql) nameExpr() string {
	expr := "`name`"
	if gen.ForceBinaryCollation {
		expr = "BINARY " + expr
	}
	return expr
}
// GetSqlListExclusive builds the directory-listing query that returns entries
// whose name is strictly greater than the pagination bound.
//
// The name comparison, the LIKE prefix filter, and the ORDER BY all use the
// same expression from nameExpr, so ordering and pagination stay consistent
// whether or not BINARY ordering is forced. Placeholders appear in this order:
// dirhash, name bound, directory, LIKE pattern, LIMIT.
//
// The stale unconditional return of the plain `name` query that preceded this
// body has been removed; it made the nameExpr-based query unreachable.
func (gen *SqlGenMysql) GetSqlListExclusive(tableName string) string {
	name := gen.nameExpr()
	return fmt.Sprintf("SELECT `name`, `meta` FROM `%s` WHERE `dirhash` = ? AND %s > ? AND `directory` = ? AND %s LIKE ? ORDER BY %s ASC LIMIT ?", tableName, name, name, name)
}
// GetSqlListInclusive builds the directory-listing query that returns entries
// whose name is greater than or equal to the pagination bound.
//
// The name comparison, the LIKE prefix filter, and the ORDER BY all use the
// same expression from nameExpr, so ordering and pagination stay consistent
// whether or not BINARY ordering is forced. Placeholders appear in this order:
// dirhash, name bound, directory, LIKE pattern, LIMIT.
//
// The stale unconditional return of the plain `name` query that preceded this
// body has been removed; it made the nameExpr-based query unreachable.
func (gen *SqlGenMysql) GetSqlListInclusive(tableName string) string {
	name := gen.nameExpr()
	return fmt.Sprintf("SELECT `name`, `meta` FROM `%s` WHERE `dirhash` = ? AND %s >= ? AND `directory` = ? AND %s LIKE ? ORDER BY %s ASC LIMIT ?", tableName, name, name, name)
}
func (gen *SqlGenMysql) GetSqlCreateTable(tableName string) string {
+42
View File
@@ -26,3 +26,45 @@ func TestEmptyUpsertTemplateFallsBackToPlainInsert(t *testing.T) {
t.Fatalf("plain INSERT path should not contain ON DUPLICATE KEY UPDATE, got: %s", got)
}
}
// TestListSqlDefaultOrderingFollowsColumn checks that a zero-value generator
// emits list queries that order by the plain name column, with no BINARY
// anywhere in the statement.
func TestListSqlDefaultOrderingFollowsColumn(t *testing.T) {
	var gen SqlGenMysql
	queries := [2]string{
		gen.GetSqlListExclusive("filemeta"),
		gen.GetSqlListInclusive("filemeta"),
	}
	for _, query := range queries {
		switch {
		case strings.Contains(query, "BINARY"):
			t.Fatalf("default list query should not force BINARY, got: %s", query)
		case !strings.Contains(query, "ORDER BY `name` ASC"):
			t.Fatalf("expected plain name ordering, got: %s", query)
		}
	}
}
// TestListSqlBinaryOrderingOnNonBinaryColumn checks that with
// ForceBinaryCollation set, both list queries apply BINARY to the ORDER BY,
// to the LIKE prefix filter, and to the pagination comparison.
func TestListSqlBinaryOrderingOnNonBinaryColumn(t *testing.T) {
	gen := SqlGenMysql{ForceBinaryCollation: true}
	queries := [2]string{
		gen.GetSqlListExclusive("filemeta"),
		gen.GetSqlListInclusive("filemeta"),
	}
	for _, query := range queries {
		// A bare `name` comparison would paginate in column-collation order.
		plainCompare := strings.Contains(query, "AND `name` > ?") || strings.Contains(query, "AND `name` >= ?")
		switch {
		case !strings.Contains(query, "ORDER BY BINARY `name` ASC"):
			t.Fatalf("expected BINARY ordering, got: %s", query)
		case !strings.Contains(query, "BINARY `name` LIKE ?"):
			t.Fatalf("expected BINARY prefix filter, got: %s", query)
		case plainCompare:
			t.Fatalf("pagination comparison must also be BINARY, got: %s", query)
		}
	}
}
// TestIsBinaryCollation runs isBinaryCollation over collation names that must
// be classified as binary (empty, "binary", *_bin in any case) followed by
// case-insensitive collations that must not be.
func TestIsBinaryCollation(t *testing.T) {
	cases := []struct {
		collation  string
		wantBinary bool
	}{
		{"", true},
		{"binary", true},
		{"utf8mb4_bin", true},
		{"utf8mb3_bin", true},
		{"latin1_bin", true},
		{"UTF8MB4_BIN", true},
		{"utf8mb4_general_ci", false},
		{"utf8mb3_general_ci", false},
		{"utf8mb4_0900_ai_ci", false},
		{"latin1_swedish_ci", false},
	}
	for _, tc := range cases {
		if isBinaryCollation(tc.collation) == tc.wantBinary {
			continue
		}
		if tc.wantBinary {
			t.Fatalf("expected %q to be treated as binary", tc.collation)
		}
		t.Fatalf("expected %q to be treated as non-binary", tc.collation)
	}
}
+4 -1
View File
@@ -70,11 +70,12 @@ func (store *MysqlStore) initialize(dsn string, upsertQuery string, enableUpsert
} else if upsertQuery == "" {
upsertQuery = DefaultUpsertQuery
}
store.SqlGenerator = &SqlGenMysql{
gen := &SqlGenMysql{
CreateTableSqlTemplate: "",
DropTableSqlTemplate: "DROP TABLE `%s`",
UpsertQueryTemplate: upsertQuery,
}
store.SqlGenerator = gen
store.RetryableErrorCallback = func(err error) bool {
var mysqlError *mysql.MySQLError
@@ -155,6 +156,8 @@ func (store *MysqlStore) initialize(dsn string, upsertQuery string, enableUpsert
return fmt.Errorf("connect to %s error:%v", maskedDSN(cfg), err)
}
ConfigureListOrdering(store.DB, gen)
return nil
}
+4 -1
View File
@@ -63,11 +63,12 @@ func (store *MysqlStore2) initialize(createTable, upsertQuery string, enableUpse
} else if upsertQuery == "" {
upsertQuery = mysql.DefaultUpsertQuery
}
store.SqlGenerator = &mysql.SqlGenMysql{
gen := &mysql.SqlGenMysql{
CreateTableSqlTemplate: createTable,
DropTableSqlTemplate: "DROP TABLE `%s`",
UpsertQueryTemplate: upsertQuery,
}
store.SqlGenerator = gen
sqlUrl := fmt.Sprintf(CONNECTION_URL_PATTERN, user, password, hostname, port, database)
adaptedSqlUrl := fmt.Sprintf(CONNECTION_URL_PATTERN, user, "<ADAPTED>", hostname, port, database)
@@ -96,5 +97,7 @@ func (store *MysqlStore2) initialize(createTable, upsertQuery string, enableUpse
return fmt.Errorf("init table %s: %v", abstract_sql.DEFAULT_TABLE, err)
}
mysql.ConfigureListOrdering(store.DB, gen)
return nil
}
+53
View File
@@ -0,0 +1,53 @@
package postgres
import (
"database/sql"
"strings"
"github.com/seaweedfs/seaweedfs/weed/filer/abstract_sql"
"github.com/seaweedfs/seaweedfs/weed/glog"
)
// ConfigureListOrdering probes the effective collation of the name column on
// the live database and, if it is locale-aware, sets gen.ForceBinaryCollation
// so the list queries compare and sort with COLLATE "C". That keeps S3
// ListObjectsV2 output lexicographic by byte value.
//
// When the probe fails, gen is left untouched and the error is logged at
// verbosity 1 only. When the fallback is enabled, a warning tells the operator
// to fix the column, because the COLLATE "C" form costs a sort.
func ConfigureListOrdering(db *sql.DB, gen *SqlGenPostgres) {
	found, byteOrdered, probeErr := nameColumnCollation(db)
	switch {
	case probeErr != nil:
		glog.V(1).Infof("postgres: skip filemeta.name collation check: %v", probeErr)
	case !byteOrdered:
		// Flip the generator first, then tell the operator why.
		gen.ForceBinaryCollation = true
		glog.Warningf(`postgres: filemeta.name collation %q is not byte-ordered, so S3 list order is not byte-lexicographic and clients that merge sorted listings may report spurious diffs. Falling back to a slower COLLATE "C" sort; declare the name column COLLATE "C" (or use a C/C.UTF-8 database collation) for correct, indexed ordering.`, found)
	}
}
// nameColumnCollation resolves the collation that effectively orders the
// 'name' column of the default filer table in the current schema.
//
// It returns that collation, whether isByteOrderedCollation considers it
// byte-ordered, and any error from scanning the single result row.
//
// NOTE(review): only datcollate is consulted for the database default. On
// servers where the database uses a non-libc locale provider, datcollate may
// not describe the real ordering — confirm against the deployed versions.
func nameColumnCollation(db *sql.DB) (collation string, isBinary bool, err error) {
	// One round trip: the column's explicit collation (NULL when the column
	// simply inherits the database default) next to the database's datcollate.
	const query = `SELECT
(SELECT collation_name FROM information_schema.columns
WHERE table_schema = current_schema() AND table_name = $1 AND column_name = 'name'),
(SELECT datcollate FROM pg_database WHERE datname = current_database())`

	var fromColumn, fromDatabase sql.NullString
	if scanErr := db.QueryRow(query, abstract_sql.DEFAULT_TABLE).Scan(&fromColumn, &fromDatabase); scanErr != nil {
		return "", false, scanErr
	}

	// Prefer an explicit column collation; otherwise the database default
	// decides the ordering.
	collation = fromDatabase.String
	if fromColumn.Valid && fromColumn.String != "" {
		collation = fromColumn.String
	}
	return collation, isByteOrderedCollation(collation), nil
}
// isByteOrderedCollation reports whether a Postgres collation or locale name
// sorts by byte value rather than by locale rules (en_US.UTF-8, ICU, ...).
// Matching ignores case and surrounding whitespace.
//
// Recognized names:
//   - C, POSIX, C.UTF-8 / C.utf8: the byte-ordered libc locales.
//   - ucs_basic, pg_c_utf8: collations that sort by Unicode code point and are
//     only available for UTF8 databases, where code-point order and byte order
//     coincide.
//
// Any other name, including the empty string, is treated as locale-aware so
// the caller falls back to the safe COLLATE "C" path.
func isByteOrderedCollation(name string) bool {
	switch strings.ToLower(strings.TrimSpace(name)) {
	case "c", "posix", "c.utf-8", "c.utf8", "ucs_basic", "pg_c_utf8":
		return true
	}
	return false
}
+14 -2
View File
@@ -11,6 +11,8 @@ type SqlGenPostgres struct {
CreateTableSqlTemplate string
DropTableSqlTemplate string
UpsertQueryTemplate string
// Force byte ordering on a locale-aware name column; see ConfigureListOrdering.
ForceBinaryCollation bool
}
// DefaultUpsertQuery keeps INSERTs idempotent so a duplicate-key failure
@@ -46,12 +48,22 @@ func (gen *SqlGenPostgres) GetSqlDeleteFolderChildren(tableName string) string {
return fmt.Sprintf(`DELETE FROM "%s" WHERE dirhash=$1 AND directory=$2`, tableName)
}
// nameExpr returns the SQL expression the list queries use for the name
// column: the bare column by default, or the column with COLLATE "C" applied
// when ForceBinaryCollation is set, which forces byte ordering on a
// locale-aware column.
func (gen *SqlGenPostgres) nameExpr() string {
	if !gen.ForceBinaryCollation {
		return "name"
	}
	return `name COLLATE "C"`
}
// GetSqlListExclusive builds the directory-listing query that returns entries
// whose name is strictly greater than the pagination bound ($2).
//
// The name comparison, the LIKE prefix filter, and the ORDER BY all use the
// same expression from nameExpr, so ordering and pagination stay consistent
// whether or not COLLATE "C" is forced. Parameters: $1 dirhash, $2 name bound,
// $3 directory, $4 LIKE pattern, $5 LIMIT.
//
// The stale unconditional return of the plain name query that preceded this
// body has been removed; it made the nameExpr-based query unreachable.
func (gen *SqlGenPostgres) GetSqlListExclusive(tableName string) string {
	name := gen.nameExpr()
	return fmt.Sprintf(`SELECT NAME, meta FROM "%s" WHERE dirhash=$1 AND %s>$2 AND directory=$3 AND %s like $4 ORDER BY %s ASC LIMIT $5`, tableName, name, name, name)
}
// GetSqlListInclusive builds the directory-listing query that returns entries
// whose name is greater than or equal to the pagination bound ($2).
//
// The name comparison, the LIKE prefix filter, and the ORDER BY all use the
// same expression from nameExpr, so ordering and pagination stay consistent
// whether or not COLLATE "C" is forced. Parameters: $1 dirhash, $2 name bound,
// $3 directory, $4 LIKE pattern, $5 LIMIT.
//
// The stale unconditional return of the plain name query that preceded this
// body has been removed; it made the nameExpr-based query unreachable.
func (gen *SqlGenPostgres) GetSqlListInclusive(tableName string) string {
	name := gen.nameExpr()
	return fmt.Sprintf(`SELECT NAME, meta FROM "%s" WHERE dirhash=$1 AND %s>=$2 AND directory=$3 AND %s like $4 ORDER BY %s ASC LIMIT $5`, tableName, name, name, name)
}
func (gen *SqlGenPostgres) GetSqlCreateTable(tableName string) string {
@@ -23,3 +23,45 @@ func TestEmptyUpsertTemplateFallsBackToPlainInsert(t *testing.T) {
t.Fatalf("plain INSERT path should not contain ON CONFLICT, got: %s", got)
}
}
// TestListSqlDefaultOrderingFollowsColumn checks that a zero-value generator
// emits list queries that order by the plain name column, with no COLLATE
// clause anywhere in the statement.
func TestListSqlDefaultOrderingFollowsColumn(t *testing.T) {
	var gen SqlGenPostgres
	queries := [2]string{
		gen.GetSqlListExclusive("filemeta"),
		gen.GetSqlListInclusive("filemeta"),
	}
	for _, query := range queries {
		switch {
		case strings.Contains(query, "COLLATE"):
			t.Fatalf("default list query should not force a collation, got: %s", query)
		case !strings.Contains(query, "ORDER BY name ASC"):
			t.Fatalf("expected plain name ordering, got: %s", query)
		}
	}
}
// TestListSqlBinaryOrderingOnNonBinaryColumn checks that with
// ForceBinaryCollation set, both list queries apply COLLATE "C" to the
// ORDER BY, to the LIKE prefix filter, and to the pagination comparison.
func TestListSqlBinaryOrderingOnNonBinaryColumn(t *testing.T) {
	gen := SqlGenPostgres{ForceBinaryCollation: true}
	queries := [2]string{
		gen.GetSqlListExclusive("filemeta"),
		gen.GetSqlListInclusive("filemeta"),
	}
	for _, query := range queries {
		// A bare name comparison would paginate in locale order.
		plainCompare := strings.Contains(query, "AND name>$2") || strings.Contains(query, "AND name>=$2")
		switch {
		case !strings.Contains(query, `ORDER BY name COLLATE "C" ASC`):
			t.Fatalf(`expected COLLATE "C" ordering, got: %s`, query)
		case !strings.Contains(query, `name COLLATE "C" like $4`):
			t.Fatalf(`expected COLLATE "C" prefix filter, got: %s`, query)
		case plainCompare:
			t.Fatalf("pagination comparison must also be byte-ordered, got: %s", query)
		}
	}
}
// TestIsByteOrderedCollation runs isByteOrderedCollation over names that must
// be classified as byte-ordered (C, POSIX, C.UTF-8 in various spellings)
// followed by locale-aware names, including the empty string, that must not be.
func TestIsByteOrderedCollation(t *testing.T) {
	cases := []struct {
		name            string
		wantByteOrdered bool
	}{
		{"C", true},
		{"POSIX", true},
		{"c", true},
		{"C.UTF-8", true},
		{"C.utf8", true},
		{"en_US.UTF-8", false},
		{"en_US.utf8", false},
		{"en-US-x-icu", false},
		{"und-x-icu", false},
		{"", false},
	}
	for _, tc := range cases {
		if isByteOrderedCollation(tc.name) == tc.wantByteOrdered {
			continue
		}
		if tc.wantByteOrdered {
			t.Fatalf("expected %q to be treated as byte-ordered", tc.name)
		}
		t.Fatalf("expected %q to be treated as locale-aware", tc.name)
	}
}
+4 -1
View File
@@ -62,11 +62,12 @@ func (store *PostgresStore) initialize(upsertQuery string, enableUpsert bool, us
} else if upsertQuery == "" {
upsertQuery = DefaultUpsertQuery
}
store.SqlGenerator = &SqlGenPostgres{
gen := &SqlGenPostgres{
CreateTableSqlTemplate: "",
DropTableSqlTemplate: `drop table "%s"`,
UpsertQueryTemplate: upsertQuery,
}
store.SqlGenerator = gen
// pgx-optimized connection string with better timeouts and connection handling
sqlUrl := "connect_timeout=30"
@@ -116,5 +117,7 @@ func (store *PostgresStore) initialize(upsertQuery string, enableUpsert bool, us
}
store.DB = db
ConfigureListOrdering(store.DB, gen)
return nil
}
+4 -1
View File
@@ -68,11 +68,12 @@ func (store *PostgresStore2) initialize(createTable, upsertQuery string, enableU
} else if upsertQuery == "" {
upsertQuery = postgres.DefaultUpsertQuery
}
store.SqlGenerator = &postgres.SqlGenPostgres{
gen := &postgres.SqlGenPostgres{
CreateTableSqlTemplate: createTable,
DropTableSqlTemplate: `drop table "%s"`,
UpsertQueryTemplate: upsertQuery,
}
store.SqlGenerator = gen
// pgx-optimized connection string with better timeouts and connection handling
sqlUrl := "connect_timeout=30"
@@ -126,5 +127,7 @@ func (store *PostgresStore2) initialize(createTable, upsertQuery string, enableU
return fmt.Errorf("init table %s: %v", abstract_sql.DEFAULT_TABLE, err)
}
postgres.ConfigureListOrdering(store.DB, gen)
return nil
}