diff --git a/weed/mount/weedfs_dir_lookup.go b/weed/mount/weedfs_dir_lookup.go index 1b5253354..319a476e2 100644 --- a/weed/mount/weedfs_dir_lookup.go +++ b/weed/mount/weedfs_dir_lookup.go @@ -14,7 +14,8 @@ import ( func (wfs *WFS) Lookup(cancel <-chan struct{}, header *fuse.InHeader, name string, out *fuse.EntryOut) (code fuse.Status) { - if s := checkName(name); s != fuse.OK { + var s fuse.Status + if name, s = checkName(name); s != fuse.OK { return s } diff --git a/weed/mount/weedfs_dir_mkrm.go b/weed/mount/weedfs_dir_mkrm.go index 67705d1e4..ef0f8ab43 100644 --- a/weed/mount/weedfs_dir_mkrm.go +++ b/weed/mount/weedfs_dir_mkrm.go @@ -26,7 +26,8 @@ func (wfs *WFS) Mkdir(cancel <-chan struct{}, in *fuse.MkdirIn, name string, out return fuse.Status(syscall.ENOSPC) } - if s := checkName(name); s != fuse.OK { + var s fuse.Status + if name, s = checkName(name); s != fuse.OK { return s } @@ -65,7 +66,12 @@ func (wfs *WFS) Mkdir(cancel <-chan struct{}, in *fuse.MkdirIn, name string, out // explicitly below instead of using defer so the kernel gets local values. request := &filer_pb.CreateEntryRequest{ - Directory: string(dirFullPath), + // Defensive: dirFullPath is clean by construction for mount-originated + // mutations, but could carry invalid-UTF-8 bytes if metaCache was + // populated from a non-gRPC source (direct store write, legacy import). + // Sanitizing here keeps the marshal strictly per-request on the off + // chance invalid bytes do reach us. + Directory: dirFullPath.Sanitized(), Entry: newEntry, Signatures: []int32{wfs.signature}, SkipCheckParentDirectory: true, @@ -124,6 +130,9 @@ func (wfs *WFS) Rmdir(cancel <-chan struct{}, header *fuse.InHeader, name string return fuse.Status(syscall.ENOTEMPTY) } + // Sanitize before it reaches DeleteEntryRequest.Name; see sanitizeFuseName. + name = sanitizeFuseName(name) + dirFullPath, code := wfs.inodeToPath.GetPath(header.NodeId) if code != fuse.OK { return diff --git a/weed/mount/weedfs_file_mkrm.go b/weed/mount/weedfs_file_mkrm.go index 8d4d8d601..5fc2d63a7 100644 --- a/weed/mount/weedfs_file_mkrm.go +++ b/weed/mount/weedfs_file_mkrm.go @@ -27,7 +27,8 @@ import ( * will be called instead. */ func (wfs *WFS) Create(cancel <-chan struct{}, in *fuse.CreateIn, name string, out *fuse.CreateOut) (code fuse.Status) { - if s := checkName(name); s != fuse.OK { + var s fuse.Status + if name, s = checkName(name); s != fuse.OK { return s } @@ -140,7 +141,8 @@ func (wfs *WFS) Create(cancel <-chan struct{}, in *fuse.CreateIn, name string, o */ func (wfs *WFS) Mknod(cancel <-chan struct{}, in *fuse.MknodIn, name string, out *fuse.EntryOut) (code fuse.Status) { - if s := checkName(name); s != fuse.OK { + var s fuse.Status + if name, s = checkName(name); s != fuse.OK { return s } @@ -167,6 +169,9 @@ func (wfs *WFS) Mknod(cancel <-chan struct{}, in *fuse.MknodIn, name string, out /** Remove a file */ func (wfs *WFS) Unlink(cancel <-chan struct{}, header *fuse.InHeader, name string) (code fuse.Status) { + // Sanitize before it reaches DeleteEntryRequest.Name; see sanitizeFuseName. + name = sanitizeFuseName(name) + dirFullPath, code := wfs.inodeToPath.GetPath(header.NodeId) if code != fuse.OK { if code == fuse.ENOENT { @@ -251,7 +256,8 @@ func (wfs *WFS) Unlink(cancel <-chan struct{}, header *fuse.InHeader, name strin // Always let the filer decide whether to delete chunks based on its authoritative data. // The filer has the correct hard link count and will only delete chunks when appropriate. deleteReq := &filer_pb.DeleteEntryRequest{ - Directory: string(dirFullPath), + // See weedfs_dir_mkrm.go Mkdir for why Directory is sanitized. + Directory: dirFullPath.Sanitized(), Name: name, IsDeleteData: true, Signatures: []int32{wfs.signature}, diff --git a/weed/mount/weedfs_link.go b/weed/mount/weedfs_link.go index 16952a33d..969d7d199 100644 --- a/weed/mount/weedfs_link.go +++ b/weed/mount/weedfs_link.go @@ -31,7 +31,8 @@ func (wfs *WFS) Link(cancel <-chan struct{}, in *fuse.LinkIn, name string, out * return fuse.Status(syscall.ENOSPC) } - if s := checkName(name); s != fuse.OK { + var s fuse.Status + if name, s = checkName(name); s != fuse.OK { return s } diff --git a/weed/mount/weedfs_rename.go b/weed/mount/weedfs_rename.go index 55218bf09..d14eddb79 100644 --- a/weed/mount/weedfs_rename.go +++ b/weed/mount/weedfs_rename.go @@ -169,7 +169,12 @@ func (wfs *WFS) Rename(cancel <-chan struct{}, in *fuse.RenameIn, oldName string return fuse.Status(syscall.ENOSPC) } - if s := checkName(newName); s != fuse.OK { + // Both names end up in StreamRenameEntryRequest (proto string fields), so + // sanitize both. checkName handles newName's length check; oldName may + // legitimately be a pre-existing entry whose bytes were not validated. + oldName = sanitizeFuseName(oldName) + var s fuse.Status + if newName, s = checkName(newName); s != fuse.OK { return s } diff --git a/weed/mount/weedfs_symlink.go b/weed/mount/weedfs_symlink.go index 85f108913..b0d4cd9cc 100644 --- a/weed/mount/weedfs_symlink.go +++ b/weed/mount/weedfs_symlink.go @@ -18,7 +18,8 @@ func (wfs *WFS) Symlink(cancel <-chan struct{}, header *fuse.InHeader, target st if wfs.IsOverQuotaWithUncommitted() { return fuse.Status(syscall.ENOSPC) } - if s := checkName(name); s != fuse.OK { + var s fuse.Status + if name, s = checkName(name); s != fuse.OK { return s } diff --git a/weed/mount/weedfs_write.go b/weed/mount/weedfs_write.go index 552202ef5..0bcc6f21d 100644 --- a/weed/mount/weedfs_write.go +++ b/weed/mount/weedfs_write.go @@ -13,6 +13,13 @@ import ( func (wfs *WFS) saveDataAsChunk(fullPath util.FullPath) filer.SaveDataAsChunkFunctionType { + // Backstop: FUSE entry points sanitize names before they reach + // inodeToPath, but async flush paths (e.g. writebackCache, handles whose + // RememberPath was set from an older code path) may still carry bytes + // that predate sanitization. Proto3 string fields require valid UTF-8, + // so scrub the full path once here before every AssignVolume call. + assignPath := fullPath.Sanitized() + return func(reader io.Reader, filename string, offset int64, tsNs int64, _ uint64) (chunk *filer_pb.FileChunk, err error) { uploader, err := operation.NewUploader() if err != nil { @@ -43,7 +50,7 @@ func (wfs *WFS) saveDataAsChunk(fullPath util.FullPath) filer.SaveDataAsChunkFun TtlSec: wfs.option.TtlSec, DiskType: string(wfs.option.DiskType), DataCenter: wfs.option.DataCenter, - Path: string(fullPath), + Path: assignPath, }, uploadOption, genFileUrlFn, reader, ) diff --git a/weed/mount/wfs_save.go b/weed/mount/wfs_save.go index 3e3a1bb63..be848fc2d 100644 --- a/weed/mount/wfs_save.go +++ b/weed/mount/wfs_save.go @@ -76,12 +76,29 @@ func (wfs *WFS) mapPbIdFromLocalToFiler(entry *filer_pb.Entry) { entry.Attributes.Uid, entry.Attributes.Gid = wfs.option.UidGidMapper.LocalToFiler(entry.Attributes.Uid, entry.Attributes.Gid) } -func checkName(name string) fuse.Status { +// sanitizeFuseName scrubs a name arriving from the kernel before it is placed +// in a proto string field. Linux (and macOS) pass raw bytes for filenames; +// apps like GNOME Trash produce partial files whose names contain binary +// payloads. Proto3 `string` fields require valid UTF-8, so an unsanitized +// name causes gRPC to fail the whole AssignVolume / CreateEntry / DeleteEntry +// RPC with "grpc: error while marshaling: string field contains invalid +// UTF-8", which surfaces to userspace as EIO. Sanitizing at every FUSE +// boundary keeps filer RPCs marshalable and prevents a single ill-named file +// from poisoning the shared gRPC channel for every other in-flight request. +// +// Delegates to util.SanitizeUTF8Name so the replacement character is chosen +// in exactly one place across the codebase. +func sanitizeFuseName(name string) string { + return util.SanitizeUTF8Name(name) +} + +func checkName(name string) (string, fuse.Status) { + name = sanitizeFuseName(name) // The Linux FUSE kernel module enforces NAME_MAX=255 at the VFS layer. // Return ENAMETOOLONG early to avoid creating entries that cannot be // looked up via normal syscalls (stat, chmod, etc.). if len(name) > 255 { - return fuse.Status(syscall.ENAMETOOLONG) + return name, fuse.Status(syscall.ENAMETOOLONG) } - return fuse.OK + return name, fuse.OK } diff --git a/weed/mount/wfs_save_test.go b/weed/mount/wfs_save_test.go new file mode 100644 index 000000000..c6abe713b --- /dev/null +++ b/weed/mount/wfs_save_test.go @@ -0,0 +1,73 @@ +package mount + +import ( + "strings" + "syscall" + "testing" + "unicode/utf8" + + "github.com/seaweedfs/go-fuse/v2/fuse" +) + +// TestSanitizeFuseName_InvalidBytesReplaced reproduces the filename from +// seaweedfs#9139: GNOME Trash "partial" files carry raw binary bytes +// (\x10\x98=\\\x8a\x7f) that are not valid UTF-8. The sanitizer must return a +// UTF-8-valid string so the subsequent proto marshal cannot fail. +func TestSanitizeFuseName_InvalidBytesReplaced(t *testing.T) { + raw := "\x10\x98=\\\x8a\x7f.trashinfo.9a51454f.partial" + out := sanitizeFuseName(raw) + if !utf8.ValidString(out) { + t.Fatalf("sanitizeFuseName returned non-UTF-8: %q", out) + } + // The replacement must be URL-safe: these sanitized names flow into HTTP + // URLs (volume-server uploads, filer HTTP API, S3/WebDAV gateways), so a + // '?' would be interpreted as the query-string delimiter and split the + // path. Explicitly assert the replacement char is neither '?' nor U+FFFD. + if strings.ContainsRune(out, '?') { + t.Fatalf("sanitizer produced non-URL-safe '?' in %q", out) + } + if strings.ContainsRune(out, 0xFFFD) { + t.Fatalf("expected single-byte replacement, got U+FFFD in %q", out) + } + // Valid bytes (\x10, \x3D '=', \x5C '\\', \x7F) must be preserved; only + // \x98 and \x8A — the standalone continuation bytes — get replaced. + if !strings.HasSuffix(out, ".trashinfo.9a51454f.partial") { + t.Fatalf("trailing valid bytes were dropped: %q", out) + } +} + +func TestSanitizeFuseName_PassThroughValidUTF8(t *testing.T) { + // An already-valid UTF-8 string must be returned unchanged — no heap + // allocation, no alteration of byte content. Preserving identity matters + // because the overwhelming hot path is valid input. + for _, s := range []string{ + "plain.txt", + "日本語.txt", + "🦑 squid", + "", + strings.Repeat("a", 255), + } { + if got := sanitizeFuseName(s); got != s { + t.Fatalf("sanitizeFuseName(%q) = %q, want unchanged", s, got) + } + } +} + +// TestCheckName_SanitizesBeforeLengthCheck verifies the caller contract: the +// returned name is always safe to put into a proto string field, and the +// length guard still fires on over-long inputs. +func TestCheckName_SanitizesBeforeLengthCheck(t *testing.T) { + bad := "foo\x80bar" + got, s := checkName(bad) + if s != fuse.OK { + t.Fatalf("checkName(%q) status = %v, want OK", bad, s) + } + if !utf8.ValidString(got) { + t.Fatalf("checkName did not sanitize: %q", got) + } + + tooLong := strings.Repeat("x", 300) + if _, s := checkName(tooLong); s != fuse.Status(syscall.ENAMETOOLONG) { + t.Fatalf("checkName length guard lost: status=%v", s) + } +} diff --git a/weed/pb/grpc_client_server.go b/weed/pb/grpc_client_server.go index ddae14219..e49eb59b1 100644 --- a/weed/pb/grpc_client_server.go +++ b/weed/pb/grpc_client_server.go @@ -2,6 +2,7 @@ package pb import ( "context" + "errors" "fmt" "math/rand/v2" "net" @@ -265,12 +266,62 @@ func InvalidateGrpcConnection(address string) { } } +// grpcMarshalErrorPrefix is the library-owned prefix gRPC prepends to every +// client-side proto marshal failure; see grpc-go rpc_util.go encode(): +// status.Errorf(codes.Internal, "grpc: error while marshaling: %v", ...) +// The "grpc:" token is reserved for gRPC internal diagnostics and will not +// collide with user-produced Internal statuses. +const grpcMarshalErrorPrefix = "grpc: error while marshaling" + +// grpcStatusError is the minimal interface every gRPC status error satisfies. +// Using it with errors.As lets us reach through arbitrary fmt.Errorf("...: %w") +// wrapping to pull out the *original* gRPC Status — important because +// status.FromError rewrites Status.Message with the outermost err.Error() +// whenever it has to unwrap, which would defeat a prefix check on the +// library-owned message. +type grpcStatusError interface{ GRPCStatus() *status.Status } + +// isClientSideMarshalError reports whether err originates from gRPC failing to +// marshal the outgoing request on the client side. These errors are surfaced +// with codes.Internal (because gRPC has no better code for them) but do NOT +// reflect a problem with the TCP/HTTP2 connection: the request never left the +// process. Tearing down the shared cached ClientConn in response would cancel +// every other in-flight RPC on that connection — which is exactly the cascade +// seen in seaweedfs#9139 when a single file with invalid-UTF-8 bytes in its +// name produces an avalanche of "connection is closing" errors on unrelated +// LookupEntry / ReadDirAll / UpdateEntry calls. +// +// errors.Is against a proto-level sentinel is not viable: gRPC's encode() +// collapses the inner proto error with "%v" before wrapping it in a Status, +// so the original error type does not survive. The structural signal that +// *does* survive is the Status itself — we recover it via errors.As and +// match the library-owned "grpc:" prefix to disambiguate from a server-side +// application Internal status that genuinely warrants invalidation. +func isClientSideMarshalError(err error) bool { + var gse grpcStatusError + if !errors.As(err, &gse) { + return false + } + s := gse.GRPCStatus() + if s == nil { + return false + } + return s.Code() == codes.Internal && strings.HasPrefix(s.Message(), grpcMarshalErrorPrefix) +} + // shouldInvalidateConnection checks if an error indicates the cached connection should be invalidated func shouldInvalidateConnection(err error) bool { if err == nil { return false } + // Client-side marshal errors (e.g. invalid UTF-8 in a proto string field) + // are per-request bugs, not connection failures. Do not poison the shared + // ClientConn because of them. + if isClientSideMarshalError(err) { + return false + } + // Check gRPC status codes first (more reliable) if s, ok := status.FromError(err); ok { code := s.Code() diff --git a/weed/pb/grpc_client_server_test.go b/weed/pb/grpc_client_server_test.go new file mode 100644 index 000000000..5d3f4aa16 --- /dev/null +++ b/weed/pb/grpc_client_server_test.go @@ -0,0 +1,68 @@ +package pb + +import ( + "fmt" + "testing" + + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// TestShouldInvalidateConnection_MarshalErrorIsPerRequest ensures that a +// client-side proto marshal failure does NOT cause the shared cached +// ClientConn to be torn down. Tearing it down would cancel every other +// in-flight RPC (seaweedfs#9139: one file with invalid-UTF-8 bytes triggered +// an avalanche of "connection is closing" errors on unrelated operations). +func TestShouldInvalidateConnection_MarshalErrorIsPerRequest(t *testing.T) { + // Reproduces the exact error gRPC returns when a string field in the + // outgoing request contains invalid UTF-8 bytes. + marshalErr := status.Error(codes.Internal, + "grpc: error while marshaling: string field contains invalid UTF-8") + if shouldInvalidateConnection(marshalErr) { + t.Fatalf("client-side marshal error must not invalidate the shared connection") + } + + // Same error wrapped with fmt.Errorf (common when callers add context). + wrapped := fmt.Errorf("upload data: %w", marshalErr) + if shouldInvalidateConnection(wrapped) { + t.Fatalf("wrapped marshal error must not invalidate the shared connection") + } +} + +// TestShouldInvalidateConnection_GenuineInternalStillInvalidates ensures the +// marshal-error carve-out does not swallow real server-side Internal errors, +// which previously caused — and should continue to cause — connection +// invalidation. +func TestShouldInvalidateConnection_GenuineInternalStillInvalidates(t *testing.T) { + serverInternal := status.Error(codes.Internal, "stream terminated by RST_STREAM with code 2") + if !shouldInvalidateConnection(serverInternal) { + t.Fatalf("genuine server-side Internal must still invalidate the connection") + } +} + +// TestShouldInvalidateConnection_TransportErrorsStillInvalidate is a +// regression guard for the string-matching fallback path (e.g. a raw +// "connection refused" from net.Dial that never acquired a gRPC status). +func TestShouldInvalidateConnection_TransportErrorsStillInvalidate(t *testing.T) { + for _, msg := range []string{ + "rpc error: code = Unavailable desc = transport is closing", + "dial tcp: connection refused", + "read: connection reset by peer", + } { + if !shouldInvalidateConnection(fmt.Errorf("%s", msg)) { + t.Fatalf("transport error %q must still invalidate", msg) + } + } +} + +// TestIsClientSideMarshalError_RequiresGrpcStatus ensures the carve-out is +// type-based (via errors.As on the grpc status interface), not a naive +// string match against arbitrary errors that happen to mention marshaling. +// A plain errors.New(...) with the same prefix must NOT be treated as a +// per-request marshal error — we have no evidence the connection is healthy. +func TestIsClientSideMarshalError_RequiresGrpcStatus(t *testing.T) { + impostor := fmt.Errorf("grpc: error while marshaling: synthetic non-status error") + if isClientSideMarshalError(impostor) { + t.Fatalf("plain error must not match the marshal-error carve-out") + } +} diff --git a/weed/shell/command_fs_merge_volumes.go b/weed/shell/command_fs_merge_volumes.go index b1542b2d8..59c1650b3 100644 --- a/weed/shell/command_fs_merge_volumes.go +++ b/weed/shell/command_fs_merge_volumes.go @@ -591,9 +591,13 @@ func (c *commandFsMergeVolumes) uploadManifestChunk( if err := commandEnv.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error { for attempt := 1; attempt <= manifestAssignAttempts; attempt++ { resp, err := client.AssignVolume(ctx, &filer_pb.AssignVolumeRequest{ - Count: 1, - Collection: collection, - Path: string(entryPath), + Count: 1, + Collection: collection, + // entryPath is built from entry.Name returned by the filer. Filers + // written through gRPC already hold valid UTF-8, but legacy or + // directly-imported entries may not — sanitize so one bad name + // does not fail the whole merge pass. + Path: entryPath.Sanitized(), ExpectedDataSize: uint64(len(data)), }) if err != nil { diff --git a/weed/util/fullpath.go b/weed/util/fullpath.go index 9a2b198b7..e72341ba4 100644 --- a/weed/util/fullpath.go +++ b/weed/util/fullpath.go @@ -4,10 +4,36 @@ import ( "path" "path/filepath" "strings" + "unicode/utf8" ) type FullPath string +// invalidUTF8Replacement is the single-byte replacement used everywhere a name +// or path from an untrusted source (kernel FUSE input, external clients, store +// imports) may contain bytes that are not valid UTF-8. Proto3 `string` fields +// require valid UTF-8, so any such bytes must be substituted before the value +// enters a gRPC request; otherwise marshaling fails for the whole RPC. +// +// '_' is URL-safe: these sanitized strings also flow into HTTP URLs +// (volume-server uploads, filer HTTP API, S3/WebDAV gateways). Using '?' +// would cause it to be interpreted as the query-string delimiter the first +// time the name lands in a URL and split the path. +const invalidUTF8Replacement = "_" + +// SanitizeUTF8Name replaces every invalid-UTF-8 byte in s with +// invalidUTF8Replacement. For the common, valid-UTF-8 case the input is +// returned unchanged with no allocation. Use this for any byte sequence +// that will be assigned to a proto string field (names, paths) from an +// untrusted source; centralising the replacement keeps the chosen character +// consistent across the codebase. +func SanitizeUTF8Name(s string) string { + if utf8.ValidString(s) { + return s + } + return strings.ToValidUTF8(s, invalidUTF8Replacement) +} + func NewFullPath(dir, name string) FullPath { name = strings.TrimSuffix(name, "/") return FullPath(dir).Child(name) @@ -15,7 +41,7 @@ func NewFullPath(dir, name string) FullPath { func (fp FullPath) DirAndName() (string, string) { dir, name := filepath.Split(string(fp)) - name = strings.ToValidUTF8(name, "?") + name = SanitizeUTF8Name(name) if dir == "/" { return dir, name } @@ -25,10 +51,22 @@ func (fp FullPath) DirAndName() (string, string) { return dir[:len(dir)-1], name } +// Name returns the last path component, with any invalid-UTF-8 bytes replaced +// via SanitizeUTF8Name so the result is always safe to place in a proto +// string field or HTTP URL. func (fp FullPath) Name() string { _, name := filepath.Split(string(fp)) - name = strings.ToValidUTF8(name, "?") - return name + return SanitizeUTF8Name(name) +} + +// Sanitized returns the full path with every invalid-UTF-8 byte — in any +// component, not just the last — replaced via SanitizeUTF8Name. Use this +// before assigning the path to a proto string field (e.g. Directory, +// AssignVolumeRequest.Path) when the path may have been produced from +// sources that do not enforce UTF-8 (cache populated from an external +// store, legacy metadata, shell traversals of existing filer entries). +func (fp FullPath) Sanitized() string { + return SanitizeUTF8Name(string(fp)) } func (fp FullPath) IsLongerFileName(maxFilenameLength uint32) bool { diff --git a/weed/util/fullpath_test.go b/weed/util/fullpath_test.go new file mode 100644 index 000000000..af29e3d71 --- /dev/null +++ b/weed/util/fullpath_test.go @@ -0,0 +1,84 @@ +package util + +import ( + "strings" + "testing" + "unicode/utf8" +) + +// TestSanitizeUTF8Name_ValidPassThrough asserts the fast path returns the +// input unchanged (no allocation, no byte alteration). +func TestSanitizeUTF8Name_ValidPassThrough(t *testing.T) { + for _, s := range []string{ + "", + "plain.txt", + "日本語.txt", + "🦑 squid", + } { + if got := SanitizeUTF8Name(s); got != s { + t.Fatalf("SanitizeUTF8Name(%q) = %q, want unchanged", s, got) + } + } +} + +// TestSanitizeUTF8Name_InvalidBytes asserts invalid bytes are replaced with a +// single '_' (URL-safe, single-byte) and the output is valid UTF-8. The +// replacement char is load-bearing — downstream code places these strings in +// HTTP URLs, where '?' would be parsed as the query delimiter. +func TestSanitizeUTF8Name_InvalidBytes(t *testing.T) { + out := SanitizeUTF8Name("foo\x80bar") + if !utf8.ValidString(out) { + t.Fatalf("result is not valid UTF-8: %q", out) + } + if out != "foo_bar" { + t.Fatalf("SanitizeUTF8Name = %q, want %q", out, "foo_bar") + } + if strings.ContainsRune(out, '?') { + t.Fatalf("replacement must be URL-safe, got %q", out) + } +} + +// TestFullPathSanitized_WholePath ensures Sanitized() scrubs invalid bytes in +// every component, not just the last — that's the difference from Name() and +// the reason call sites that need to pass a full path to a proto field must +// use Sanitized(), not (dir, _) := DirAndName(). +func TestFullPathSanitized_WholePath(t *testing.T) { + // Invalid byte sits in the middle component. + fp := FullPath("/home/bad\x80dir/file.txt") + got := fp.Sanitized() + want := "/home/bad_dir/file.txt" + if got != want { + t.Fatalf("Sanitized() = %q, want %q", got, want) + } + + // Bytes in every component — all get replaced, structure preserved. + fp = FullPath("/a\xffb/c\xffd/e\xfff") + got = fp.Sanitized() + want = "/a_b/c_d/e_f" + if got != want { + t.Fatalf("Sanitized() = %q, want %q", got, want) + } + if !utf8.ValidString(got) { + t.Fatalf("Sanitized() returned non-UTF-8: %q", got) + } +} + +// TestFullPathDirAndName_OnlyNameSanitized documents a (deliberate) sharp +// edge: DirAndName() sanitizes only the trailing name, not dir. Callers who +// need a sanitized full path must use Sanitized(); using dir from DirAndName +// will still carry invalid bytes in parent components. This test pins the +// existing behavior so it is not accidentally "fixed" in a way that changes +// the (dir, name) semantics that everything else depends on. +func TestFullPathDirAndName_OnlyNameSanitized(t *testing.T) { + fp := FullPath("/home/bad\x80dir/child\xffname") + dir, name := fp.DirAndName() + if !utf8.ValidString(name) { + t.Fatalf("name must be sanitized: %q", name) + } + // dir still contains the invalid byte — this is by design, because dir is + // used positionally (e.g. as a parent key) and changing its bytes would + // change identity. Sanitized() is the method to use for proto fields. + if utf8.ValidString(dir) { + t.Fatalf("regression: dir should remain raw (%q); callers needing a clean path must use Sanitized()", dir) + } +}