// Mirror of https://github.com/seaweedfs/seaweedfs.git
// Synced 2026-06-13 23:36:45 +03:00
// a3c0baa9b0
//
// While the ring changed within the last snapshot interval, a fresh owner asks
// the key's previous owner (LockRing.PriorOwner) whether it still holds a
// conflicting lock before granting TRY_LOCK or answering GET_LK, so it does not
// double-grant before re-assertion rebuilds its local state. The probe is
// marked cooling_probe so the previous owner answers from local state without
// recursing. PriorOwner uses the snapshot's prebuilt ring rather than
// rebuilding a hash ring per call.
//
// 846 lines, 29 KiB, Protocol Buffer
// Schema for package filer_pb: the SeaweedFiler service and its messages.
syntax = "proto3";

package filer_pb;

// Package / class names used by the Go and Java code generators.
option go_package = "github.com/seaweedfs/seaweedfs/weed/pb/filer_pb";
option java_package = "seaweedfs.client";
option java_outer_classname = "FilerProto";
|
|
|
|
//////////////////////////////////////////////////
|
|
|
|
// SeaweedFiler is the RPC surface declared for package filer_pb. An RPC whose
// request or response type is marked `stream` is streaming in that direction;
// every other RPC is unary.
service SeaweedFiler {

    // directory entry operations
    rpc LookupDirectoryEntry (LookupDirectoryEntryRequest) returns (LookupDirectoryEntryResponse) {}

    rpc ListEntries (ListEntriesRequest) returns (stream ListEntriesResponse) {}

    rpc CreateEntry (CreateEntryRequest) returns (CreateEntryResponse) {}

    rpc UpdateEntry (UpdateEntryRequest) returns (UpdateEntryResponse) {}

    rpc TouchAccessTime (TouchAccessTimeRequest) returns (TouchAccessTimeResponse) {}

    rpc AppendToEntry (AppendToEntryRequest) returns (AppendToEntryResponse) {}

    rpc DeleteEntry (DeleteEntryRequest) returns (DeleteEntryResponse) {}

    // multi-mutation object transactions (see ObjectTransactionRequest)
    rpc ObjectTransaction (ObjectTransactionRequest) returns (ObjectTransactionResponse) {}

    rpc ObjectTransactionBatch (ObjectTransactionBatchRequest) returns (ObjectTransactionBatchResponse) {}

    // advisory byte-range locks (see PosixLockRequest)
    rpc PosixLock (PosixLockRequest) returns (PosixLockResponse) {}

    // rename
    rpc AtomicRenameEntry (AtomicRenameEntryRequest) returns (AtomicRenameEntryResponse) {}
    rpc StreamRenameEntry (StreamRenameEntryRequest) returns (stream StreamRenameEntryResponse) {}

    // bidirectional mutation stream (see StreamMutateEntryRequest)
    rpc StreamMutateEntry (stream StreamMutateEntryRequest) returns (stream StreamMutateEntryResponse) {}

    // volumes and collections
    rpc AssignVolume (AssignVolumeRequest) returns (AssignVolumeResponse) {}

    rpc LookupVolume (LookupVolumeRequest) returns (LookupVolumeResponse) {}

    rpc CollectionList (CollectionListRequest) returns (CollectionListResponse) {}

    rpc DeleteCollection (DeleteCollectionRequest) returns (DeleteCollectionResponse) {}

    rpc Statistics (StatisticsRequest) returns (StatisticsResponse) {}

    rpc Ping (PingRequest) returns (PingResponse) {}

    rpc GetFilerConfiguration (GetFilerConfigurationRequest) returns (GetFilerConfigurationResponse) {}

    // metadata traversal and subscription
    rpc TraverseBfsMetadata (TraverseBfsMetadataRequest) returns (stream TraverseBfsMetadataResponse) {}

    rpc SubscribeMetadata (SubscribeMetadataRequest) returns (stream SubscribeMetadataResponse) {}

    // Shares its request and response types with SubscribeMetadata.
    rpc SubscribeLocalMetadata (SubscribeMetadataRequest) returns (stream SubscribeMetadataResponse) {}

    // key-value operations
    rpc KvGet (KvGetRequest) returns (KvGetResponse) {}

    rpc KvPut (KvPutRequest) returns (KvPutResponse) {}

    // remote storage
    rpc CacheRemoteObjectToLocalCluster (CacheRemoteObjectToLocalClusterRequest) returns (CacheRemoteObjectToLocalClusterResponse) {}

    // distributed lock management
    rpc DistributedLock(LockRequest) returns (LockResponse) {}
    rpc DistributedUnlock(UnlockRequest) returns (UnlockResponse) {}
    rpc FindLockOwner(FindLockOwnerRequest) returns (FindLockOwnerResponse) {}
    // distributed lock management internal use only
    rpc TransferLocks(TransferLocksRequest) returns (TransferLocksResponse) {}
    rpc ReplicateLock(ReplicateLockRequest) returns (ReplicateLockResponse) {}

    // Peer chunk sharing — tier 1: mount-server registry.
    // See design-weed-mount-peer-chunk-sharing.md for details.
    rpc MountRegister (MountRegisterRequest) returns (MountRegisterResponse) {}
    rpc MountList (MountListRequest) returns (MountListResponse) {}
}
|
|
|
|
//////////////////////////////////////////////////
|
|
|
|
// Request for SeaweedFiler.LookupDirectoryEntry.
message LookupDirectoryEntryRequest {
    string directory = 1; // presumably the parent directory path — confirm against the handler
    string name = 2;      // presumably the entry name inside directory
}
|
|
|
|
// Response for SeaweedFiler.LookupDirectoryEntry.
message LookupDirectoryEntryResponse {
    Entry entry = 1;
}
|
|
|
|
// Request for SeaweedFiler.ListEntries (server-streaming).
// NOTE(review): startFromFileName / inclusiveStartFrom are not snake_case;
// renaming would change generated accessors and JSON keys, so they are kept.
message ListEntriesRequest {
    string directory = 1;
    string prefix = 2;
    string startFromFileName = 3;
    bool inclusiveStartFrom = 4;
    uint32 limit = 5;
    int64 snapshot_ts_ns = 6;
}
|
|
|
|
// One streamed item of SeaweedFiler.ListEntries.
message ListEntriesResponse {
    Entry entry = 1;
    int64 snapshot_ts_ns = 2;
}
|
|
|
|
// RemoteEntry is embedded in Entry.remote_entry.
// NOTE(review): units of remote_mtime / remote_size are not stated here — confirm.
message RemoteEntry {
    string storage_name = 1;
    int64 last_local_sync_ts_ns = 2;
    string remote_e_tag = 3;
    int64 remote_mtime = 4;
    int64 remote_size = 5;
}
|
|
// Entry is the file / directory record carried by most filer RPCs in this file.
message Entry {
    // NOTE(review): number 6 is absent from the sequence below — presumably a
    // removed field. Reserved so the number cannot be reused.
    reserved 6;

    string name = 1;
    bool is_directory = 2;
    repeated FileChunk chunks = 3;
    FuseAttributes attributes = 4;
    map<string, bytes> extended = 5;
    bytes hard_link_id = 7;
    int32 hard_link_counter = 8; // only exists in hard link meta data
    bytes content = 9; // if not empty, the file content

    RemoteEntry remote_entry = 10;
    int64 quota = 11; // for bucket only. Positive/Negative means enabled/disabled.
    int64 worm_enforced_at_ts_ns = 12;
}
|
|
|
|
// FullEntry pairs an Entry with a directory string.
message FullEntry {
    string dir = 1;
    Entry entry = 2;
}
|
|
|
|
// EventNotification describes one entry change; it is carried by
// SubscribeMetadataResponse and StreamRenameEntryResponse.
message EventNotification {
    Entry old_entry = 1;
    Entry new_entry = 2;
    bool delete_chunks = 3;
    string new_parent_path = 4;
    bool is_from_other_cluster = 5;
    repeated int32 signatures = 6;
}
|
|
|
|
// SSEType identifies the server-side encryption scheme of a FileChunk
// (FileChunk.sse_type).
enum SSEType {
    NONE = 0;    // No server-side encryption
    SSE_C = 1;   // Server-Side Encryption with Customer-Provided Keys
    SSE_KMS = 2; // Server-Side Encryption with KMS-Managed Keys
    SSE_S3 = 3;  // Server-Side Encryption with S3-Managed Keys
}
|
|
|
|
// FileChunk describes one chunk of an Entry (Entry.chunks).
message FileChunk {
    string file_id = 1; // to be deprecated
    int64 offset = 2;
    uint64 size = 3;
    int64 modified_ts_ns = 4;
    string e_tag = 5;
    string source_file_id = 6; // to be deprecated
    FileId fid = 7;
    FileId source_fid = 8;
    bytes cipher_key = 9;
    bool is_compressed = 10;
    bool is_chunk_manifest = 11; // content is a list of FileChunks
    SSEType sse_type = 12; // Server-side encryption type
    bytes sse_metadata = 13; // Serialized SSE metadata for this chunk (SSE-C, SSE-KMS, or SSE-S3)
}
|
|
|
|
// FileChunkManifest is a list of FileChunks (see FileChunk.is_chunk_manifest).
message FileChunkManifest {
    repeated FileChunk chunks = 1;
}
|
|
|
|
// FileId is the structured chunk identifier used by FileChunk.fid and
// FileChunk.source_fid.
message FileId {
    uint32 volume_id = 1;
    uint64 file_key = 2;
    fixed32 cookie = 3;
}
|
|
|
|
// FuseAttributes holds the attribute set of an Entry (Entry.attributes).
// Each *_ns field is the nanosecond component of the seconds field it is named after.
message FuseAttributes {
    // NOTE(review): numbers 8, 9 and 15 are absent from the sequence below —
    // presumably removed fields. Reserved so the numbers cannot be reused.
    reserved 8, 9, 15;

    uint64 file_size = 1;
    int64 mtime = 2; // unix time in seconds
    uint32 file_mode = 3;
    uint32 uid = 4;
    uint32 gid = 5;
    int64 crtime = 6; // unix time in seconds
    string mime = 7;
    int32 ttl_sec = 10;
    string user_name = 11; // for hdfs
    repeated string group_name = 12; // for hdfs
    string symlink_target = 13;
    bytes md5 = 14;
    uint32 rdev = 16;
    uint64 inode = 17;
    int64 ctime = 18; // unix time in seconds, inode change time
    int32 mtime_ns = 19; // nanosecond component of mtime (0-999999999)
    int32 ctime_ns = 20; // nanosecond component of ctime (0-999999999)
    int32 crtime_ns = 21; // nanosecond component of crtime (0-999999999)
    int64 atime = 22; // unix time in seconds, last access time
    int32 atime_ns = 23; // nanosecond component of atime (0-999999999)
}
|
|
|
|
// Request for SeaweedFiler.CreateEntry; also one arm of
// StreamMutateEntryRequest.request.
message CreateEntryRequest {
    string directory = 1;
    Entry entry = 2;
    bool o_excl = 3; // see FilerError.ENTRY_ALREADY_EXISTS
    bool is_from_other_cluster = 4;
    repeated int32 signatures = 5;
    bool skip_check_parent_directory = 6;
    // Optional precondition evaluated against the current entry atomically with
    // the write, under the filer's per-path lock. The caller must route the
    // key's writes to this entry's owner filer for the check to be authoritative.
    WriteCondition condition = 7;
}
|
|
|
|
// WriteCondition is the precondition the filer evaluates against the existing
// entry before writing, under the per-path lock. A failed condition returns
// FilerError PRECONDITION_FAILED. The client maps request semantics (e.g. RFC
// 7232) to clauses; the filer just compares.
//
// A condition is a list of clauses that ALL must hold (logical AND). One clause
// is the common case; several express what a single comparison cannot: an ETag
// set (If-Match / If-None-Match with multiple values), weak-ETag comparison, and
// compound conditions (e.g. If-Match + If-Unmodified-Since together).
message WriteCondition {
    // Kind selects which comparison a Clause performs.
    enum Kind {
        NONE = 0;                     // unconditional
        IF_NOT_EXISTS = 1;            // fail if the entry exists (If-None-Match: *)
        IF_EXISTS = 2;                // fail if the entry is absent (If-Match: *)
        IF_ETAG_MATCH = 3;            // fail if absent or etag matches none of the set (If-Match)
        IF_ETAG_NOT_MATCH = 4;        // fail if present and etag matches any of the set (If-None-Match)
        IF_UNMODIFIED_SINCE = 5;      // fail if present and mtime > unix_time
        IF_MODIFIED_SINCE = 6;        // fail if present and mtime <= unix_time
        IF_EXTENDED_NOT_EQUAL = 7;    // fail if present and extended[ext_key] == ext_value
        IF_EXTENDED_TIME_ELAPSED = 8; // fail if present and extended[ext_key] (unix seconds) is in the future
    }

    // Clause is one primitive comparison. IF_ETAG_MATCH holds when the current
    // entry's ETag equals any value in etags; IF_ETAG_NOT_MATCH holds when it
    // equals none. allow_weak permits weak-comparison (ignoring the W/ prefix).
    //
    // The IF_EXTENDED_* kinds are generic guards on an extended attribute, used
    // to enforce object-lock without teaching the filer S3 semantics:
    // IF_EXTENDED_NOT_EQUAL expresses a legal hold (block while a key equals a
    // value), and IF_EXTENDED_TIME_ELAPSED expresses retention (block while a
    // stored unix-second deadline is in the future, compared to the filer's
    // clock). The caller composes these and, for governance-bypass, simply omits
    // the retention clause when the bypass is authorized — the filer makes no
    // authorization decision.
    message Clause {
        Kind kind = 1;
        repeated string etags = 2; // ETag set for IF_ETAG_* kinds
        int64 unix_time = 3;       // bound (unix seconds) for IF_*_SINCE kinds
        bool allow_weak = 4;       // compare ETags ignoring the weak (W/) marker
        string ext_key = 5;        // extended attribute name for IF_EXTENDED_* kinds
        string ext_value = 6;      // blocking value for IF_EXTENDED_NOT_EQUAL
        string gate_key = 7;       // IF_EXTENDED_TIME_ELAPSED: only enforce when extended[gate_key] == gate_value
        string gate_value = 8;     // gate value (e.g. retention mode COMPLIANCE for governance bypass)
    }

    repeated Clause clauses = 1; // all must hold (logical AND)
}
|
|
|
|
// Structured error codes for filer entry operations.
// Values are stable — do not reorder or reuse numbers.
enum FilerError {
    OK = 0;
    ENTRY_NAME_TOO_LONG = 1;   // name exceeds max_file_name_length
    PARENT_IS_FILE = 2;        // parent path component is a file, not a directory
    EXISTING_IS_DIRECTORY = 3; // cannot overwrite directory with file
    EXISTING_IS_FILE = 4;      // cannot overwrite file with directory
    ENTRY_ALREADY_EXISTS = 5;  // O_EXCL and entry already exists
    PRECONDITION_FAILED = 6;   // WriteCondition not satisfied
}
|
|
|
|
// ObjectMutation is one entry-level change applied by ObjectTransaction. All
// mutations of a transaction run under a single per-path lock (the request's
// lock_key) and in order, so the gateway can describe a multi-entry object
// operation as one request instead of holding a distributed lock across
// several RPCs. Data-bearing writes (entries with chunks) should be written
// before the transaction; mutations here are metadata-scoped.
message ObjectMutation {
    // Type selects the operation; the comment on each field below names the
    // Type(s) it applies to.
    enum Type {
        PUT = 0;              // create or replace the entry (entry field)
        DELETE = 1;           // delete the entry at directory/name (no error if absent)
        PATCH_EXTENDED = 2;   // merge set_extended / remove delete_extended on the entry
        RECOMPUTE_LATEST = 3; // scan a directory and re-point a parent entry (recompute)
    }

    Type type = 1;
    string directory = 2;
    string name = 3;                     // entry name for DELETE / PATCH_EXTENDED / RECOMPUTE_LATEST (the pointer entry)
    Entry entry = 4;                     // full entry for PUT
    map<string, bytes> set_extended = 5; // PATCH_EXTENDED: keys to set
    repeated string delete_extended = 6; // PATCH_EXTENDED: keys to remove
    bool is_delete_data = 7;             // DELETE: also delete chunk data
    bool is_recursive = 8;               // DELETE: recurse into a directory
    Recompute recompute = 9;             // RECOMPUTE_LATEST parameters
    bool set_content = 10;               // PATCH_EXTENDED: replace Entry.content with content
    bytes content = 11;                  // PATCH_EXTENDED: new Entry.content when set_content
    bool touch_mtime = 12;               // PATCH_EXTENDED: set the entry's Mtime to now (e.g. a metadata-replace copy)
}
|
|
|
|
// Recompute re-derives a pointer entry (directory/name on the mutation) from the
// current contents of a scanned directory, atomically under the transaction's
// lock. It is mechanical: the filer picks the child that sorts first or last by
// name and copies the requested fields into the pointer; it has no knowledge of
// what the entries mean. The caller (which does know the versioning scheme)
// supplies the sort direction and the key mappings. This covers re-pointing the
// latest version after a specific version is deleted, where the scan must run
// under the lock.
message Recompute {
    string scan_dir = 1;                   // directory whose direct children are scanned
    bool descending = 2;                   // pick the child that sorts last by name (else first)
    map<string, string> copy_extended = 3; // pointer extended key -> source extended key on the chosen child
    string name_to_key = 4;                // if set, store the chosen child's name under this pointer key
    string size_to_key = 5;                // if set, store the chosen child's FileSize (decimal) under this pointer key
    string mtime_to_key = 6;               // if set, store the chosen child's Mtime (decimal) under this pointer key
    string demote_key = 7;                 // if set, stamp demote_value on the prior name_to_key target when it changes
    bytes demote_value = 8;                // value for demote_key
    string exclude_name = 9;               // if set, skip this child when scanning (e.g. a version about to be deleted)
}
|
|
|
|
// ObjectTransactionRequest applies an ordered list of mutations atomically with
// respect to other writers of the same object, by holding the filer's per-path
// lock on lock_key for the whole transaction. The optional condition is checked
// first, against condition_key when set, else lock_key. Callers set route_key to
// the object's stable owner ring key; a filer that is not the owner forwards the
// transaction one hop to the owner, so a stale ring view is tolerated.
message ObjectTransactionRequest {
    string lock_key = 1;                   // object path to lock and to evaluate the condition against
    WriteCondition condition = 2;          // optional precondition, checked under the lock
    repeated ObjectMutation mutations = 3; // applied in order (see ObjectMutation)
    bool is_from_other_cluster = 4;
    repeated int32 signatures = 5;
    string condition_key = 6;              // if set, evaluate the condition against this entry instead of lock_key (still locking lock_key)
    string route_key = 7;                  // ring key identifying the owner filer; a non-owner forwards the whole transaction to it
    bool is_moved = 8;                     // set on a forwarded transaction so the receiver applies it locally instead of forwarding again
}
|
|
|
|
// Response for SeaweedFiler.ObjectTransaction; also the element type of
// ObjectTransactionBatchResponse.responses.
message ObjectTransactionResponse {
    string error = 1;
    FilerError error_code = 2;
}
|
|
|
|
// PosixLockRange is one advisory byte-range lock. Owner identity is (sid, owner):
// sid is the mount session, owner the FUSE lock owner within it, so owners from
// different mounts never alias. end is inclusive (max uint64 = to EOF); is_flock
// separates the flock and fcntl namespaces, which never conflict.
message PosixLockRange {
    uint64 start = 1;
    uint64 end = 2;
    uint32 type = 3; // 1=read, 2=write, 3=unlock
    uint64 sid = 4;
    uint64 owner = 5;
    uint32 pid = 6; // holder pid, for get_lk reporting only
    bool is_flock = 7;
}
|
|
|
|
// PosixLock routes an advisory lock operation to the inode's owner filer, which
// holds the authoritative in-memory lock table. key is the inode identity ring
// key (the file path, or hl:<HardLinkId> for a hardlink) used both to resolve the
// owner and to index the table. A non-owner filer forwards the request one hop;
// is_moved bounds it so a stale ring view cannot loop.
message PosixLockRequest {
    string key = 1;
    bool is_moved = 2;
    PosixLockOp op = 3;
    PosixLockRange lock = 4;
    // locks carries the full set a mount holds on key for a KEEP_ALIVE
    // re-assertion, so the current owner filer can rebuild its in-memory state
    // after an ownership change or restart. lock.sid identifies the session.
    repeated PosixLockRange locks = 5;
    // cooling_probe marks a dual-read a new owner sends to the previous owner
    // during a ring change, so the previous owner answers from local state
    // without itself cooling-off (no recursion).
    bool cooling_probe = 6;
}
|
|
|
|
// PosixLockOp selects the operation of a PosixLockRequest (PosixLockRequest.op).
// TRY_LOCK is the zero value, so it is what a request with op unset carries.
enum PosixLockOp {
    TRY_LOCK = 0;            // grant lock or report conflict (non-blocking)
    UNLOCK = 1;              // release lock's owner's locks over its range
    GET_LK = 2;              // report a conflicting lock, if any
    RELEASE_POSIX_OWNER = 3; // drop the owner's fcntl locks (flush-time)
    RELEASE_FLOCK_OWNER = 4; // drop the owner's flock locks (release-time)
    KEEP_ALIVE = 5;          // renew the session's lease on this owner (lock.sid)
}
|
|
|
|
// Response for SeaweedFiler.PosixLock.
message PosixLockResponse {
    bool granted = 1;            // for TRY_LOCK: whether the lock was granted
    bool has_conflict = 2;       // whether conflict is populated
    PosixLockRange conflict = 3; // the blocking lock (TRY_LOCK conflict / GET_LK result)
}
|
|
|
|
// ObjectTransactionBatch applies several object transactions in one round trip,
// each under its own per-path lock and independent of the others (no cross-key
// atomicity). A caller groups keys that route to the same owner filer and sends
// one batch per owner, e.g. for a multi-object delete. Each response is parallel
// to its request.
message ObjectTransactionBatchRequest {
    repeated ObjectTransactionRequest transactions = 1;
}
|
|
|
|
// Response for SeaweedFiler.ObjectTransactionBatch.
message ObjectTransactionBatchResponse {
    repeated ObjectTransactionResponse responses = 1;
}
|
|
|
|
// Response for SeaweedFiler.CreateEntry; also one arm of
// StreamMutateEntryResponse.response.
message CreateEntryResponse {
    string error = 1; // kept for human readability + backward compat
    SubscribeMetadataResponse metadata_event = 2;
    FilerError error_code = 3; // machine-readable error code
}
|
|
|
|
// Request for SeaweedFiler.UpdateEntry; also one arm of
// StreamMutateEntryRequest.request.
message UpdateEntryRequest {
    string directory = 1;
    Entry entry = 2;
    bool is_from_other_cluster = 3;
    repeated int32 signatures = 4;
    // NOTE(review): presumably extended-attribute values the current entry must
    // carry for the update to apply — confirm against the handler.
    map<string, bytes> expected_extended = 5;
}
|
|
// Response for SeaweedFiler.UpdateEntry; also one arm of
// StreamMutateEntryResponse.response.
message UpdateEntryResponse {
    SubscribeMetadataResponse metadata_event = 1;
}
|
|
|
|
// Request for SeaweedFiler.TouchAccessTime.
message TouchAccessTimeRequest {
    string directory = 1;
    string name = 2;
    int64 client_atime_ns = 3; // nanoseconds since epoch; filer may override with relatime
}
|
|
// Response for SeaweedFiler.TouchAccessTime.
message TouchAccessTimeResponse {
    int64 persisted_atime_ns = 1; // nanoseconds since epoch; 0 if no update was performed
    bool updated = 2;
}
|
|
|
|
// Request for SeaweedFiler.AppendToEntry.
message AppendToEntryRequest {
    string directory = 1;
    string entry_name = 2;
    repeated FileChunk chunks = 3;
}
|
|
// Response for SeaweedFiler.AppendToEntry; declares no fields.
message AppendToEntryResponse {
}
|
|
|
|
// Request for SeaweedFiler.DeleteEntry; also one arm of
// StreamMutateEntryRequest.request.
message DeleteEntryRequest {
    // Number 3 belonged to the field left commented out below; reserved so it
    // cannot be reused.
    reserved 3;

    string directory = 1;
    string name = 2;
    // bool is_directory = 3;
    bool is_delete_data = 4;
    bool is_recursive = 5;
    bool ignore_recursive_error = 6;
    bool is_from_other_cluster = 7;
    repeated int32 signatures = 8;
    int64 if_not_modified_after = 9; // NOTE(review): unit and comparison not stated here — confirm
}
|
|
|
|
// Response for SeaweedFiler.DeleteEntry; also one arm of
// StreamMutateEntryResponse.response.
message DeleteEntryResponse {
    string error = 1;
    SubscribeMetadataResponse metadata_event = 2;
}
|
|
|
|
// Request for SeaweedFiler.AtomicRenameEntry.
message AtomicRenameEntryRequest {
    string old_directory = 1;
    string old_name = 2;
    string new_directory = 3;
    string new_name = 4;
    repeated int32 signatures = 5;
}
|
|
|
|
// Response for SeaweedFiler.AtomicRenameEntry; declares no fields.
message AtomicRenameEntryResponse {
}
|
|
|
|
// Request for SeaweedFiler.StreamRenameEntry; also one arm of
// StreamMutateEntryRequest.request. Same field set as AtomicRenameEntryRequest.
message StreamRenameEntryRequest {
    string old_directory = 1;
    string old_name = 2;
    string new_directory = 3;
    string new_name = 4;
    repeated int32 signatures = 5;
}
|
|
// One streamed item of SeaweedFiler.StreamRenameEntry; also one arm of
// StreamMutateEntryResponse.response.
message StreamRenameEntryResponse {
    string directory = 1;
    EventNotification event_notification = 2;
    int64 ts_ns = 3;
}
|
|
// Request for SeaweedFiler.AssignVolume.
// Fields are declared out of numeric order (data_node = 9 precedes
// disk_type = 8); the declaration order is kept as is.
message AssignVolumeRequest {
    int32 count = 1;
    string collection = 2;
    string replication = 3;
    int32 ttl_sec = 4;
    string data_center = 5;
    string path = 6;
    string rack = 7;
    string data_node = 9;
    string disk_type = 8;
    uint64 expected_data_size = 10; // hint for size-aware volume selection
}
|
|
|
|
// Response for SeaweedFiler.AssignVolume.
message AssignVolumeResponse {
    // NOTE(review): numbers 2 and 3 are absent from the sequence below —
    // presumably removed fields. Reserved so the numbers cannot be reused.
    reserved 2, 3;

    string file_id = 1;
    int32 count = 4;
    string auth = 5;
    string collection = 6;
    string replication = 7;
    string error = 8;
    Location location = 9;
}
|
|
|
|
// Request for SeaweedFiler.LookupVolume.
message LookupVolumeRequest {
    repeated string volume_ids = 1;
}
|
|
|
|
// Locations wraps a list of Location so it can be a map value
// (LookupVolumeResponse.locations_map).
message Locations {
    repeated Location locations = 1;
}
|
|
|
|
// Location is one address record, used by Locations and
// AssignVolumeResponse.location.
message Location {
    string url = 1;
    string public_url = 2;
    uint32 grpc_port = 3;
    string data_center = 4;
}
|
|
// Response for SeaweedFiler.LookupVolume.
message LookupVolumeResponse {
    // NOTE(review): keys are presumably the requested volume_ids — confirm.
    map<string, Locations> locations_map = 1;
}
|
|
|
|
// Collection is the element type of CollectionListResponse.collections.
message Collection {
    string name = 1;
}
|
|
// Request for SeaweedFiler.CollectionList.
message CollectionListRequest {
    bool include_normal_volumes = 1;
    bool include_ec_volumes = 2;
}
|
|
// Response for SeaweedFiler.CollectionList.
message CollectionListResponse {
    repeated Collection collections = 1;
}
|
|
// Request for SeaweedFiler.DeleteCollection.
message DeleteCollectionRequest {
    string collection = 1;
}
|
|
|
|
// Response for SeaweedFiler.DeleteCollection; declares no fields.
message DeleteCollectionResponse {
}
|
|
|
|
// Request for SeaweedFiler.Statistics.
message StatisticsRequest {
    string replication = 1;
    string collection = 2;
    string ttl = 3;
    string disk_type = 4;
}
|
|
// Response for SeaweedFiler.Statistics.
message StatisticsResponse {
    // NOTE(review): numbering starts at 4 — numbers 1 to 3 were presumably
    // carried by removed fields. Reserved so the numbers cannot be reused.
    reserved 1 to 3;

    uint64 total_size = 4;
    uint64 used_size = 5;
    uint64 file_count = 6;
}
|
|
|
|
// Request for SeaweedFiler.Ping.
message PingRequest {
    string target = 1; // default to ping itself
    string target_type = 2;
}
|
|
// Response for SeaweedFiler.Ping: three nanosecond timestamps.
// NOTE(review): which host's clock each value comes from is not stated here — confirm.
message PingResponse {
    int64 start_time_ns = 1;
    int64 remote_time_ns = 2;
    int64 stop_time_ns = 3;
}
|
|
|
|
// Request for SeaweedFiler.GetFilerConfiguration; declares no fields.
message GetFilerConfigurationRequest {
}
|
|
// Response for SeaweedFiler.GetFilerConfiguration.
message GetFilerConfigurationResponse {
    // NOTE(review): number 6 is absent from the sequence below — presumably a
    // removed field. Reserved so the number cannot be reused.
    reserved 6;

    repeated string masters = 1;
    string replication = 2;
    string collection = 3;
    uint32 max_mb = 4;
    string dir_buckets = 5;
    bool cipher = 7;
    int32 signature = 8;
    string metrics_address = 9;
    int32 metrics_interval_sec = 10;
    string version = 11;
    string cluster_id = 12;
    string filer_group = 13;
    int32 major_version = 14;
    int32 minor_version = 15;
}
|
|
|
|
// Request for SeaweedFiler.SubscribeMetadata and SubscribeLocalMetadata.
// The client_supports_* flags tell the server which optional response
// features the client can handle.
message SubscribeMetadataRequest {
    // NOTE(review): number 5 is absent from the sequence below — presumably a
    // removed field. Reserved so the number cannot be reused.
    reserved 5;

    string client_name = 1;
    string path_prefix = 2;
    int64 since_ns = 3;
    int32 signature = 4;
    repeated string path_prefixes = 6;
    int32 client_id = 7;
    int64 until_ns = 8;
    int32 client_epoch = 9;
    repeated string directories = 10;          // exact directory to watch
    bool client_supports_batching = 11;        // client can unpack SubscribeMetadataResponse.events
    bool client_supports_metadata_chunks = 12; // client can read log file chunks from volume servers
    bool client_supports_idle_heartbeat = 13;  // server may send empty responses carrying the current time while the client is caught up
}
|
|
// One streamed item of SubscribeMetadata / SubscribeLocalMetadata. It is also
// embedded as metadata_event in several mutation responses, and is
// self-referential through events.
message SubscribeMetadataResponse {
    string directory = 1;
    EventNotification event_notification = 2;
    int64 ts_ns = 3;
    repeated SubscribeMetadataResponse events = 4; // batch of additional events (backlog catch-up)
    repeated LogFileChunkRef log_file_refs = 5;    // log file chunk refs for client direct-read
}
|
|
// A persisted log file that the client can read directly from volume servers.
// The file format is: [4-byte size | protobuf LogEntry] repeated.
// Each LogEntry.Data contains a marshaled SubscribeMetadataResponse.
message LogFileChunkRef {
    repeated FileChunk chunks = 1; // chunk references (fids) to read from volume servers
    int64 file_ts_ns = 2;          // minute-level timestamp of the log file
    string filer_id = 3;           // filer signature suffix from log filename
}
|
|
|
|
// Request for SeaweedFiler.TraverseBfsMetadata (server-streaming).
message TraverseBfsMetadataRequest {
    string directory = 1;
    repeated string excluded_prefixes = 2;
}
|
|
// One streamed item of SeaweedFiler.TraverseBfsMetadata.
message TraverseBfsMetadataResponse {
    string directory = 1;
    Entry entry = 2;
}
|
|
|
|
// LogEntry is the record type of persisted log files (see LogFileChunkRef).
message LogEntry {
    int64 ts_ns = 1;
    int32 partition_key_hash = 2;
    bytes data = 3;
    bytes key = 4;
    int64 offset = 5; // Sequential offset within partition
}
|
|
|
|
// KeepConnectedRequest is not referenced by any RPC of the SeaweedFiler
// service as declared in this file.
message KeepConnectedRequest {
    string name = 1;
    uint32 grpc_port = 2;
    repeated string resources = 3;
}
|
|
// KeepConnectedResponse declares no fields; it is not referenced by any RPC of
// the SeaweedFiler service as declared in this file.
message KeepConnectedResponse {
}
|
|
|
|
// LocateBrokerRequest is not referenced by any RPC of the SeaweedFiler service
// as declared in this file.
message LocateBrokerRequest {
    string resource = 1;
}
|
|
|
|
// LocateBrokerResponse is not referenced by any RPC of the SeaweedFiler
// service as declared in this file.
message LocateBrokerResponse {
    bool found = 1;
    // if found, send the exact address
    // if not found, send the full list of existing brokers
    message Resource {
        string grpc_addresses = 1; // plural name, but a single string
        int32 resource_count = 2;
    }
    repeated Resource resources = 2;
}
|
|
|
|
/////////////////////////
|
|
// Key-Value operations
|
|
/////////////////////////
|
|
// Request for SeaweedFiler.KvGet.
message KvGetRequest {
    bytes key = 1;
}
|
|
// Response for SeaweedFiler.KvGet.
message KvGetResponse {
    bytes value = 1;
    string error = 2;
}
|
|
// Request for SeaweedFiler.KvPut.
message KvPutRequest {
    bytes key = 1;
    bytes value = 2;
}
|
|
// Response for SeaweedFiler.KvPut.
message KvPutResponse {
    string error = 1;
}
|
|
|
|
/////////////////////////
|
|
// path-based configurations
|
|
/////////////////////////
|
|
// FilerConf is the path-based configuration document: a version plus a list of
// PathConf rules, each identified by its location_prefix.
message FilerConf {
    int32 version = 1;

    // PathConf is one rule.
    // NOTE(review): how overlapping prefixes are resolved is not stated here — confirm.
    message PathConf {
        string location_prefix = 1;
        string collection = 2;
        string replication = 3;
        string ttl = 4;
        string disk_type = 5;
        bool fsync = 6;
        uint32 volume_growth_count = 7;
        bool read_only = 8;
        string data_center = 9;
        string rack = 10;
        string data_node = 11;
        uint32 max_file_name_length = 12; // see FilerError.ENTRY_NAME_TOO_LONG
        bool disable_chunk_deletion = 13;
        bool worm = 14;
        uint64 worm_grace_period_seconds = 15;
        uint64 worm_retention_time_seconds = 16;
    }

    repeated PathConf locations = 2;
}
|
|
|
|
/////////////////////////
|
|
// Remote Storage related
|
|
/////////////////////////
|
|
// Request for SeaweedFiler.CacheRemoteObjectToLocalCluster.
message CacheRemoteObjectToLocalClusterRequest {
    string directory = 1;
    string name = 2;
    int32 chunk_concurrency = 3;    // parallel chunk downloads per file, 0 = default (8)
    int32 download_concurrency = 4; // multipart download concurrency per chunk (if supported by remote storage), 0 = default (5 for S3)
}
|
|
// Response for SeaweedFiler.CacheRemoteObjectToLocalCluster.
message CacheRemoteObjectToLocalClusterResponse {
    Entry entry = 1;
    SubscribeMetadataResponse metadata_event = 2;
}
|
|
|
|
/////////////////////////
|
|
// distributed lock management
|
|
/////////////////////////
|
|
// Request for SeaweedFiler.DistributedLock.
message LockRequest {
    string name = 1;
    int64 seconds_to_lock = 2;
    string renew_token = 3;
    bool is_moved = 4;
    string owner = 5;
}
|
|
// Response for SeaweedFiler.DistributedLock.
message LockResponse {
    string renew_token = 1;
    string lock_owner = 2;
    string lock_host_moved_to = 3;
    string error = 4;
    int64 generation = 5;
}
|
|
// Request for SeaweedFiler.DistributedUnlock.
message UnlockRequest {
    string name = 1;
    string renew_token = 2;
    bool is_moved = 3;
}
|
|
// Response for SeaweedFiler.DistributedUnlock.
message UnlockResponse {
    string error = 1;
    string moved_to = 2;
}
|
|
// Request for SeaweedFiler.FindLockOwner.
message FindLockOwnerRequest {
    string name = 1;
    bool is_moved = 2;
}
|
|
// Response for SeaweedFiler.FindLockOwner.
message FindLockOwnerResponse {
    string owner = 1;
}
|
|
// Lock is one distributed-lock record, the element type of
// TransferLocksRequest.locks.
message Lock {
    string name = 1;
    string renew_token = 2;
    int64 expired_at_ns = 3;
    string owner = 4;
    int64 generation = 5;
    bool is_backup = 6;
    int64 seq = 7;
}
|
|
// Request for SeaweedFiler.TransferLocks (marked internal-use in the service).
message TransferLocksRequest {
    repeated Lock locks = 1;
}
|
|
// Response for SeaweedFiler.TransferLocks; declares no fields.
message TransferLocksResponse {
}
|
|
// Request for SeaweedFiler.ReplicateLock. Fields 1-5 and 7 mirror Lock; field 6
// is is_unlock here where Lock has is_backup.
message ReplicateLockRequest {
    string name = 1;
    string renew_token = 2;
    int64 expired_at_ns = 3;
    string owner = 4;
    int64 generation = 5;
    bool is_unlock = 6;
    int64 seq = 7;
}
|
|
// Response for SeaweedFiler.ReplicateLock; declares no fields.
message ReplicateLockResponse {
}
|
|
|
|
//////////////////////////////////////////////////
|
|
// StreamMutateEntry: ordered bidirectional streaming for all filer mutations.
// All create/update/delete/rename operations from a single mount go through
// one stream, preserving mutation ordering and eliminating per-request
// connection overhead.

// One client-to-server item of the StreamMutateEntry stream.
message StreamMutateEntryRequest {
    uint64 request_id = 1; // StreamMutateEntryResponse carries a request_id as well
    // Exactly one mutation per item.
    oneof request {
        CreateEntryRequest create_request = 2;
        UpdateEntryRequest update_request = 3;
        DeleteEntryRequest delete_request = 4;
        StreamRenameEntryRequest rename_request = 5;
    }
}
|
|
|
|
// One server-to-client item of the StreamMutateEntry stream.
message StreamMutateEntryResponse {
    uint64 request_id = 1; // StreamMutateEntryRequest carries a request_id as well
    bool is_last = 2; // always true except for rename, which sends multiple events
    // At most one arm is set, matching the kind of request.
    oneof response {
        CreateEntryResponse create_response = 3;
        UpdateEntryResponse update_response = 4;
        DeleteEntryResponse delete_response = 5;
        StreamRenameEntryResponse rename_response = 6;
    }
    string error = 7; // human-readable error message when the operation failed
    // NOTE(review): errno numbering is platform-specific (ENOTEMPTY is 66 on
    // BSD/macOS but 39 on Linux) — confirm which table sender and receiver
    // agree on. `errno` is also a macro in C/C++, which matters if C++ code is
    // ever generated from this file.
    int32 errno = 8; // POSIX errno (e.g. ENOENT=2, ENOTEMPTY=66) for direct FUSE status mapping
}
|
|
|
|
//////////////////////////////////////////////////
|
|
// Peer chunk sharing — mount-server registry
|
|
//////////////////////////////////////////////////
|
|
|
|
// Request for SeaweedFiler.MountRegister.
message MountRegisterRequest {
    string peer_addr = 1;   // host:port where this mount serves peer chunk requests
    string rack = 2;        // locality label (rack); used for peer ranking
    int32 ttl_seconds = 3;  // how long the filer should keep this entry without a heartbeat
    string data_center = 4; // locality label (data center); coarser than rack
}
|
|
|
|
// Response for SeaweedFiler.MountRegister; declares no fields.
message MountRegisterResponse {
}
|
|
|
|
// Request for SeaweedFiler.MountList; declares no fields.
message MountListRequest {
}
|
|
|
|
// Response for SeaweedFiler.MountList.
message MountListResponse {
    repeated MountInfo mounts = 1;
}
|
|
|
|
// MountInfo is one registry record returned by MountList. peer_addr, rack and
// data_center share names with MountRegisterRequest fields.
message MountInfo {
    string peer_addr = 1;
    string rack = 2;
    int64 last_seen_ns = 3;
    string data_center = 4;
}
|