Compare commits

...

19 Commits

Author SHA1 Message Date
jkunz 7020810b5e v6.5.0 2026-05-02 11:14:15 +00:00
jkunz 7f2546e041 feat(bucket-tenants): add persisted bucket-scoped tenant credentials with bucket export and import APIs 2026-05-02 11:14:15 +00:00
jkunz 53d663597a v6.4.1 2026-04-30 08:23:22 +00:00
jkunz 440197ccf3 fix(build): tighten TypeScript compiler settings and refresh package metadata 2026-04-30 08:23:22 +00:00
jkunz c8d3ed79aa v6.4.0 2026-04-30 06:08:42 +00:00
jkunz a31e477359 feat(cluster,server,auth): add operational health endpoints, persist cluster topology, and hide credential secrets from runtime listings 2026-04-30 06:08:42 +00:00
jkunz c2b40ee240 v6.3.3 2026-04-19 12:22:53 +00:00
jkunz 0db138bf42 fix(build): rename npmextra config to .smartconfig and refresh build metadata 2026-04-19 12:22:53 +00:00
jkunz 0e9862efca feat: enhance storage stats and cluster health reporting
- Introduced new data structures for bucket and storage statistics, including BucketSummary, StorageStats, and ClusterHealth.
- Implemented runtime statistics tracking for buckets, including object count and total size.
- Added methods to retrieve storage stats and bucket summaries in the FileStore.
- Enhanced the SmartStorage interface to expose storage stats and cluster health.
- Implemented tests for runtime stats, cluster health, and credential management.
- Added support for runtime-managed credentials with atomic replacement.
- Improved filesystem usage reporting for storage locations.
2026-04-19 11:57:28 +00:00
jkunz c683b02e8c v6.3.2 2026-03-23 21:21:50 +00:00
jkunz b64be03c2f fix(docs): update license ownership and correct README license file reference 2026-03-23 21:21:50 +00:00
jkunz 494dac1267 v6.3.1 2026-03-21 22:19:51 +00:00
jkunz cea3407777 fix(cluster): improve shard reconstruction validation and start background healing service 2026-03-21 22:19:51 +00:00
jkunz a009d990d0 v6.3.0 2026-03-21 22:04:36 +00:00
jkunz 08d545f5db feat(readme): document distributed cluster mode, erasure coding, and QUIC-based architecture 2026-03-21 22:04:36 +00:00
jkunz a0a282c712 v6.2.0 2026-03-21 22:00:41 +00:00
jkunz 3eb0045676 feat(cluster): add shard healing, drive health heartbeats, and clustered policy directory support 2026-03-21 22:00:41 +00:00
jkunz 639eb5d36c v6.1.0 2026-03-21 21:50:42 +00:00
jkunz d12d321079 feat(cluster): add clustered storage backend with QUIC transport, erasure coding, and shard management 2026-03-21 21:50:42 +00:00
42 changed files with 13128 additions and 4029 deletions
+52
@@ -0,0 +1,52 @@
{
"@git.zone/tsrust": {
"targets": [
"linux_amd64",
"linux_arm64"
]
},
"@git.zone/cli": {
"projectType": "npm",
"module": {
"githost": "code.foss.global",
"gitscope": "push.rocks",
"gitrepo": "smartstorage",
"description": "A Node.js TypeScript package to create a local S3-compatible storage server using mapped local directories for development and testing purposes.",
"npmPackagename": "@push.rocks/smartstorage",
"license": "MIT",
"projectDomain": "push.rocks",
"keywords": [
"smartstorage",
"S3 Compatible",
"Local Storage Server",
"Node.js",
"TypeScript",
"Local Development",
"Testing",
"Cloud Storage",
"File Storage",
"AWS S3 Compatibility",
"Development Tool",
"Storage Endpoint",
"Storage Simulation",
"Bucket Management",
"File Upload",
"CI/CD Integration",
"Developer Onboarding"
]
},
"release": {
"registries": [
"https://verdaccio.lossless.digital",
"https://registry.npmjs.org"
],
"accessLevel": "public"
}
},
"@git.zone/tsdoc": {
"legal": "\n## License and Legal Information\n\nThis repository contains open-source code licensed under the MIT License. A copy of the license can be found in the [license](./license) file.\n\n**Please note:** The MIT License does not grant permission to use the trade names, trademarks, service marks, or product names of the project, except as required for reasonable and customary use in describing the origin of the work and reproducing the content of the NOTICE file.\n\n### Trademarks\n\nThis project is owned and maintained by Task Venture Capital GmbH. The names and logos associated with Task Venture Capital GmbH and any related products or services are trademarks of Task Venture Capital GmbH or third parties, and are not included within the scope of the MIT license granted herein.\n\nUse of these trademarks must comply with Task Venture Capital GmbH's Trademark Guidelines or the guidelines of the respective third-party owners, and any usage must be approved in writing. Third-party trademarks used herein are the property of their respective owners and used only in a descriptive manner, e.g. for an implementation of an API or similar.\n\n### Company Information\n\nTask Venture Capital GmbH \nRegistered at District Court Bremen HRB 35230 HB, Germany\n\nFor any legal inquiries or further information, please contact us via email at hello@task.vc.\n\nBy using this repository, you acknowledge that you have read this section, agree to comply with its terms, and understand that the licensing of the code does not imply endorsement by Task Venture Capital GmbH of any derivative works.\n"
},
"@ship.zone/szci": {
"npmGlobalTools": []
}
}
+1 -1
@@ -1,7 +1,7 @@
{
  "json.schemas": [
    {
-      "fileMatch": ["/npmextra.json"],
+      "fileMatch": ["/.smartconfig.json"],
      "schema": {
        "type": "object",
        "properties": {
+97
@@ -1,5 +1,102 @@
# Changelog
## 2026-05-02 - 6.5.0 - feat(bucket-tenants)
add persisted bucket-scoped tenant credentials with bucket export and import APIs
- Adds bucket tenant management APIs for creating, rotating, listing, retrieving, and deleting scoped per-bucket credentials.
- Persists runtime credentials under the storage directory so tenant and replaced credentials survive restarts.
- Enforces tenant bucket isolation in auth, including blocking cross-bucket access and copy operations.
- Adds bucket export/import support using the smartstorage.bucket.v1 JSON format.
- Introduces health and metrics APIs plus test coverage for tenant lifecycle, persistence, policy retention, and AWS SDK compatibility.
## 2026-04-30 - 6.4.1 - fix(build)
tighten TypeScript compiler settings and refresh package metadata
- enable noImplicitAny in tsconfig and align the build script with strict compilation
- update package metadata including author, repository URL, and pnpm version
- bump dependency versions for @aws-sdk/client-s3 and @tsclass/tsclass
- refresh README hints and legal text to match the current package setup
## 2026-04-30 - 6.4.0 - feat(cluster,server,auth)
add operational health endpoints, persist cluster topology, and hide credential secrets from runtime listings
- persist cluster identity and topology snapshots under .smartstorage/cluster to support safer clustered restarts and seed-node joins
- add unauthenticated /-/live, /-/ready, /-/health, and /-/metrics endpoints with basic request and storage metrics
- route clustered shard read/write/delete/head operations by drive index and handle join, heartbeat, and topology sync over QUIC
- change runtime credential listing to return access-key metadata only, excluding secretAccessKey values
- add tests for operational endpoints and multi-node cluster persistence and recovery behavior
## 2026-04-19 - 6.3.3 - fix(build)
rename npmextra config to .smartconfig and refresh build metadata
- renames the published project config file from npmextra.json to .smartconfig.json
- updates build and tooling dependencies to newer patch and minor versions
- adds Node type configuration and TypeScript deprecation handling in tsconfig
- refreshes README documentation to match the current build command and runtime management APIs
## Next - feat(credentials)
add runtime credential management APIs
- Expose `listCredentials()` and `replaceCredentials()` through the Rust bridge and the `SmartStorage` TypeScript API.
- Move request authentication onto a native runtime credential store so credential replacement is atomic and effective for new requests immediately without a restart.
- Validate replacement input cleanly by rejecting empty replacement sets, empty credential fields, and duplicate `accessKeyId` values.
- Add runtime credential rotation tests covering initial auth, revocation of old credentials, multiple active credentials, and invalid replacements.
## Next - feat(cluster-health)
add runtime cluster and drive health introspection
- Expose `getClusterHealth()` through the Rust bridge and the `SmartStorage` TypeScript API.
- Report native cluster mode state including local node id, peer status, local drive probe health, quorum health, erasure settings, and tracked healing runtime state.
- Return a clear `{ enabled: false }` response when clustering is not active instead of synthesizing config-based data.
- Add standalone and single-node cluster tests plus README documentation for the best-effort semantics of peer and repair health values.
## Next - feat(stats)
add runtime bucket summaries and storage stats
- Expose `getStorageStats()` and `listBucketSummaries()` through the Rust bridge and the `SmartStorage` TypeScript API.
- Maintain native runtime stats for bucket counts, object counts, and logical stored bytes, initialized from on-disk state at startup and updated on bucket/object mutations.
- Include cheap filesystem-capacity snapshots for the storage directory or configured cluster drive paths.
- Add AWS SDK integration coverage for object add, delete, and bucket delete stats flows and document the cache consistency semantics in the README.
## 2026-03-23 - 6.3.2 - fix(docs)
update license ownership and correct README license file reference
- Adjusts the copyright holder name in the license file
- Fixes the README link to match the lowercase license filename
## 2026-03-21 - 6.3.1 - fix(cluster)
improve shard reconstruction validation and start background healing service
- use the erasure read quorum when reconstructing chunks instead of assuming data shard count
- verify reconstructed shards before writing healed data back to disk
- start the healing service during server initialization with shared local shard stores
- simplify QUIC request handling by decoding the full request buffer including trailing shard data
- clean up unused variables and imports across cluster modules
## 2026-03-21 - 6.3.0 - feat(readme)
document distributed cluster mode, erasure coding, and QUIC-based architecture
- Expand README overview and feature matrix to highlight clustering, multi-drive awareness, and distributed storage capabilities
- Add standalone and cluster mode usage examples plus cluster configuration options
- Document clustering internals including erasure coding, quorum behavior, QUIC transport, self-healing, and on-disk layout
## 2026-03-21 - 6.2.0 - feat(cluster)
add shard healing, drive health heartbeats, and clustered policy directory support
- implements manifest-based healing that scans affected shards on offline nodes, reconstructs data with erasure coding, and rewrites recovered shards to local storage
- includes drive status reporting in membership heartbeats by wiring DriveManager health checks into cluster heartbeat messages
- adds clustered policies directory initialization and exposes policy storage paths from the distributed coordinator
- extends distributed coordinator support for remote shard read and delete operations plus multipart upload session metadata
## 2026-03-21 - 6.1.0 - feat(cluster)
add clustered storage backend with QUIC transport, erasure coding, and shard management
- introduces cluster configuration in Rust and TypeScript, including seed nodes, drive paths, heartbeat settings, and erasure coding options
- adds core cluster modules for membership, topology state, object manifests, placement, shard storage, drive management, healing scaffolding, and inter-node protocol handling
- adds QUIC-based transport for cluster communication and integrates a distributed storage backend alongside the existing standalone FileStore
- updates the server startup path to initialize standalone or clustered storage based on configuration and exposes a basic clusterStatus management endpoint
- refreshes build and dependency versions to support the new clustered storage implementation
## 2026-03-14 - 6.0.1 - fix(rust-bridge)
update smartrust and limit RustBridge binary lookup to dist_rust
+3 -1
@@ -1,4 +1,6 @@
-Copyright (c) 2021 Lossless GmbH (hello@lossless.com)
+MIT License
+
+Copyright (c) Task Venture Capital GmbH

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
-52
@@ -1,52 +0,0 @@
{
"@git.zone/tsrust": {
"targets": [
"linux_amd64",
"linux_arm64"
]
},
"@git.zone/cli": {
"projectType": "npm",
"module": {
"githost": "code.foss.global",
"gitscope": "push.rocks",
"gitrepo": "smartstorage",
"description": "A Node.js TypeScript package to create a local S3-compatible storage server using mapped local directories for development and testing purposes.",
"npmPackagename": "@push.rocks/smartstorage",
"license": "MIT",
"projectDomain": "push.rocks",
"keywords": [
"smartstorage",
"S3 Compatible",
"Local Storage Server",
"Node.js",
"TypeScript",
"Local Development",
"Testing",
"Cloud Storage",
"File Storage",
"AWS S3 Compatibility",
"Development Tool",
"Storage Endpoint",
"Storage Simulation",
"Bucket Management",
"File Upload",
"CI/CD Integration",
"Developer Onboarding"
]
},
"release": {
"registries": [
"https://verdaccio.lossless.digital",
"https://registry.npmjs.org"
],
"accessLevel": "public"
}
},
"@git.zone/tsdoc": {
"legal": "\n## License and Legal Information\n\nThis repository contains open-source code that is licensed under the MIT License. A copy of the MIT License can be found in the [license](license) file within this repository. \n\n**Please note:** The MIT License does not grant permission to use the trade names, trademarks, service marks, or product names of the project, except as required for reasonable and customary use in describing the origin of the work and reproducing the content of the NOTICE file.\n\n### Trademarks\n\nThis project is owned and maintained by Task Venture Capital GmbH. The names and logos associated with Task Venture Capital GmbH and any related products or services are trademarks of Task Venture Capital GmbH and are not included within the scope of the MIT license granted herein. Use of these trademarks must comply with Task Venture Capital GmbH's Trademark Guidelines, and any usage must be approved in writing by Task Venture Capital GmbH.\n\n### Company Information\n\nTask Venture Capital GmbH \nRegistered at District court Bremen HRB 35230 HB, Germany\n\nFor any legal inquiries or if you require further information, please contact us via email at hello@task.vc.\n\nBy using this repository, you acknowledge that you have read this section, agree to comply with its terms, and understand that the licensing of the code does not imply endorsement by Task Venture Capital GmbH of any derivative works.\n"
},
"@ship.zone/szci": {
"npmGlobalTools": []
}
}
+16 -18
@@ -1,27 +1,28 @@
{
  "name": "@push.rocks/smartstorage",
-  "version": "6.0.1",
+  "version": "6.5.0",
  "private": false,
  "description": "A Node.js TypeScript package to create a local S3-compatible storage server using mapped local directories for development and testing purposes.",
  "main": "dist_ts/index.js",
  "typings": "dist_ts/index.d.ts",
  "type": "module",
-  "author": "Lossless GmbH",
+  "author": "Task Venture Capital GmbH",
  "license": "MIT",
  "scripts": {
+    "test:before": "(tsrust)",
    "test": "(tstest test/ --web --verbose --logfile --timeout 60)",
-    "build": "(tsrust && tsbuild --web --allowimplicitany)",
+    "build": "(tsrust && tsbuild tsfolders)",
    "buildDocs": "tsdoc"
  },
  "devDependencies": {
-    "@aws-sdk/client-s3": "^3.937.0",
+    "@aws-sdk/client-s3": "^3.1039.0",
-    "@git.zone/tsbuild": "^3.1.0",
+    "@git.zone/tsbuild": "^4.4.0",
-    "@git.zone/tsbundle": "^2.5.2",
+    "@git.zone/tsbundle": "^2.10.0",
-    "@git.zone/tsrun": "^2.0.0",
+    "@git.zone/tsrun": "^2.0.2",
-    "@git.zone/tsrust": "^1.3.0",
+    "@git.zone/tsrust": "^1.3.2",
-    "@git.zone/tstest": "^3.1.0",
+    "@git.zone/tstest": "^3.6.3",
-    "@push.rocks/smartbucket": "^4.3.0",
+    "@push.rocks/smartbucket": "^4.6.0",
-    "@types/node": "^22.9.0"
+    "@types/node": "^25.6.0"
  },
  "browserslist": [
    "last 1 chrome versions"
@@ -36,13 +37,13 @@
    "dist_ts_web/**/*",
    "assets/**/*",
    "cli.js",
-    "npmextra.json",
+    ".smartconfig.json",
    "readme.md"
  ],
  "dependencies": {
    "@push.rocks/smartpath": "^6.0.0",
    "@push.rocks/smartrust": "^1.3.2",
-    "@tsclass/tsclass": "^9.3.0"
+    "@tsclass/tsclass": "^9.5.1"
  },
  "keywords": [
    "smartstorage",
@@ -66,13 +67,10 @@
  "homepage": "https://code.foss.global/push.rocks/smartstorage#readme",
  "repository": {
    "type": "git",
-    "url": "ssh://git@code.foss.global:29419/push.rocks/smartstorage.git"
+    "url": "https://code.foss.global/push.rocks/smartstorage.git"
  },
  "bugs": {
    "url": "https://code.foss.global/push.rocks/smartstorage/issues"
  },
-  "packageManager": "pnpm@10.14.0+sha512.ad27a79641b49c3e481a16a805baa71817a04bbe06a38d17e60e2eaee83f6a146c6a688125f5792e48dd5ba30e7da52a5cda4c3992b9ccf333f9ce223af84748",
-  "pnpm": {
-    "overrides": {}
-  }
+  "packageManager": "pnpm@10.28.2"
}
+2988 -3743
File diff suppressed because it is too large.
+40 -4
@@ -1,6 +1,6 @@
# Project Hints for smartstorage

-## Current State (v6.0.0)
+## Current State (v6.4.0)

- **Rust-powered S3-compatible storage server** via `@push.rocks/smartrust` IPC bridge
- High-performance: streaming I/O, zero-copy, backpressure, range seek
@@ -11,6 +11,15 @@
- **Bucket policies** (AWS/MinIO-compatible JSON policies, public access support)
- CORS support
- ListBuckets, ListObjects (v1/v2), CopyObject
- Runtime bucket summaries and storage stats via the Rust bridge (no S3 list scans)
- Cluster health introspection via the Rust bridge (node membership, local drive probes, quorum, healing state)
- Runtime credential listing and atomic replacement via the Rust bridge
- Runtime credentials persist under `{storage}/.smartstorage/credentials.json`
- Bucket tenant APIs provision scoped per-bucket credentials and enforce the scope before bucket-policy/default-auth authorization
- Per-bucket export/import uses `smartstorage.bucket.v1` JSON with object payloads encoded per object
- Cluster identity and topology snapshots persist under `{storage}/.smartstorage/cluster/`
- S3-side operational endpoints are available at `/-/live`, `/-/ready`, `/-/health`, and `/-/metrics`
- Runtime credential listing returns access-key metadata only; secrets are write-only
## Architecture
@@ -20,6 +29,7 @@
- `management.rs` - IPC loop (newline-delimited JSON over stdin/stdout)
- `server.rs` - hyper 1.x HTTP server, routing, CORS, auth+policy pipeline, all S3-compatible handlers
- `storage.rs` - FileStore: filesystem-backed storage, multipart manager, `.policies/` dir
- `storage.rs` also owns the runtime stats cache and standalone storage scans used by the bridge stats API
- `xml_response.rs` - S3-compatible XML response builders
- `error.rs` - StorageError codes with HTTP status mapping
- `auth.rs` - AWS SigV4 signature verification (HMAC-SHA256, clock skew, constant-time compare)
@@ -37,6 +47,27 @@
| `start` | `{ config: ISmartStorageConfig }` | Init storage + HTTP server |
| `stop` | `{}` | Graceful shutdown |
| `createBucket` | `{ name: string }` | Create bucket directory |
| `createBucketTenant` | `{ bucketName, accessKeyId, secretAccessKey, region? }` | Create bucket and scoped persisted credential |
| `deleteBucketTenant` | `{ bucketName, accessKeyId? }` | Revoke scoped credential or delete tenant bucket recursively |
| `rotateBucketTenantCredentials` | `{ bucketName, accessKeyId, secretAccessKey, region? }` | Replace scoped credential for one bucket |
| `listBucketTenants` | `{}` | Return scoped credential metadata |
| `getBucketTenantCredential` | `{ bucketName }` | Return one scoped credential including secret for descriptor generation |
| `exportBucket` | `{ bucketName }` | Export one bucket's objects and metadata |
| `importBucket` | `{ bucketName, source }` | Import a `smartstorage.bucket.v1` bucket export |
| `getStorageStats` | `{}` | Return cached bucket/global runtime stats + storage location capacity snapshots |
| `listBucketSummaries` | `{}` | Return cached per-bucket runtime summaries |
| `listCredentials` | `{}` | Return the active runtime auth credential set |
| `replaceCredentials` | `{ credentials: IStorageCredential[] }` | Atomically replace the runtime auth credential set |
| `getClusterHealth` | `{}` | Return runtime cluster health or `{ enabled: false }` in standalone mode |
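
Illustrative sketch of driving this IPC by hand. The newline-delimited JSON transport and the command names come from the table above; the `{ command, payload }` envelope and the response shape are assumptions:

```typescript
// Sketch only: the transport (newline-delimited JSON over stdin/stdout)
// and command names are documented; the { command, payload } envelope
// and the response shape are assumptions for illustration.
import { spawn } from 'node:child_process';
import { createInterface } from 'node:readline';

const child = spawn('./dist_rust/ruststorage', ['--management']);
const responses = createInterface({ input: child.stdout! });

responses.on('line', (line) => {
  // One JSON response object per stdout line.
  console.log('response:', JSON.parse(line));
});

const send = (command: string, payload: unknown) => {
  // One JSON command object per stdin line.
  child.stdin!.write(JSON.stringify({ command, payload }) + '\n');
};

send('getStorageStats', {});
send('listCredentials', {});
```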
### Operational HTTP Endpoints
| Endpoint | Purpose |
|----------|---------|
| `GET /-/live` | Process liveness |
| `GET /-/ready` | S3 readiness and cluster quorum readiness |
| `GET /-/health` | JSON storage, cluster, and runtime health |
| `GET /-/metrics` | Prometheus text metrics |
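
Example probe against these endpoints, assuming the default S3 port 3000:

```typescript
// Probe the unauthenticated operational endpoints.
const base = 'http://localhost:3000';

console.log('live:', (await fetch(`${base}/-/live`)).status);
console.log('ready:', (await fetch(`${base}/-/ready`)).status);

// /-/health returns JSON; /-/metrics returns Prometheus text.
console.log(await (await fetch(`${base}/-/health`)).json());
console.log(await (await fetch(`${base}/-/metrics`)).text());
```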
### Storage Layout

- Objects: `{root}/{bucket}/{key}._storage_object`
@@ -44,12 +75,13 @@
- MD5: `{root}/{bucket}/{key}._storage_object.md5`
- Multipart: `{root}/.multipart/{upload_id}/part-{N}`
- Policies: `{root}/.policies/{bucket}.policy.json`
- Runtime credentials: `{root}/.smartstorage/credentials.json`
## Build

-- `pnpm build` runs `tsrust && tsbuild --web --allowimplicitany`
+- `pnpm build` runs `tsrust && tsbuild tsfolders`
- `tsrust` compiles Rust to `dist_rust/ruststorage`
-- Targets: linux_amd64, linux_arm64 (configured in npmextra.json)
+- Targets: linux_amd64, linux_arm64 (configured in .smartconfig.json)
## Dependencies
@@ -60,7 +92,11 @@
## Testing

-- `test/test.aws-sdk.node.ts` - AWS SDK v3 compatibility (10 tests, auth disabled, port 3337)
+- `test/test.aws-sdk.node.ts` - AWS SDK v3 compatibility + runtime stats + standalone cluster health coverage (19 tests, auth disabled, port 3337)
- `test/test.bucket-tenants.node.ts` - bucket tenant provisioning, per-bucket isolation, restart persistence, export/import, policy persistence, rotation, revoke/delete, AWS SDK v3 compatibility (12 tests, port 3361)
- `test/test.credentials.node.ts` - runtime credential rotation coverage (10 tests, auth enabled, port 3349)
- `test/test.health-http.node.ts` - unauthenticated operational endpoint coverage (3 tests, port 3353)
- `test/test.cluster-health.node.ts` - single-node cluster health coverage (4 tests, S3 port 3348, QUIC port 4348)
- `test/test.auth.node.ts` - Auth + bucket policy integration (20 tests, auth enabled, port 3344)
- `test/test.policy-crud.node.ts` - Policy API CRUD + validation edge cases (17 tests, port 3345)
- `test/test.policy-eval.node.ts` - Policy evaluation: principals, actions, resources, deny-vs-allow (22 tests, port 3346)
+421 -47
@@ -1,6 +1,6 @@
# @push.rocks/smartstorage

-A high-performance, S3-compatible local storage server powered by a **Rust core** with a clean TypeScript API. Drop-in replacement for AWS S3 during development and testing — no cloud, no Docker, no MinIO. Just `npm install` and go.
+A high-performance, S3-compatible storage server powered by a **Rust core** with a clean TypeScript API. Runs standalone for dev/test — or scales out as a **distributed, erasure-coded cluster** with QUIC-based inter-node communication. No cloud, no Docker. Just install the package and go. 🚀

## Issue Reporting and Security
@@ -15,23 +15,38 @@ For reporting bugs, issues, or security vulnerabilities, please visit [community
| Large file uploads | Streaming, zero-copy | Yes | OOM risk |
| Range requests | Seek-based | Yes | Full read |
| Language | Rust + TypeScript | Go | JavaScript |
| Multipart uploads | Full support | Yes | No |
| Auth | AWS SigV4 (full verification) | Full IAM | Basic |
| Bucket policies | IAM-style evaluation | Yes | No |
| Clustering | ✅ Erasure-coded, QUIC | Yes | No |
| Multi-drive awareness | ✅ Per-drive health | Yes | No |
### Core Features

-- **Rust-powered HTTP server** — hyper 1.x with streaming I/O, zero-copy, backpressure
+- 🦀 **Rust-powered HTTP server** — hyper 1.x with streaming I/O, zero-copy, backpressure
-- **Full S3-compatible API** — works with AWS SDK v3, SmartBucket, any S3 client
+- 📦 **Full S3-compatible API** — works with AWS SDK v3, SmartBucket, any S3 client
-- **Filesystem-backed storage** — buckets map to directories, objects to files
+- 💾 **Filesystem-backed storage** — buckets map to directories, objects to files
-- **Streaming multipart uploads** — large files without memory pressure
+- 📤 **Streaming multipart uploads** — large files without memory pressure
-- **Byte-range requests** — `seek()` directly to the requested byte offset
+- 📐 **Byte-range requests** — `seek()` directly to the requested byte offset
-- **AWS SigV4 authentication** — full signature verification with constant-time comparison and 15-min clock skew enforcement
+- 🔐 **AWS SigV4 authentication** — full signature verification with constant-time comparison
-- **Bucket policies** — IAM-style JSON policies with Allow/Deny evaluation, wildcard matching, and anonymous access support
+- 📋 **Bucket policies** — IAM-style JSON policies with Allow/Deny evaluation and wildcard matching
-- **CORS middleware** — configurable cross-origin support
+- 🌐 **CORS middleware** — configurable cross-origin support
-- **Structured logging** — tracing-based, error through debug levels
-- **Clean slate mode** — wipe storage on startup for test isolation
-- **Test-first design** — start/stop in milliseconds, no port conflicts
+- 🧹 **Clean slate mode** — wipe storage on startup for test isolation
+- 📊 **Runtime storage stats** — cheap bucket summaries and global counts without S3 list scans
+- 🔑 **Runtime credential rotation** — list and replace active auth credentials without restarts
+- 🧩 **Bucket tenants** — provision one scoped S3 credential per bucket with restart persistence
+- ⚡ **Test-first design** — start/stop in milliseconds, no port conflicts
### Clustering Features
- 🔗 **Erasure coding** — Reed-Solomon (configurable k data + m parity shards) for storage efficiency and fault tolerance
- 🚄 **QUIC transport** — multiplexed, encrypted inter-node communication via `quinn` with zero head-of-line blocking
- 💽 **Multi-drive awareness** — each node manages multiple independent storage paths with health monitoring
- 🩺 **Cluster health introspection** — query native node, drive, quorum, and healing status for product dashboards
- 🤝 **Cluster membership** — static seed config + runtime join, heartbeat-based failure detection
- ✍️ **Quorum writes** — data is only acknowledged after k+1 shards are persisted
- 📖 **Quorum reads** — reconstruct from any k available shards, local-first fast path
- 🩹 **Self-healing** — background scanner detects and reconstructs missing/corrupt shards
## Installation
@@ -43,6 +58,8 @@ pnpm add @push.rocks/smartstorage -D
## Quick Start

+### Standalone Mode (Dev & Test)

```typescript
import { SmartStorage } from '@push.rocks/smartstorage';
@@ -63,6 +80,31 @@ const descriptor = await storage.getStorageDescriptor();
await storage.stop();
```
### Cluster Mode (Distributed)
```typescript
import { SmartStorage } from '@push.rocks/smartstorage';
const storage = await SmartStorage.createAndStart({
server: { port: 3000 },
cluster: {
enabled: true,
nodeId: 'node-1',
quicPort: 4000,
seedNodes: ['192.168.1.11:4000', '192.168.1.12:4000'],
erasure: {
dataShards: 4, // k: minimum shards to reconstruct data
parityShards: 2, // m: fault tolerance (can lose up to m shards)
},
drives: {
paths: ['/mnt/disk1', '/mnt/disk2', '/mnt/disk3'],
},
},
});
```
Objects are automatically split into chunks (default 4 MB), erasure-coded into 6 shards (4 data + 2 parity), and distributed across drives/nodes. Any 4 of 6 shards can reconstruct the original data.
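
As a sanity check on those numbers, here is a small sketch of the chunk/shard arithmetic under the default `4+2` layout. The rounding details are assumptions rather than the engine's exact on-disk accounting:

```typescript
// Chunk/shard arithmetic for the default layout (4 MB chunks, 4 data +
// 2 parity shards). Rounding/padding details are assumptions, not the
// engine's exact on-disk accounting.
const chunkSizeBytes = 4 * 1024 * 1024;
const dataShards = 4;
const parityShards = 2;

function shardLayout(objectBytes: number) {
  const chunks = Math.ceil(objectBytes / chunkSizeBytes);
  return {
    chunks,
    shardsPerChunk: dataShards + parityShards, // any `dataShards` of these reconstruct the chunk
    totalShards: chunks * (dataShards + parityShards),
    approxShardBytes: Math.ceil(chunkSizeBytes / dataShards),
  };
}

// A 10 MB object: 3 chunks, 18 shards, ~1 MB per full-chunk shard.
console.log(shardLayout(10 * 1024 * 1024));
```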
## Configuration

All config fields are optional — sensible defaults are applied automatically.
@@ -75,7 +117,7 @@ const config: ISmartStorageConfig = {
    port: 3000,              // Default: 3000
    address: '0.0.0.0',      // Default: '0.0.0.0'
    silent: false,           // Default: false
    region: 'us-east-1',     // Default: 'us-east-1' — used for SigV4 signing
  },
  storage: {
    directory: './my-data',  // Default: .nogit/bucketsDir
@@ -111,6 +153,22 @@ const config: ISmartStorageConfig = {
    expirationDays: 7,
    cleanupIntervalMinutes: 60,
  },
cluster: { // Optional — omit for standalone mode
enabled: true,
nodeId: 'node-1', // Auto-generated UUID if omitted
quicPort: 4000, // Default: 4000
seedNodes: [], // Addresses of existing cluster members
erasure: {
dataShards: 4, // Default: 4
parityShards: 2, // Default: 2
chunkSizeBytes: 4194304, // Default: 4 MB
},
drives: {
paths: ['/mnt/disk1', '/mnt/disk2'],
},
heartbeatIntervalMs: 5000, // Default: 5000
heartbeatTimeoutMs: 30000, // Default: 30000
},
};

const storage = await SmartStorage.createAndStart(config);
@@ -147,6 +205,173 @@ const storage = await SmartStorage.createAndStart({
});
```
## Runtime Credentials
```typescript
const credentials = await storage.listCredentials();
await storage.replaceCredentials([
{
accessKeyId: 'ADMINA',
secretAccessKey: 'super-secret-a',
},
{
accessKeyId: 'ADMINB',
secretAccessKey: 'super-secret-b',
},
]);
```
```typescript
interface IStorageCredential {
accessKeyId: string;
secretAccessKey: string;
bucketName?: string;
region?: string;
}
```
- `listCredentials()` returns the Rust core's current runtime credential set; the listing exposes access-key metadata only, with `secretAccessKey` values excluded.
- `replaceCredentials()` swaps the full set atomically and persists it under the storage root. On success, new requests use the new set immediately and the old credentials stop authenticating immediately.
- Requests that were already authenticated before the replacement keep running; auth is evaluated when each request starts.
- No restart is required, and runtime-created credentials survive restart unless `storage.cleanSlate` removes the storage directory.
- Replacement input must contain at least one credential, each `accessKeyId` and `secretAccessKey` must be non-empty, and `accessKeyId` values must be unique.
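
One pattern these semantics support is a two-phase rotation: publish the new key alongside the old one, let clients roll over, then drop the old key. A minimal sketch (key values are illustrative; the caller tracks full credential objects itself because the listing omits secrets):

```typescript
// Two-phase rotation sketch; key values are illustrative. The listing
// omits secrets, so the caller tracks the full credential objects itself.
const oldKey = { accessKeyId: 'ADMIN2025', secretAccessKey: 'old-secret' };
const newKey = { accessKeyId: 'ADMIN2026', secretAccessKey: 'fresh-secret' };

// Phase 1: both keys authenticate while clients roll over.
await storage.replaceCredentials([oldKey, newKey]);

// ...redeploy clients with the new key...

// Phase 2: revoke the old key; the swap is atomic for new requests.
await storage.replaceCredentials([newKey]);
```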
## Bucket Tenants
Bucket tenants are designed for platform services that need one bucket and one scoped S3 credential per app. Tenant credentials are enforced by the auth layer before the normal bucket-policy/default-auth pipeline, so a scoped credential cannot list all buckets or access another bucket even when it has a valid SigV4 signature.
```typescript
const tenant = await storage.createBucketTenant({
bucketName: 'workapp-123',
});
// Directly usable by AWS SDK v3 or env injection
const client = new S3Client({
endpoint: `http://${tenant.endpoint}:${tenant.port}`,
region: tenant.region,
credentials: {
accessKeyId: tenant.accessKeyId,
secretAccessKey: tenant.secretAccessKey,
},
forcePathStyle: true,
});
console.log(tenant.env.S3_BUCKET);
console.log(tenant.env.AWS_ACCESS_KEY_ID);
```
```typescript
await storage.rotateBucketTenantCredentials({ bucketName: 'workapp-123' });
await storage.deleteBucketTenant({ bucketName: 'workapp-123', accessKeyId: tenant.accessKeyId });
const descriptor = await storage.getBucketTenantDescriptor({ bucketName: 'workapp-123' });
const tenants = await storage.listBucketTenants();
```
- `createBucketTenant()` creates the bucket if needed and stores a scoped credential for that bucket.
- `rotateBucketTenantCredentials()` replaces the active scoped credential for the bucket and persists the new credential.
- `deleteBucketTenant({ bucketName, accessKeyId })` revokes one scoped credential and keeps the bucket.
- `deleteBucketTenant({ bucketName })` revokes scoped credentials for the bucket and deletes the bucket contents recursively.
- Tenant credentials can list, read, write, and delete objects in their assigned bucket, but cannot list all buckets, access other buckets, copy from other buckets, delete buckets, or mutate bucket policies.
- Bucket tenant APIs require `auth.enabled: true`.
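
For the platform-service case, the scoped credential can be handed to a tenant's process wholesale via `env`. A minimal sketch (the `app.js` child process is hypothetical; only `S3_BUCKET` and `AWS_ACCESS_KEY_ID` are documented above, so treat the full key set of `tenant.env` as an assumption):

```typescript
// Sketch: inject a tenant's scoped credential into a child process.
// `app.js` is hypothetical; the full key set of `tenant.env` is an assumption.
import { spawn } from 'node:child_process';

const tenant = await storage.createBucketTenant({ bucketName: 'workapp-123' });

spawn('node', ['app.js'], {
  env: { ...process.env, ...tenant.env },
  stdio: 'inherit',
});
```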
## Bucket Backup/Restore
```typescript
const appBackup = await storage.exportBucket({ bucketName: 'workapp-123' });
await storage.importBucket({ bucketName: 'workapp-123-restore', source: appBackup });
```
- `exportBucket()` returns a self-contained `smartstorage.bucket.v1` JSON export with only the selected bucket's objects and object metadata.
- `importBucket()` creates the target bucket if needed and restores the exported objects into that bucket.
- Exports do not include credentials, policies, or unrelated tenant data.
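
Because the export is a self-contained JSON object, a file-based backup is a plain serialize/parse round trip. A minimal sketch using Node's `fs/promises`, assuming the export object JSON-stringifies losslessly:

```typescript
// Sketch: file-based bucket backup and restore around the export APIs,
// assuming the export object JSON-stringifies losslessly.
import { readFile, writeFile } from 'node:fs/promises';

const backup = await storage.exportBucket({ bucketName: 'workapp-123' });
await writeFile('workapp-123.bucket.json', JSON.stringify(backup));

const restored = JSON.parse(await readFile('workapp-123.bucket.json', 'utf8'));
await storage.importBucket({ bucketName: 'workapp-123-restore', source: restored });
```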
## Health and Metrics APIs
```typescript
const health = await storage.getHealth();
const metrics = await storage.getMetrics();
```
- `getHealth()` reports running state, storage directory, auth enabled state, credential counts, bucket count, object count, total bytes, and cluster health.
- `getMetrics()` returns numeric counters and a Prometheus text snippet for bucket, object, byte, tenant credential, and cluster-enabled metrics.
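
A minimal sketch of periodic operational logging built on these two calls; the snapshots are logged whole because their exact field names are not spelled out here:

```typescript
// Sketch: periodic operational logging. The snapshots are logged whole
// because their exact field names are not spelled out in this README.
const timer = setInterval(async () => {
  console.log('health:', await storage.getHealth());
  console.log('metrics:', await storage.getMetrics());
}, 30_000);

// Clear the timer before calling storage.stop():
// clearInterval(timer);
```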
## Runtime Stats
```typescript
const stats = await storage.getStorageStats();
const bucketSummaries = await storage.listBucketSummaries();
console.log(stats.bucketCount);
console.log(stats.totalObjectCount);
console.log(stats.totalStorageBytes);
console.log(bucketSummaries[0]?.name, bucketSummaries[0]?.objectCount);
```
```typescript
interface IBucketSummary {
name: string;
objectCount: number;
totalSizeBytes: number;
creationDate?: number;
}
interface IStorageLocationSummary {
path: string;
totalBytes?: number;
availableBytes?: number;
usedBytes?: number;
}
interface IStorageStats {
bucketCount: number;
totalObjectCount: number;
totalStorageBytes: number;
buckets: IBucketSummary[];
storageDirectory: string;
storageLocations?: IStorageLocationSummary[];
}
```
- `bucketCount`, `totalObjectCount`, `totalStorageBytes`, and per-bucket totals are logical object stats maintained by the Rust runtime. They count object payload bytes, not sidecar files or erasure-coded shard overhead.
- smartstorage initializes these values from native on-disk state at startup, then keeps them in memory and updates them when bucket/object mutations succeed. Stats reads do not issue S3 `ListObjects` or rescan every object.
- Values are exact for mutations performed through smartstorage after startup. Direct filesystem edits outside smartstorage are not watched; restart the server to resync.
- `storageLocations` is a cheap filesystem-capacity snapshot. Standalone mode reports the storage directory. Cluster mode reports the configured drive paths.
## Cluster Health
```typescript
const clusterHealth = await storage.getClusterHealth();
if (!clusterHealth.enabled) {
console.log('Cluster mode is disabled');
} else {
console.log(clusterHealth.nodeId, clusterHealth.quorumHealthy);
console.log(clusterHealth.peers);
console.log(clusterHealth.drives);
}
```
```typescript
interface IClusterHealth {
enabled: boolean;
nodeId?: string;
quorumHealthy?: boolean;
majorityHealthy?: boolean;
peers?: IClusterPeerHealth[];
drives?: IClusterDriveHealth[];
erasure?: IClusterErasureHealth;
repairs?: IClusterRepairHealth;
}
```
- `getClusterHealth()` is served by the Rust core. The TypeScript wrapper does not infer values from static config.
- Standalone mode returns `{ enabled: false }`.
- Peer status is the local node's current view of cluster membership and heartbeats, so it is best-effort and may lag real network state.
- Drive health is based on live native probe checks on the configured local drive paths. Capacity values are cheap filesystem snapshots.
- `quorumHealthy` means the local node currently sees majority quorum and enough available placements in every erasure set to satisfy the configured write quorum.
- Repair fields expose the background healer's currently available runtime state. They are best-effort and limited to what the engine tracks today, such as whether a scan is active, the last completed run, and the last error.
## Usage with AWS SDK v3

```typescript
@@ -207,7 +432,7 @@ const files = await dir.listFiles();
## Multipart Uploads

-For files larger than 5 MB, use multipart uploads. smartstorage handles them with **streaming I/O** — parts are written directly to disk, never buffered in memory.
+For files larger than 5 MB, use multipart uploads. smartstorage handles them with **streaming I/O** — parts are written directly to disk, never buffered in memory. In cluster mode, each part is independently erasure-coded and distributed.

```typescript
import {
@@ -255,8 +480,6 @@ When `auth.enabled` is `true`, the auth pipeline works as follows:
### Setting a Bucket Policy

-Use the S3 `PutBucketPolicy` API (or any S3 client that supports it):
```typescript
import { PutBucketPolicyCommand } from '@aws-sdk/client-s3';
@@ -294,6 +517,81 @@ await client.send(new PutBucketPolicyCommand({
Deleting a bucket automatically removes its associated policy.
## Clustering Deep Dive 🔗
smartstorage can run as a distributed storage cluster where multiple nodes cooperate to store and retrieve data with built-in redundancy.
### How It Works
```
Client ──HTTP PUT──▶ Node A (coordinator)
├─ Split object into 4 MB chunks
├─ Erasure-code each chunk (4 data + 2 parity = 6 shards)
├──QUIC──▶ Node B (shard writes)
├──QUIC──▶ Node C (shard writes)
└─ Local disk (shard writes)
```
1. **Any node can coordinate** — the client connects to any cluster member
2. **Objects are chunked** — large objects split into fixed-size pieces (default 4 MB)
3. **Each chunk is erasure-coded** — Reed-Solomon produces k data + m parity shards
4. **Shards are distributed** — placed across different nodes and drives for fault isolation
5. **Quorum guarantees consistency** — writes need k+1 acks, reads need k shards
### Erasure Coding

With the default `4+2` configuration:

- Storage overhead: **50%** on top of the raw data, i.e. parity/data = m/k (vs. 200% for 3x replication)
- Fault tolerance: **any 2 drives/nodes can fail** simultaneously
- Read efficiency: only **4 of 6 shards** needed to reconstruct data

| Config | Total Shards | Overhead (m/k) | Tolerance | Min Nodes |
|--------|--------------|----------------|-----------|-----------|
| 4+2 | 6 | 50% | 2 failures | 3 |
| 6+3 | 9 | 50% | 3 failures | 5 |
| 2+1 | 3 | 50% | 1 failure | 2 |
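
All three rows follow from two small formulas: overhead is `m/k` and tolerance is `m`. A sketch:

```typescript
// Overhead and tolerance straight from the erasure parameters,
// matching the table above.
function erasureProfile(dataShards: number, parityShards: number) {
  return {
    totalShards: dataShards + parityShards,
    overheadPercent: (parityShards / dataShards) * 100, // extra bytes vs. raw data
    toleratedFailures: parityShards,
    shardsNeededToRead: dataShards,
  };
}

console.log(erasureProfile(4, 2)); // { totalShards: 6, overheadPercent: 50, ... }
```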
### QUIC Transport
Inter-node communication uses [QUIC](https://en.wikipedia.org/wiki/QUIC) via the `quinn` library:
- 🔒 **Built-in TLS** — self-signed certs auto-generated at cluster init
- 🔀 **Multiplexed streams** — concurrent shard transfers without head-of-line blocking
- ⚡ **Connection pooling** — persistent connections to peer nodes
- 🌊 **Natural backpressure** — QUIC flow control prevents overloading slow peers
### Cluster Membership
- **Static seed nodes** — initial cluster defined in config
- **Runtime join** — new nodes can join a running cluster
- **Heartbeat monitoring** — every 5s (configurable), with suspect/offline detection
- **Split-brain prevention** — nodes only mark peers offline when they have majority
### Self-Healing
A background scanner runs periodically (default: every 24h) and:
1. Checks shard checksums (CRC32C) for bit-rot detection
2. Identifies shards on offline nodes
3. Reconstructs missing shards from remaining data using Reed-Solomon
4. Places healed shards on healthy drives
Healing runs at low priority to avoid impacting foreground I/O.
### Erasure Set Formation
Drives are organized into fixed **erasure sets** at cluster initialization:
```
3 nodes × 4 drives each = 12 drives total
With 6-shard erasure sets → 2 erasure sets
Set 0: Node1-Disk0, Node2-Disk0, Node3-Disk0, Node1-Disk1, Node2-Disk1, Node3-Disk1
Set 1: Node1-Disk2, Node2-Disk2, Node3-Disk2, Node1-Disk3, Node2-Disk3, Node3-Disk3
```
Drives are interleaved across nodes for maximum fault isolation. New nodes form new erasure sets — existing data is never rebalanced.
## Testing Integration

```typescript
@@ -341,6 +639,34 @@ Gracefully stop the server and kill the Rust process.
Create a storage bucket.
#### `createBucketTenant(options): Promise<IBucketTenantDescriptor>`
Create a bucket tenant with a generated or supplied scoped credential. Options: `{ bucketName, accessKeyId?, secretAccessKey?, region? }`.
#### `deleteBucketTenant(options): Promise<void>`
Revoke a tenant credential or delete the full tenant bucket. Options: `{ bucketName, accessKeyId? }`.
#### `rotateBucketTenantCredentials(options): Promise<IBucketTenantDescriptor>`
Replace the scoped credential for a bucket tenant. Options: `{ bucketName, accessKeyId?, secretAccessKey?, region? }`.
#### `listBucketTenants(): Promise<IBucketTenantMetadata[]>`
List scoped tenant credential metadata without returning secrets.
#### `getBucketTenantDescriptor(options): Promise<IBucketTenantDescriptor>`
Return endpoint, port, region, bucket, access key, secret key, SSL flag, legacy descriptor fields, and S3/AWS env values for the bucket tenant.
#### `exportBucket(options): Promise<IBucketExport>`
Export one bucket's objects and metadata into a `smartstorage.bucket.v1` JSON object.
#### `importBucket(options): Promise<void>`
Import a `smartstorage.bucket.v1` JSON object into the target bucket. Options: `{ bucketName, source }`.
#### `getStorageDescriptor(options?): Promise<IS3Descriptor>`

Get connection details for S3-compatible clients. Returns:
@@ -353,36 +679,70 @@ Get connection details for S3-compatible clients. Returns:
| `accessSecret` | `string` | Secret key from first configured credential |
| `useSsl` | `boolean` | Always `false` (plain HTTP) |
#### `getStorageStats(): Promise<IStorageStats>`
Read cached logical bucket and object totals from the Rust runtime without issuing S3 list calls.
#### `listBucketSummaries(): Promise<IBucketSummary[]>`
Get per-bucket logical object counts and total payload sizes.
#### `listCredentials(): Promise<IStorageCredential[]>`
Return the currently active runtime credential set.
#### `replaceCredentials(credentials: IStorageCredential[]): Promise<void>`
Atomically replace the active runtime credential set without restarting the server.
#### `getClusterHealth(): Promise<IClusterHealth>`
Read the Rust core's current cluster, drive, quorum, and repair health snapshot. Standalone mode returns `{ enabled: false }`.
#### `getHealth(): Promise<ISmartStorageHealth>`
Return running state, storage directory, auth state, credential counts, bucket count, object count, total bytes, and cluster health.
#### `getMetrics(): Promise<ISmartStorageMetrics>`
Return numeric metrics plus a Prometheus text snippet for operational scraping.
## Architecture

smartstorage uses a **hybrid Rust + TypeScript** architecture:
```
-┌─────────────────────────────────┐
-│ Your Code (AWS SDK, etc.)       │
-│   ↕ HTTP (localhost:3000)       │
-├─────────────────────────────────┤
-│ ruststorage binary (Rust)       │
-│ ├─ hyper 1.x HTTP server        │
-│ ├─ S3 path-style routing        │
-│ ├─ Streaming storage layer      │
-│ ├─ Multipart manager            │
-│ ├─ SigV4 auth + policy engine   │
-│ ├─ CORS middleware              │
-│ └─ S3 XML response builder      │
-├─────────────────────────────────┤
-│ TypeScript (thin IPC wrapper)   │
-│ ├─ SmartStorage class           │
-│ ├─ RustBridge (stdin/stdout)    │
-│ └─ Config & S3 descriptor       │
-└─────────────────────────────────┘
+┌──────────────────────────────────────────────┐
+│ Your Code (AWS SDK, SmartBucket, etc.)       │
+│   ↕ HTTP (localhost:3000)                    │
+├──────────────────────────────────────────────┤
+│ ruststorage binary (Rust)                    │
+│ ├─ hyper 1.x HTTP server                     │
+│ ├─ S3 path-style routing                     │
+│ ├─ StorageBackend (Standalone or Clustered)  │
+│ │  ├─ FileStore (single-node mode)           │
+│ │  └─ DistributedStore (cluster mode)        │
+│ │     ├─ ErasureCoder (Reed-Solomon)         │
+│ │     ├─ ShardStore (per-drive storage)      │
+│ │     ├─ QuicTransport (quinn)               │
+│ │     ├─ ClusterState & Membership           │
+│ │     └─ HealingService                      │
+│ ├─ SigV4 auth + policy engine                │
+│ ├─ CORS middleware                           │
+│ └─ S3 XML response builder                   │
+├──────────────────────────────────────────────┤
+│ TypeScript (thin IPC wrapper)                │
+│ ├─ SmartStorage class                        │
+│ ├─ RustBridge (stdin/stdout JSON IPC)        │
+│ └─ Config & S3 descriptor                    │
+└──────────────────────────────────────────────┘
```
-**Why Rust?** The TypeScript implementation had critical perf issues: OOM on multipart uploads (parts buffered in memory), double stream copying, file descriptor leaks on HEAD requests, full-file reads for range requests, and no backpressure. The Rust binary solves all of these with streaming I/O, zero-copy, and direct `seek()` for range requests.
+**Why Rust?** The original TypeScript implementation had critical perf issues: OOM on multipart uploads (parts buffered in memory), double stream copying, file descriptor leaks on HEAD requests, full-file reads for range requests, and no backpressure. The Rust binary solves all of these with streaming I/O, zero-copy, and direct `seek()` for range requests.

-**IPC Protocol:** TypeScript spawns the `ruststorage` binary with `--management` and communicates via newline-delimited JSON over stdin/stdout. Commands: `start`, `stop`, `createBucket`.
+**IPC Protocol:** TypeScript communicates with the `ruststorage` binary over newline-delimited JSON via stdin/stdout. The current management commands are `start`, `stop`, `createBucket`, `createBucketTenant`, `deleteBucketTenant`, `rotateBucketTenantCredentials`, `listBucketTenants`, `getBucketTenantCredential`, `exportBucket`, `importBucket`, `getStorageStats`, `listBucketSummaries`, `listCredentials`, `replaceCredentials`, and `getClusterHealth`.

-### S3-Compatible Operations Supported
+### S3-Compatible Operations

| Operation | Method | Path |
|-----------|--------|------|
@@ -407,31 +767,45 @@ smartstorage uses a **hybrid Rust + TypeScript** architecture:
### On-Disk Format
**Standalone mode:**
```
{storage.directory}/
  {bucket}/
    {key}._storage_object                 # Object data
    {key}._storage_object.metadata.json   # Metadata (content-type, x-amz-meta-*, etc.)
    {key}._storage_object.md5             # Cached MD5 hash
  .multipart/
    {upload-id}/
-      metadata.json        # Upload metadata (bucket, key, parts)
+      metadata.json        # Upload metadata
-      part-1               # Part data files
-      part-2
-      ...
+      part-1, part-2, ...  # Part data files
  .policies/
    {bucket}.policy.json    # Bucket policy (IAM JSON format)
```
**Cluster mode:**
```
{drive_path}/.smartstorage/
format.json # Drive metadata (cluster ID, erasure set)
data/{bucket}/{key_hash}/{key}/
chunk-{N}/shard-{M}.dat # Erasure-coded shard data
chunk-{N}/shard-{M}.meta # Shard metadata (checksum, size)
{storage.directory}/
.manifests/{bucket}/
{key}.manifest.json # Object manifest (shard placements, checksums)
.buckets/{bucket}/ # Bucket metadata
.policies/{bucket}.policy.json # Bucket policies
``` ```
## Related Packages

- [`@push.rocks/smartbucket`](https://code.foss.global/push.rocks/smartbucket) — High-level S3-compatible abstraction layer
-- [`@push.rocks/smartrust`](https://code.foss.global/push.rocks/smartrust) — TypeScript <-> Rust IPC bridge
+- [`@push.rocks/smartrust`](https://code.foss.global/push.rocks/smartrust) — TypeScript ↔ Rust IPC bridge
- [`@git.zone/tsrust`](https://code.foss.global/git.zone/tsrust) — Rust cross-compilation for npm packages
## License and Legal Information

-This repository contains open-source code licensed under the MIT License. A copy of the license can be found in the [LICENSE](./LICENSE) file.
+This repository contains open-source code licensed under the MIT License. A copy of the license can be found in the [license](./license) file.

**Please note:** The MIT License does not grant permission to use the trade names, trademarks, service marks, or product names of the project, except as required for reasonable and customary use in describing the origin of the work and reproducing the content of the NOTICE file.
+959 -19
File diff suppressed because it is too large.
+11
@@ -28,6 +28,17 @@ percent-encoding = "2"
url = "2" url = "2"
chrono = { version = "0.4", features = ["serde"] } chrono = { version = "0.4", features = ["serde"] }
futures-core = "0.3" futures-core = "0.3"
futures = "0.3"
async-trait = "0.1"
reed-solomon-erasure = { version = "6", features = ["simd-accel"] }
xxhash-rust = { version = "0.8", features = ["xxh64"] }
crc32c = "0.6"
bincode = "1"
quinn = "0.11"
rustls = { version = "0.23", default-features = false, features = ["ring", "std"] }
rcgen = "0.13"
dashmap = "6"
hmac = "0.12" hmac = "0.12"
sha2 = "0.10" sha2 = "0.10"
hex = "0.4" hex = "0.4"
libc = "0.2"
+17
@@ -57,6 +57,7 @@ pub struct RequestContext {
pub action: StorageAction,
pub bucket: Option<String>,
pub key: Option<String>,
pub source_bucket: Option<String>,
}

impl RequestContext {
@@ -90,6 +91,7 @@ pub fn resolve_action(req: &Request<Incoming>) -> RequestContext {
action: StorageAction::ListAllMyBuckets,
bucket: None,
key: None,
source_bucket: None,
}
}
1 => {
@@ -113,6 +115,7 @@ pub fn resolve_action(req: &Request<Incoming>) -> RequestContext {
action,
bucket: Some(bucket),
key: None,
source_bucket: None,
}
}
2 => {
@@ -123,6 +126,18 @@ pub fn resolve_action(req: &Request<Incoming>) -> RequestContext {
let has_part_number = query.contains_key("partNumber");
let has_upload_id = query.contains_key("uploadId");
let has_uploads = query.contains_key("uploads");
let source_bucket = if has_copy_source {
req.headers()
.get("x-amz-copy-source")
.and_then(|value| value.to_str().ok())
.map(|source| {
let source = source.trim_start_matches('/');
let first_slash = source.find('/').unwrap_or(source.len());
percent_decode(&source[..first_slash])
})
} else {
None
};
let action = match &method {
&Method::PUT if has_part_number && has_upload_id => StorageAction::UploadPart,
@@ -141,12 +156,14 @@ pub fn resolve_action(req: &Request<Incoming>) -> RequestContext {
action,
bucket: Some(bucket),
key: Some(key),
source_bucket,
}
}
_ => RequestContext {
action: StorageAction::ListAllMyBuckets,
bucket: None,
key: None,
source_bucket: None,
},
}
}
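The `source_bucket` extraction added above parses the `x-amz-copy-source` header, which S3 clients send as `/{bucket}/{key}`. A minimal standalone sketch of the same slicing logic; the real code additionally percent-decodes the bucket via a `percent_decode` helper, which this sketch omits:

```rust
/// Extract the source bucket from an `x-amz-copy-source` header value:
/// strip a leading slash, then take everything up to the next slash.
fn source_bucket_from_header(value: &str) -> Option<String> {
    let value = value.trim_start_matches('/');
    let first_slash = value.find('/').unwrap_or(value.len());
    let bucket = &value[..first_slash];
    if bucket.is_empty() {
        None
    } else {
        Some(bucket.to_string())
    }
}

fn main() {
    assert_eq!(
        source_bucket_from_header("/source-bucket/path/to/key"),
        Some("source-bucket".to_string())
    );
    assert_eq!(
        source_bucket_from_header("source-bucket/key"),
        Some("source-bucket".to_string())
    );
    assert_eq!(source_bucket_from_header("/"), None);
}
```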
+258 -22
@@ -2,9 +2,12 @@ use hmac::{Hmac, Mac};
use hyper::body::Incoming;
use hyper::Request;
use sha2::{Digest, Sha256};
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use tokio::fs;
use tokio::sync::RwLock;
use crate::config::{AuthConfig, Credential};
use crate::error::StorageError;
type HmacSha256 = Hmac<Sha256>;
@@ -13,6 +16,7 @@ type HmacSha256 = Hmac<Sha256>;
#[derive(Debug, Clone)]
pub struct AuthenticatedIdentity {
pub access_key_id: String,
pub bucket_name: Option<String>,
}
/// Parsed components of an AWS4-HMAC-SHA256 Authorization header.
@@ -27,7 +31,7 @@ struct SigV4Header {
/// Verify the request's SigV4 signature. Returns the caller identity on success.
pub fn verify_request(
req: &Request<Incoming>,
credentials: &[Credential],
) -> Result<AuthenticatedIdentity, StorageError> {
let auth_header = req
.headers()
@@ -47,7 +51,7 @@ pub fn verify_request(
let parsed = parse_auth_header(auth_header)?;
// Look up credential
let credential = find_credential(&parsed.access_key_id, credentials)
.ok_or_else(StorageError::invalid_access_key_id)?;
// Get x-amz-date
@@ -55,11 +59,7 @@ pub fn verify_request(
.headers()
.get("x-amz-date")
.and_then(|v| v.to_str().ok())
.or_else(|| req.headers().get("date").and_then(|v| v.to_str().ok()))
.ok_or_else(|| StorageError::missing_security_header("Missing x-amz-date header"))?;
// Enforce 15-min clock skew
@@ -76,10 +76,7 @@ pub fn verify_request(
let canonical_request = build_canonical_request(req, &parsed.signed_headers, content_sha256);
// Build string to sign
let scope = format!("{}/{}/s3/aws4_request", parsed.date_stamp, parsed.region);
let canonical_hash = hex::encode(Sha256::digest(canonical_request.as_bytes()));
let string_to_sign = format!(
"AWS4-HMAC-SHA256\n{}\n{}\n{}",
@@ -104,6 +101,7 @@ pub fn verify_request(
Ok(AuthenticatedIdentity {
access_key_id: parsed.access_key_id,
bucket_name: credential.bucket_name.clone(),
})
}
@@ -130,10 +128,9 @@ fn parse_auth_header(header: &str) -> Result<SigV4Header, StorageError> {
}
}
let credential_str = credential_str.ok_or_else(StorageError::authorization_header_malformed)?;
let signed_headers_str =
signed_headers_str.ok_or_else(StorageError::authorization_header_malformed)?;
let signature = signature_str
.ok_or_else(StorageError::authorization_header_malformed)?
.to_string();
@@ -163,21 +160,260 @@ fn parse_auth_header(header: &str) -> Result<SigV4Header, StorageError> {
}
/// Find a credential by access key ID.
fn find_credential<'a>(
access_key_id: &str,
credentials: &'a [Credential],
) -> Option<&'a Credential> {
credentials
.iter()
.find(|c| c.access_key_id == access_key_id)
}
#[derive(Debug)]
pub struct RuntimeCredentialStore {
enabled: bool,
credentials: RwLock<Vec<Credential>>,
persistence_path: Option<PathBuf>,
}
#[derive(Debug, Clone, serde::Serialize)]
#[serde(rename_all = "camelCase")]
pub struct CredentialMetadata {
pub access_key_id: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub bucket_name: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub region: Option<String>,
}
#[derive(Debug, Clone, serde::Serialize)]
#[serde(rename_all = "camelCase")]
pub struct BucketTenantMetadata {
pub bucket_name: String,
pub access_key_id: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub region: Option<String>,
}
impl RuntimeCredentialStore {
pub async fn new(
config: &AuthConfig,
persistence_path: Option<PathBuf>,
) -> anyhow::Result<Self> {
let credentials = match persistence_path.as_ref() {
Some(path) if path.exists() => {
let content = fs::read_to_string(path).await?;
let credentials: Vec<Credential> = serde_json::from_str(&content)?;
validate_credentials(&credentials)
.map_err(|error| anyhow::anyhow!(error.message))?;
credentials
}
_ => config.credentials.clone(),
};
Ok(Self {
enabled: config.enabled,
credentials: RwLock::new(credentials),
persistence_path,
})
}
pub fn enabled(&self) -> bool {
self.enabled
}
pub async fn list_credentials(&self) -> Vec<CredentialMetadata> {
self.credentials
.read()
.await
.iter()
.map(|credential| CredentialMetadata {
access_key_id: credential.access_key_id.clone(),
bucket_name: credential.bucket_name.clone(),
region: credential.region.clone(),
})
.collect()
}
pub async fn snapshot_credentials(&self) -> Vec<Credential> {
self.credentials.read().await.clone()
}
pub async fn replace_credentials(
&self,
credentials: Vec<Credential>,
) -> Result<(), StorageError> {
validate_credentials(&credentials)?;
self.persist_credentials(&credentials).await?;
*self.credentials.write().await = credentials;
Ok(())
}
pub async fn replace_bucket_tenant_credential(
&self,
bucket_name: &str,
mut credential: Credential,
) -> Result<Credential, StorageError> {
validate_bucket_scope(bucket_name)?;
credential.bucket_name = Some(bucket_name.to_string());
let mut credentials = self.credentials.read().await.clone();
if credentials.iter().any(|existing| {
existing.access_key_id == credential.access_key_id
&& existing.bucket_name.as_deref() != Some(bucket_name)
}) {
return Err(StorageError::invalid_request(
"Credential accessKeyId is already assigned to another principal.",
));
}
credentials.retain(|existing| existing.bucket_name.as_deref() != Some(bucket_name));
credentials.push(credential.clone());
validate_credentials(&credentials)?;
self.persist_credentials(&credentials).await?;
*self.credentials.write().await = credentials;
Ok(credential)
}
pub async fn remove_bucket_tenant_credentials(
&self,
bucket_name: &str,
access_key_id: Option<&str>,
) -> Result<usize, StorageError> {
validate_bucket_scope(bucket_name)?;
let mut credentials = self.credentials.read().await.clone();
let before = credentials.len();
credentials.retain(|credential| {
if credential.bucket_name.as_deref() != Some(bucket_name) {
return true;
}
if let Some(access_key_id) = access_key_id {
credential.access_key_id != access_key_id
} else {
false
}
});
let removed = before.saturating_sub(credentials.len());
if credentials.is_empty() {
return Err(StorageError::invalid_request(
"Cannot remove the last active credential.",
));
}
self.persist_credentials(&credentials).await?;
*self.credentials.write().await = credentials;
Ok(removed)
}
pub async fn list_bucket_tenants(&self) -> Vec<BucketTenantMetadata> {
let mut tenants: Vec<BucketTenantMetadata> = self
.credentials
.read()
.await
.iter()
.filter_map(|credential| {
credential
.bucket_name
.as_ref()
.map(|bucket_name| BucketTenantMetadata {
bucket_name: bucket_name.clone(),
access_key_id: credential.access_key_id.clone(),
region: credential.region.clone(),
})
})
.collect();
tenants.sort_by(|a, b| {
a.bucket_name
.cmp(&b.bucket_name)
.then_with(|| a.access_key_id.cmp(&b.access_key_id))
});
tenants
}
pub async fn get_bucket_tenant_credential(&self, bucket_name: &str) -> Option<Credential> {
self.credentials
.read()
.await
.iter()
.find(|credential| credential.bucket_name.as_deref() == Some(bucket_name))
.cloned()
}
async fn persist_credentials(&self, credentials: &[Credential]) -> Result<(), StorageError> {
let Some(path) = self.persistence_path.as_ref() else {
return Ok(());
};
if let Some(parent) = path.parent() {
fs::create_dir_all(parent)
.await
.map_err(|error| StorageError::internal_error(&error.to_string()))?;
}
let temp_path = path.with_extension("json.tmp");
let json = serde_json::to_string_pretty(credentials)
.map_err(|error| StorageError::internal_error(&error.to_string()))?;
fs::write(&temp_path, json)
.await
.map_err(|error| StorageError::internal_error(&error.to_string()))?;
fs::rename(&temp_path, path)
.await
.map_err(|error| StorageError::internal_error(&error.to_string()))?;
Ok(())
}
}
fn validate_bucket_scope(bucket_name: &str) -> Result<(), StorageError> {
if bucket_name.trim().is_empty() {
return Err(StorageError::invalid_request(
"Bucket tenant bucketName must not be empty.",
));
}
Ok(())
}
fn validate_credentials(credentials: &[Credential]) -> Result<(), StorageError> {
if credentials.is_empty() {
return Err(StorageError::invalid_request(
"Credential replacement requires at least one credential.",
));
}
let mut seen_access_keys = HashSet::new();
for credential in credentials {
if credential.access_key_id.trim().is_empty() {
return Err(StorageError::invalid_request(
"Credential accessKeyId must not be empty.",
));
}
if credential.secret_access_key.trim().is_empty() {
return Err(StorageError::invalid_request(
"Credential secretAccessKey must not be empty.",
));
}
if !seen_access_keys.insert(credential.access_key_id.as_str()) {
return Err(StorageError::invalid_request(
"Credential accessKeyId values must be unique.",
));
}
}
Ok(())
}
/// Check clock skew (15 minutes max).
fn check_clock_skew(amz_date: &str) -> Result<(), StorageError> {
// Parse ISO 8601 basic format: YYYYMMDDTHHMMSSZ
let parsed = chrono::NaiveDateTime::parse_from_str(amz_date, "%Y%m%dT%H%M%SZ")
.map_err(|_| StorageError::authorization_header_malformed())?;
let request_time =
chrono::DateTime::<chrono::Utc>::from_naive_utc_and_offset(parsed, chrono::Utc);
let now = chrono::Utc::now();
let diff = (now - request_time).num_seconds().unsigned_abs();
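`persist_credentials` above uses a write-temp-then-rename pattern so a crash mid-write can never leave a truncated credentials file behind. A minimal standalone sketch of that pattern, assuming tokio (fs and macros features) and serde_json; the `Credential` struct and file path here are simplified stand-ins for the real config types:

```rust
use serde::Serialize;
use std::path::Path;
use tokio::fs;

#[derive(Serialize)]
struct Credential {
    access_key_id: String,
    secret_access_key: String,
    bucket_name: Option<String>,
}

async fn persist_atomically(path: &Path, credentials: &[Credential]) -> std::io::Result<()> {
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent).await?;
    }
    let json = serde_json::to_string_pretty(credentials).expect("serializable");
    // Write to a sibling temp file first, then rename over the target;
    // the rename is atomic on POSIX filesystems.
    let temp_path = path.with_extension("json.tmp");
    fs::write(&temp_path, json).await?;
    fs::rename(&temp_path, path).await
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let creds = vec![Credential {
        access_key_id: "tenantKey".into(),
        secret_access_key: "tenantSecret".into(),
        bucket_name: Some("tenant-bucket".into()),
    }];
    persist_atomically(Path::new("/tmp/credentials.json"), &creds).await
}
```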
+95
@@ -0,0 +1,95 @@
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ClusterConfig {
pub enabled: bool,
#[serde(default)]
pub node_id: Option<String>,
#[serde(default = "default_quic_port")]
pub quic_port: u16,
#[serde(default)]
pub seed_nodes: Vec<String>,
#[serde(default)]
pub erasure: ErasureConfig,
#[serde(default)]
pub drives: DriveConfig,
#[serde(default = "default_heartbeat_interval")]
pub heartbeat_interval_ms: u64,
#[serde(default = "default_heartbeat_timeout")]
pub heartbeat_timeout_ms: u64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ErasureConfig {
#[serde(default = "default_data_shards")]
pub data_shards: usize,
#[serde(default = "default_parity_shards")]
pub parity_shards: usize,
#[serde(default = "default_chunk_size")]
pub chunk_size_bytes: usize,
}
impl ErasureConfig {
pub fn total_shards(&self) -> usize {
self.data_shards + self.parity_shards
}
/// Minimum shards needed for a write to succeed (data_shards + 1)
pub fn write_quorum(&self) -> usize {
self.data_shards + 1
}
/// Minimum shards needed to reconstruct data
pub fn read_quorum(&self) -> usize {
self.data_shards
}
}
impl Default for ErasureConfig {
fn default() -> Self {
Self {
data_shards: default_data_shards(),
parity_shards: default_parity_shards(),
chunk_size_bytes: default_chunk_size(),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct DriveConfig {
#[serde(default)]
pub paths: Vec<String>,
}
impl Default for DriveConfig {
fn default() -> Self {
Self { paths: Vec::new() }
}
}
fn default_quic_port() -> u16 {
4000
}
fn default_heartbeat_interval() -> u64 {
5000
}
fn default_heartbeat_timeout() -> u64 {
30000
}
fn default_data_shards() -> usize {
4
}
fn default_parity_shards() -> usize {
2
}
fn default_chunk_size() -> usize {
4 * 1024 * 1024 // 4 MB
}
File diff suppressed because it is too large
+286
@@ -0,0 +1,286 @@
use super::config::DriveConfig;
use anyhow::Result;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use tokio::fs;
// ============================
// Drive format (on-disk metadata)
// ============================
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct DriveFormat {
pub cluster_id: String,
pub erasure_set_id: u32,
pub drive_index_in_set: u32,
pub format_version: u32,
}
// ============================
// Drive state tracking
// ============================
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DriveStatus {
Online,
Degraded,
Offline,
Healing,
}
#[derive(Debug, Clone)]
pub struct DriveStats {
pub total_bytes: u64,
pub available_bytes: u64,
pub used_bytes: u64,
pub avg_write_latency_us: u64,
pub avg_read_latency_us: u64,
pub error_count: u64,
pub last_error: Option<String>,
pub last_check: DateTime<Utc>,
}
impl Default for DriveStats {
fn default() -> Self {
Self {
total_bytes: 0,
available_bytes: 0,
used_bytes: 0,
avg_write_latency_us: 0,
avg_read_latency_us: 0,
error_count: 0,
last_error: None,
last_check: Utc::now(),
}
}
}
#[derive(Debug, Clone)]
pub struct DriveState {
pub path: PathBuf,
pub format: Option<DriveFormat>,
pub status: DriveStatus,
pub stats: DriveStats,
}
// ============================
// Drive manager
// ============================
pub struct DriveManager {
drives: Vec<DriveState>,
}
impl DriveManager {
/// Initialize drive manager with configured drive paths.
pub async fn new(config: &DriveConfig) -> Result<Self> {
let paths: Vec<PathBuf> = config.paths.iter().map(PathBuf::from).collect();
Self::from_paths(&paths).await
}
/// Initialize drive manager from an explicit list of resolved paths.
pub async fn from_paths(paths: &[PathBuf]) -> Result<Self> {
let mut drives = Vec::with_capacity(paths.len());
for path in paths {
let storage_dir = path.join(".smartstorage");
// Ensure the drive directory exists
fs::create_dir_all(&storage_dir).await?;
// Try to read existing format
let format = Self::read_format(&storage_dir).await;
let status = if path.exists() {
DriveStatus::Online
} else {
DriveStatus::Offline
};
drives.push(DriveState {
path: path.clone(),
format,
status,
stats: DriveStats::default(),
});
}
Ok(Self { drives })
}
/// Format drives for a new cluster. Stamps each drive with cluster and erasure set info.
pub async fn format_drives(
&mut self,
cluster_id: &str,
erasure_set_assignments: &[(u32, u32)], // (erasure_set_id, drive_index_in_set)
) -> Result<()> {
if erasure_set_assignments.len() != self.drives.len() {
anyhow::bail!(
"Erasure set assignments count ({}) doesn't match drive count ({})",
erasure_set_assignments.len(),
self.drives.len()
);
}
for (drive, (set_id, drive_idx)) in
self.drives.iter_mut().zip(erasure_set_assignments.iter())
{
let format = DriveFormat {
cluster_id: cluster_id.to_string(),
erasure_set_id: *set_id,
drive_index_in_set: *drive_idx,
format_version: 1,
};
let storage_dir = drive.path.join(".smartstorage");
fs::create_dir_all(&storage_dir).await?;
let format_path = storage_dir.join("format.json");
let json = serde_json::to_string_pretty(&format)?;
fs::write(&format_path, json).await?;
drive.format = Some(format);
}
Ok(())
}
/// Get the number of drives managed.
pub fn drive_count(&self) -> usize {
self.drives.len()
}
/// Get a drive's state by index.
pub fn drive(&self, index: usize) -> Option<&DriveState> {
self.drives.get(index)
}
/// Get all drives.
pub fn drives(&self) -> &[DriveState] {
&self.drives
}
/// Get a cloneable snapshot of current drive states.
pub fn snapshot(&self) -> Vec<DriveState> {
self.drives.clone()
}
/// Get drives that are online.
pub fn online_drives(&self) -> Vec<usize> {
self.drives
.iter()
.enumerate()
.filter(|(_, d)| d.status == DriveStatus::Online)
.map(|(i, _)| i)
.collect()
}
/// Check health of a specific drive by writing and reading a probe file.
pub async fn check_drive_health(&mut self, index: usize) -> Result<DriveStatus> {
let drive = self
.drives
.get_mut(index)
.ok_or_else(|| anyhow::anyhow!("Drive index {} out of range", index))?;
let probe_path = drive.path.join(".smartstorage").join(".health_probe");
let start = std::time::Instant::now();
// Write probe
match fs::write(&probe_path, b"health_check").await {
Ok(()) => {}
Err(e) => {
drive.stats.error_count += 1;
drive.stats.last_error = Some(e.to_string());
drive.status = DriveStatus::Offline;
drive.stats.last_check = Utc::now();
return Ok(DriveStatus::Offline);
}
}
// Read probe
match fs::read(&probe_path).await {
Ok(_) => {}
Err(e) => {
drive.stats.error_count += 1;
drive.stats.last_error = Some(e.to_string());
drive.status = DriveStatus::Offline;
drive.stats.last_check = Utc::now();
return Ok(DriveStatus::Offline);
}
}
// Clean up probe
let _ = fs::remove_file(&probe_path).await;
let latency = start.elapsed();
if let Some((total_bytes, available_bytes, used_bytes)) = filesystem_usage(&drive.path) {
drive.stats.total_bytes = total_bytes;
drive.stats.available_bytes = available_bytes;
drive.stats.used_bytes = used_bytes;
}
drive.stats.avg_write_latency_us = latency.as_micros() as u64;
drive.stats.last_check = Utc::now();
// Mark degraded if latency is too high (>5 seconds)
if latency.as_secs() > 5 {
drive.status = DriveStatus::Degraded;
} else {
drive.status = DriveStatus::Online;
}
Ok(drive.status.clone())
}
/// Run health checks on all drives.
pub async fn check_all_drives(&mut self) -> Vec<(usize, DriveStatus)> {
let mut results = Vec::new();
let count = self.drives.len();
for i in 0..count {
match self.check_drive_health(i).await {
Ok(status) => results.push((i, status)),
Err(e) => {
tracing::error!(drive = i, error = %e, "Drive health check failed");
results.push((i, DriveStatus::Offline));
}
}
}
results
}
// Internal helpers
async fn read_format(storage_dir: &Path) -> Option<DriveFormat> {
let format_path = storage_dir.join("format.json");
let content = fs::read_to_string(&format_path).await.ok()?;
serde_json::from_str(&content).ok()
}
}
#[cfg(unix)]
fn filesystem_usage(path: &Path) -> Option<(u64, u64, u64)> {
use std::ffi::CString;
use std::os::unix::ffi::OsStrExt;
let path_bytes = path.as_os_str().as_bytes();
let c_path = CString::new(path_bytes).ok()?;
let mut stat: libc::statvfs = unsafe { std::mem::zeroed() };
if unsafe { libc::statvfs(c_path.as_ptr(), &mut stat) } != 0 {
return None;
}
let block_size = stat.f_frsize as u64;
let total_bytes = stat.f_blocks as u64 * block_size;
let available_bytes = stat.f_bavail as u64 * block_size;
let free_bytes = stat.f_bfree as u64 * block_size;
let used_bytes = total_bytes.saturating_sub(free_bytes);
Some((total_bytes, available_bytes, used_bytes))
}
#[cfg(not(unix))]
fn filesystem_usage(_path: &Path) -> Option<(u64, u64, u64)> {
None
}
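`check_drive_health` above probes each drive with a write/read round trip and derives status from errors and latency. A minimal sketch of the same pattern against a temp directory instead of a real mount, assuming tokio with fs and macros features; the 5-second degradation cutoff mirrors the code above:

```rust
use std::time::{Duration, Instant};
use tokio::fs;

#[derive(Debug)]
enum Status { Online, Degraded, Offline }

async fn probe(dir: &std::path::Path) -> Status {
    let probe_path = dir.join(".health_probe");
    let start = Instant::now();
    // Any write or read failure marks the location offline.
    if fs::write(&probe_path, b"health_check").await.is_err() {
        return Status::Offline;
    }
    if fs::read(&probe_path).await.is_err() {
        return Status::Offline;
    }
    let _ = fs::remove_file(&probe_path).await;
    // Slow but successful probes indicate a degraded drive.
    if start.elapsed() > Duration::from_secs(5) { Status::Degraded } else { Status::Online }
}

#[tokio::main]
async fn main() {
    let dir = std::env::temp_dir();
    println!("probe status: {:?}", probe(&dir).await);
}
```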
+246
@@ -0,0 +1,246 @@
use anyhow::Result;
use reed_solomon_erasure::galois_8::ReedSolomon;
use super::config::ErasureConfig;
/// Erasure coder that splits data into data+parity shards using Reed-Solomon.
///
/// Objects are processed in fixed-size chunks (stripes). Each chunk is independently
/// erasure-coded, enabling streaming encode/decode without buffering entire objects.
pub struct ErasureCoder {
rs: ReedSolomon,
config: ErasureConfig,
}
impl ErasureCoder {
pub fn new(config: &ErasureConfig) -> Result<Self> {
let rs = ReedSolomon::new(config.data_shards, config.parity_shards)
.map_err(|e| anyhow::anyhow!("Failed to create Reed-Solomon encoder: {:?}", e))?;
Ok(Self {
rs,
config: config.clone(),
})
}
pub fn config(&self) -> &ErasureConfig {
&self.config
}
/// Encode a single chunk of data into data+parity shards.
///
/// The input data is split into `data_shards` equal-size pieces (padded if needed),
/// then `parity_shards` parity pieces are computed.
///
/// Returns a Vec of length `data_shards + parity_shards`, where:
/// - indices 0..data_shards are data shards
/// - indices data_shards..total are parity shards
pub fn encode_chunk(&self, data: &[u8]) -> Result<Vec<Vec<u8>>> {
let k = self.config.data_shards;
let m = self.config.parity_shards;
// Compute shard size: each data shard holds ceil(data_len / k) bytes
let shard_size = (data.len() + k - 1) / k;
if shard_size == 0 {
anyhow::bail!("Cannot encode empty data");
}
// Pad input to fill exactly k shards
let mut padded = data.to_vec();
padded.resize(shard_size * k, 0);
// Split into k data shards
let mut shards: Vec<Vec<u8>> = padded.chunks(shard_size).map(|c| c.to_vec()).collect();
// Add m empty parity shards
for _ in 0..m {
shards.push(vec![0u8; shard_size]);
}
// Compute parity in-place
self.rs
.encode(&mut shards)
.map_err(|e| anyhow::anyhow!("Reed-Solomon encoding failed: {:?}", e))?;
Ok(shards)
}
/// Decode (reconstruct) original data from a partial set of shards.
///
/// `shards` must have length == total_shards (data + parity).
/// At least `data_shards` entries must be `Some`. Missing shards are `None`.
/// `original_size` is the original data size before padding, used to truncate.
///
/// Returns the reconstructed original data.
pub fn decode_chunk(
&self,
shards: &mut Vec<Option<Vec<u8>>>,
original_size: usize,
) -> Result<Vec<u8>> {
let k = self.config.data_shards;
let total = self.config.total_shards();
if shards.len() != total {
anyhow::bail!(
"Expected {} shards, got {}",
total,
shards.len()
);
}
let available = shards.iter().filter(|s| s.is_some()).count();
if available < k {
anyhow::bail!(
"Need at least {} shards for reconstruction, only {} available",
k,
available
);
}
// Reconstruct missing shards
self.rs
.reconstruct(shards)
.map_err(|e| anyhow::anyhow!("Reed-Solomon reconstruction failed: {:?}", e))?;
// Concatenate data shards (first k) and truncate to original size
let mut result = Vec::with_capacity(original_size);
for i in 0..k {
if let Some(ref shard) = shards[i] {
result.extend_from_slice(shard);
} else {
anyhow::bail!("Data shard {} missing after reconstruction", i);
}
}
result.truncate(original_size);
Ok(result)
}
/// Verify that all shards are consistent (no corruption).
pub fn verify(&self, shards: &[Vec<u8>]) -> Result<bool> {
let shard_refs: Vec<&[u8]> = shards.iter().map(|s| s.as_slice()).collect();
self.rs
.verify(&shard_refs)
.map_err(|e| anyhow::anyhow!("Reed-Solomon verification failed: {:?}", e))
}
}
#[cfg(test)]
mod tests {
use super::*;
fn test_config() -> ErasureConfig {
ErasureConfig {
data_shards: 4,
parity_shards: 2,
chunk_size_bytes: 4 * 1024 * 1024,
}
}
#[test]
fn test_encode_decode_roundtrip() {
let coder = ErasureCoder::new(&test_config()).unwrap();
let original = b"Hello, erasure coding! This is a test of the Reed-Solomon implementation.";
let shards = coder.encode_chunk(original).unwrap();
assert_eq!(shards.len(), 6); // 4 data + 2 parity
// All shards should be the same size
let shard_size = shards[0].len();
for s in &shards {
assert_eq!(s.len(), shard_size);
}
// Reconstruct with all shards present
let mut shard_opts: Vec<Option<Vec<u8>>> = shards.iter().map(|s| Some(s.clone())).collect();
let recovered = coder.decode_chunk(&mut shard_opts, original.len()).unwrap();
assert_eq!(&recovered, original);
}
#[test]
fn test_decode_with_missing_shards() {
let coder = ErasureCoder::new(&test_config()).unwrap();
let original = b"Testing reconstruction with missing shards - this should work with 4 of 6.";
let shards = coder.encode_chunk(original).unwrap();
// Remove 2 shards (the maximum we can tolerate with 2 parity)
let mut shard_opts: Vec<Option<Vec<u8>>> = shards.iter().map(|s| Some(s.clone())).collect();
shard_opts[1] = None; // Remove data shard 1
shard_opts[4] = None; // Remove parity shard 0
let recovered = coder.decode_chunk(&mut shard_opts, original.len()).unwrap();
assert_eq!(&recovered, original);
}
#[test]
fn test_decode_with_too_many_missing() {
let coder = ErasureCoder::new(&test_config()).unwrap();
let original = b"This should fail with 3 missing shards.";
let shards = coder.encode_chunk(original).unwrap();
// Remove 3 shards (more than parity count of 2)
let mut shard_opts: Vec<Option<Vec<u8>>> = shards.iter().map(|s| Some(s.clone())).collect();
shard_opts[0] = None;
shard_opts[2] = None;
shard_opts[5] = None;
let result = coder.decode_chunk(&mut shard_opts, original.len());
assert!(result.is_err());
}
#[test]
fn test_encode_large_data() {
let coder = ErasureCoder::new(&test_config()).unwrap();
// 1 MB of data
let original: Vec<u8> = (0..1_000_000).map(|i| (i % 256) as u8).collect();
let shards = coder.encode_chunk(&original).unwrap();
assert_eq!(shards.len(), 6);
// Each shard should be ~250KB (1MB / 4 data shards, rounded up)
let expected_shard_size = (original.len() + 3) / 4;
assert_eq!(shards[0].len(), expected_shard_size);
// Verify roundtrip
let mut shard_opts: Vec<Option<Vec<u8>>> = shards.iter().map(|s| Some(s.clone())).collect();
let recovered = coder.decode_chunk(&mut shard_opts, original.len()).unwrap();
assert_eq!(recovered, original);
}
#[test]
fn test_verify_shards() {
let coder = ErasureCoder::new(&test_config()).unwrap();
let original = b"Verify test data";
let shards = coder.encode_chunk(original).unwrap();
assert!(coder.verify(&shards).unwrap());
// Corrupt a shard
let mut corrupted = shards.clone();
corrupted[0][0] ^= 0xFF;
assert!(!coder.verify(&corrupted).unwrap());
}
#[test]
fn test_small_config() {
// Minimum viable: 2 data + 1 parity
let config = ErasureConfig {
data_shards: 2,
parity_shards: 1,
chunk_size_bytes: 1024,
};
let coder = ErasureCoder::new(&config).unwrap();
let original = b"Small config test";
let shards = coder.encode_chunk(original).unwrap();
assert_eq!(shards.len(), 3);
// Remove 1 shard
let mut shard_opts: Vec<Option<Vec<u8>>> = shards.iter().map(|s| Some(s.clone())).collect();
shard_opts[0] = None;
let recovered = coder.decode_chunk(&mut shard_opts, original.len()).unwrap();
assert_eq!(&recovered, original);
}
}
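Objects are striped into fixed-size chunks and each chunk is erasure-coded independently, which is what allows streaming without buffering whole objects. A sketch of that striping loop using the `reed-solomon-erasure` dependency added in Cargo.toml above, with the chunk size shrunk to 8 bytes so the striping is visible (the real default is 4 MB):

```rust
use reed_solomon_erasure::galois_8::ReedSolomon;

fn main() -> Result<(), reed_solomon_erasure::Error> {
    let (k, m) = (4usize, 2usize);
    let rs = ReedSolomon::new(k, m)?;
    let payload = b"streaming erasure coding processes one chunk at a time";
    let chunk_size = 8usize;

    for (chunk_index, chunk) in payload.chunks(chunk_size).enumerate() {
        // Pad the chunk so it splits into k equal-size shards.
        let shard_size = (chunk.len() + k - 1) / k;
        let mut padded = chunk.to_vec();
        padded.resize(shard_size * k, 0);
        let mut shards: Vec<Vec<u8>> =
            padded.chunks(shard_size).map(|c| c.to_vec()).collect();
        // Append m empty parity shards; encode fills them in place.
        shards.extend(std::iter::repeat(vec![0u8; shard_size]).take(m));
        rs.encode(&mut shards)?;
        println!("chunk {chunk_index}: {} shards of {} bytes", shards.len(), shard_size);
    }
    Ok(())
}
```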
+416
@@ -0,0 +1,416 @@
use anyhow::Result;
use chrono::{DateTime, Utc};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use tokio::fs;
use tokio::sync::RwLock;
use super::config::ErasureConfig;
use super::erasure::ErasureCoder;
use super::metadata::ObjectManifest;
use super::shard_store::{ShardId, ShardStore};
use super::state::ClusterState;
/// Background healing service that scans for under-replicated shards
/// and reconstructs them.
pub struct HealingService {
state: Arc<ClusterState>,
erasure_coder: ErasureCoder,
local_shard_stores: Vec<Arc<ShardStore>>,
manifest_dir: PathBuf,
scan_interval: Duration,
runtime_state: Arc<RwLock<HealingRuntimeState>>,
}
impl HealingService {
pub fn new(
state: Arc<ClusterState>,
erasure_config: &ErasureConfig,
local_shard_stores: Vec<Arc<ShardStore>>,
manifest_dir: PathBuf,
scan_interval_hours: u64,
runtime_state: Arc<RwLock<HealingRuntimeState>>,
) -> Result<Self> {
let scan_interval = Duration::from_secs(scan_interval_hours * 3600);
if let Ok(mut state_guard) = runtime_state.try_write() {
state_guard.scan_interval_ms = scan_interval.as_millis() as u64;
}
Ok(Self {
state,
erasure_coder: ErasureCoder::new(erasure_config)?,
local_shard_stores,
manifest_dir,
scan_interval,
runtime_state,
})
}
pub fn runtime_state(&self) -> Arc<RwLock<HealingRuntimeState>> {
self.runtime_state.clone()
}
/// Run the healing loop as a background task.
pub async fn run(&self, mut shutdown: tokio::sync::watch::Receiver<bool>) {
let mut interval = tokio::time::interval(self.scan_interval);
// Skip the first immediate tick
interval.tick().await;
loop {
tokio::select! {
_ = interval.tick() => {
let started_at = Utc::now();
self.mark_healing_started(started_at).await;
tracing::info!("Starting healing scan");
match self.heal_scan().await {
Ok(stats) => {
self.mark_healing_finished(started_at, Some(stats.clone()), None).await;
tracing::info!(
checked = stats.shards_checked,
healed = stats.shards_healed,
errors = stats.errors,
"Healing scan completed"
);
}
Err(e) => {
self.mark_healing_finished(started_at, None, Some(e.to_string())).await;
tracing::error!("Healing scan failed: {}", e);
}
}
}
_ = shutdown.changed() => {
tracing::info!("Healing service shutting down");
break;
}
}
}
}
async fn mark_healing_started(&self, started_at: DateTime<Utc>) {
let mut runtime_state = self.runtime_state.write().await;
runtime_state.active = true;
runtime_state.scan_interval_ms = self.scan_interval.as_millis() as u64;
runtime_state.last_run_started_at = Some(started_at);
runtime_state.last_error = None;
}
async fn mark_healing_finished(
&self,
started_at: DateTime<Utc>,
stats: Option<HealStats>,
last_error: Option<String>,
) {
let finished_at = Utc::now();
let mut runtime_state = self.runtime_state.write().await;
runtime_state.active = false;
runtime_state.scan_interval_ms = self.scan_interval.as_millis() as u64;
runtime_state.last_run_completed_at = Some(finished_at);
runtime_state.last_duration_ms = Some(
finished_at
.signed_duration_since(started_at)
.num_milliseconds()
.max(0) as u64,
);
if let Some(stats) = stats {
runtime_state.last_stats = Some(stats);
}
runtime_state.last_error = last_error;
}
/// Scan all manifests for shards on offline nodes, reconstruct and re-place them.
async fn heal_scan(&self) -> Result<HealStats> {
let mut stats = HealStats::default();
let offline_nodes = self.state.offline_nodes().await;
if offline_nodes.is_empty() {
tracing::debug!("No offline nodes, skipping heal scan");
return Ok(stats);
}
// Check that we have majority before healing (split-brain prevention)
if !self.state.has_majority().await {
tracing::warn!("No majority quorum, skipping heal to prevent split-brain");
return Ok(stats);
}
tracing::info!(
"Found {} offline nodes, scanning for affected shards",
offline_nodes.len()
);
// Iterate all bucket directories under manifest_dir
let mut bucket_entries = match fs::read_dir(&self.manifest_dir).await {
Ok(e) => e,
Err(_) => return Ok(stats),
};
while let Some(bucket_entry) = bucket_entries.next_entry().await? {
if !bucket_entry.metadata().await?.is_dir() {
continue;
}
let bucket_name = bucket_entry.file_name().to_string_lossy().to_string();
if bucket_name.starts_with('.') {
continue;
}
// Scan manifests in this bucket
self.heal_bucket(&bucket_name, &offline_nodes, &mut stats)
.await;
// Yield to avoid starving foreground I/O
tokio::task::yield_now().await;
}
Ok(stats)
}
async fn heal_bucket(
&self,
bucket: &str,
offline_nodes: &[String],
stats: &mut HealStats,
) {
let bucket_dir = self.manifest_dir.join(bucket);
let manifests = match self.collect_manifests(&bucket_dir).await {
Ok(m) => m,
Err(e) => {
tracing::warn!(bucket = bucket, error = %e, "Failed to list manifests");
stats.errors += 1;
return;
}
};
let local_id = self.state.local_node_id().to_string();
for manifest in &manifests {
for chunk in &manifest.chunks {
// Check if any shard in this chunk is on an offline node
let affected: Vec<_> = chunk
.shard_placements
.iter()
.filter(|p| offline_nodes.contains(&p.node_id))
.collect();
if affected.is_empty() {
continue;
}
stats.shards_checked += chunk.shard_placements.len() as u64;
// Try to reconstruct missing shards from available ones
let k = manifest.data_shards;
let total = manifest.data_shards + manifest.parity_shards;
// Count available shards (those NOT on offline nodes)
let available_count = chunk
.shard_placements
.iter()
.filter(|p| !offline_nodes.contains(&p.node_id))
.count();
if available_count < k {
tracing::error!(
bucket = manifest.bucket,
key = manifest.key,
chunk = chunk.chunk_index,
available = available_count,
needed = k,
"Cannot heal chunk: not enough available shards"
);
stats.errors += 1;
continue;
}
// Fetch available shards (only local ones for now)
let mut shards: Vec<Option<Vec<u8>>> = vec![None; total];
let mut fetched = 0usize;
for placement in &chunk.shard_placements {
if offline_nodes.contains(&placement.node_id) {
continue; // Skip offline nodes
}
if fetched >= k {
break;
}
if placement.node_id == local_id {
let shard_id = ShardId {
bucket: manifest.bucket.clone(),
key: manifest.key.clone(),
chunk_index: chunk.chunk_index,
shard_index: placement.shard_index,
};
let store_idx = placement.drive_id.parse::<usize>().unwrap_or(0);
if let Some(store) = self.local_shard_stores.get(store_idx) {
if let Ok((data, _)) = store.read_shard(&shard_id).await {
shards[placement.shard_index as usize] = Some(data);
fetched += 1;
}
}
}
// TODO: fetch from other online remote nodes
}
if fetched < k {
tracing::warn!(
bucket = manifest.bucket,
key = manifest.key,
chunk = chunk.chunk_index,
"Not enough local shards to heal, skipping"
);
continue;
}
// Reconstruct all shards
let reconstructed = match self.erasure_coder.decode_chunk(
&mut shards,
chunk.data_size,
) {
Ok(_) => true,
Err(e) => {
tracing::error!(
bucket = manifest.bucket,
key = manifest.key,
chunk = chunk.chunk_index,
error = %e,
"Reconstruction failed"
);
stats.errors += 1;
false
}
};
if !reconstructed {
continue;
}
// Re-encode to get all shards back (including the missing ones)
let full_data_size = chunk.data_size;
let mut data_buf = Vec::with_capacity(full_data_size);
for i in 0..k {
if let Some(ref shard) = shards[i] {
data_buf.extend_from_slice(shard);
}
}
data_buf.truncate(full_data_size);
let all_shards = match self.erasure_coder.encode_chunk(&data_buf) {
Ok(s) => s,
Err(e) => {
tracing::error!(error = %e, "Re-encoding for heal failed");
stats.errors += 1;
continue;
}
};
// Verify reconstructed shards are consistent
if !self.erasure_coder.verify(&all_shards).unwrap_or(false) {
tracing::error!(
bucket = manifest.bucket,
key = manifest.key,
chunk = chunk.chunk_index,
"Shard verification failed after reconstruction"
);
stats.errors += 1;
continue;
}
// Write the missing shards to the first available local drive
for affected_placement in &affected {
let shard_idx = affected_placement.shard_index as usize;
if shard_idx < all_shards.len() {
let shard_data = &all_shards[shard_idx];
let checksum = crc32c::crc32c(shard_data);
let shard_id = ShardId {
bucket: manifest.bucket.clone(),
key: manifest.key.clone(),
chunk_index: chunk.chunk_index,
shard_index: affected_placement.shard_index,
};
// Place on first available local drive
if let Some(store) = self.local_shard_stores.first() {
match store.write_shard(&shard_id, shard_data, checksum).await {
Ok(()) => {
stats.shards_healed += 1;
tracing::info!(
bucket = manifest.bucket,
key = manifest.key,
chunk = chunk.chunk_index,
shard = affected_placement.shard_index,
"Shard healed successfully"
);
}
Err(e) => {
tracing::error!(error = %e, "Failed to write healed shard");
stats.errors += 1;
}
}
}
}
}
tokio::task::yield_now().await;
}
}
}
/// Collect all manifests under a bucket directory.
async fn collect_manifests(&self, dir: &std::path::Path) -> Result<Vec<ObjectManifest>> {
let mut manifests = Vec::new();
self.collect_manifests_recursive(dir, &mut manifests).await?;
Ok(manifests)
}
fn collect_manifests_recursive<'a>(
&'a self,
dir: &'a std::path::Path,
manifests: &'a mut Vec<ObjectManifest>,
) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<()>> + Send + 'a>> {
Box::pin(async move {
let mut entries = match fs::read_dir(dir).await {
Ok(e) => e,
Err(_) => return Ok(()),
};
while let Some(entry) = entries.next_entry().await? {
let meta = entry.metadata().await?;
let name = entry.file_name().to_string_lossy().to_string();
if meta.is_dir() {
self.collect_manifests_recursive(&entry.path(), manifests)
.await?;
} else if name.ends_with(".manifest.json") {
if let Ok(content) = fs::read_to_string(entry.path()).await {
if let Ok(manifest) = serde_json::from_str::<ObjectManifest>(&content) {
manifests.push(manifest);
}
}
}
}
Ok(())
})
}
}
#[derive(Debug, Clone, Default)]
pub struct HealStats {
pub shards_checked: u64,
pub shards_healed: u64,
pub errors: u64,
}
#[derive(Debug, Clone, Default)]
pub struct HealingRuntimeState {
pub active: bool,
pub scan_interval_ms: u64,
pub last_run_started_at: Option<DateTime<Utc>>,
pub last_run_completed_at: Option<DateTime<Utc>>,
pub last_duration_ms: Option<u64>,
pub last_stats: Option<HealStats>,
pub last_error: Option<String>,
}
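The healing loop above is a standard tokio interval-plus-shutdown select. A standalone sketch of that control structure, assuming tokio's full feature set, with the heal scan replaced by a stub:

```rust
use std::time::Duration;

#[tokio::main]
async fn main() {
    let (shutdown_tx, mut shutdown_rx) = tokio::sync::watch::channel(false);

    let worker = tokio::spawn(async move {
        let mut interval = tokio::time::interval(Duration::from_millis(50));
        interval.tick().await; // skip the immediate first tick, as run() does
        loop {
            tokio::select! {
                _ = interval.tick() => {
                    // In the real service this is heal_scan().
                    println!("healing scan tick");
                }
                _ = shutdown_rx.changed() => {
                    println!("healing service shutting down");
                    break;
                }
            }
        }
    });

    tokio::time::sleep(Duration::from_millis(160)).await;
    shutdown_tx.send(true).expect("receiver alive");
    worker.await.expect("worker finished cleanly");
}
```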
+243
@@ -0,0 +1,243 @@
use anyhow::Result;
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Mutex;
use super::drive_manager::{DriveManager, DriveStatus};
use super::protocol::{
ClusterRequest, ClusterResponse, DriveStateInfo, HeartbeatMessage, JoinRequestMessage,
NodeInfo,
};
use super::quic_transport::QuicTransport;
use super::state::ClusterState;
/// Manages cluster membership: heartbeating, joining, failure detection.
pub struct MembershipManager {
state: Arc<ClusterState>,
transport: Arc<QuicTransport>,
heartbeat_interval: Duration,
heartbeat_timeout: Duration,
local_node_info: NodeInfo,
drive_manager: Option<Arc<Mutex<DriveManager>>>,
}
impl MembershipManager {
pub fn new(
state: Arc<ClusterState>,
transport: Arc<QuicTransport>,
heartbeat_interval_ms: u64,
heartbeat_timeout_ms: u64,
local_node_info: NodeInfo,
) -> Self {
Self {
state,
transport,
heartbeat_interval: Duration::from_millis(heartbeat_interval_ms),
heartbeat_timeout: Duration::from_millis(heartbeat_timeout_ms),
local_node_info,
drive_manager: None,
}
}
/// Set the drive manager for health reporting in heartbeats.
pub fn with_drive_manager(mut self, dm: Arc<Mutex<DriveManager>>) -> Self {
self.drive_manager = Some(dm);
self
}
/// Join the cluster by contacting seed nodes.
/// Sends a JoinRequest to each seed node until one accepts.
pub async fn join_cluster(&self, seed_nodes: &[String], allow_bootstrap_on_failure: bool) -> Result<()> {
if seed_nodes.is_empty() {
tracing::info!("No seed nodes configured, starting as initial cluster node");
self.state.add_node(self.local_node_info.clone()).await;
return Ok(());
}
for seed in seed_nodes {
let addr: SocketAddr = match seed.parse() {
Ok(a) => a,
Err(e) => {
tracing::warn!("Invalid seed node address '{}': {}", seed, e);
continue;
}
};
tracing::info!("Attempting to join cluster via seed node {}", seed);
match self.try_join(addr).await {
Ok(()) => {
tracing::info!("Successfully joined cluster via {}", seed);
return Ok(());
}
Err(e) => {
tracing::warn!("Failed to join via {}: {}", seed, e);
}
}
}
if allow_bootstrap_on_failure {
tracing::warn!("Could not reach any seed nodes, bootstrapping a new cluster because no persisted topology exists");
self.state.add_node(self.local_node_info.clone()).await;
return Ok(());
}
anyhow::bail!("Could not reach any configured seed nodes; refusing unsafe cluster bootstrap")
}
async fn try_join(&self, addr: SocketAddr) -> Result<()> {
let conn = self
.transport
.get_connection("seed", addr)
.await?;
let request = ClusterRequest::JoinRequest(JoinRequestMessage {
node_info: self.local_node_info.clone(),
});
let response = self.transport.send_request(&conn, &request).await?;
match response {
ClusterResponse::JoinResponse(join_resp) => {
if join_resp.accepted {
if let Some(topology) = &join_resp.topology {
let topology_contains_self = topology
.nodes
.iter()
.any(|node| node.node_id == self.local_node_info.node_id);
self.state.apply_topology(topology).await;
if !topology_contains_self {
self.state.add_node(self.local_node_info.clone()).await;
}
tracing::info!(
"Applied cluster topology (version {}, {} nodes, {} erasure sets)",
topology.version,
topology.nodes.len(),
topology.erasure_sets.len(),
);
}
Ok(())
} else {
anyhow::bail!(
"Join rejected: {}",
join_resp.error.unwrap_or_default()
)
}
}
ClusterResponse::Error(e) => {
anyhow::bail!("Join error: {} - {}", e.code, e.message)
}
_ => anyhow::bail!("Unexpected response to join request"),
}
}
/// Run the heartbeat loop. Sends heartbeats to all peers periodically.
pub async fn heartbeat_loop(self: Arc<Self>, mut shutdown: tokio::sync::watch::Receiver<bool>) {
let mut interval = tokio::time::interval(self.heartbeat_interval);
loop {
tokio::select! {
_ = interval.tick() => {
self.send_heartbeats().await;
}
_ = shutdown.changed() => break,
}
}
}
async fn send_heartbeats(&self) {
let peers = self
.state
.all_nodes()
.await
.into_iter()
.filter(|node| node.info.node_id != self.local_node_info.node_id)
.collect::<Vec<_>>();
let topology_version = self.state.version().await;
let mut responded = Vec::new();
// Collect drive health states
let drive_states = self.collect_drive_states().await;
for peer in &peers {
let addr: SocketAddr = match peer.info.quic_addr.parse() {
Ok(a) => a,
Err(_) => continue,
};
let heartbeat = ClusterRequest::Heartbeat(HeartbeatMessage {
node_id: self.local_node_info.node_id.clone(),
timestamp: chrono::Utc::now().to_rfc3339(),
drive_states: drive_states.clone(),
topology_version,
});
match tokio::time::timeout(
self.heartbeat_timeout,
self.send_heartbeat_to_peer(&peer.info.node_id, addr, &heartbeat),
)
.await
{
Ok(Ok(())) => {
responded.push(peer.info.node_id.clone());
}
Ok(Err(e)) => {
tracing::debug!(
peer = %peer.info.node_id,
error = %e,
"Heartbeat failed"
);
}
Err(_) => {
tracing::debug!(peer = %peer.info.node_id, "Heartbeat timed out");
}
}
}
// Update state based on responses
let status_changes = self.state.tick_heartbeats(&responded).await;
for (node_id, status) in &status_changes {
tracing::info!(node = %node_id, status = ?status, "Node status changed");
}
}
async fn send_heartbeat_to_peer(
&self,
node_id: &str,
addr: SocketAddr,
heartbeat: &ClusterRequest,
) -> Result<()> {
let conn = self.transport.get_connection(node_id, addr).await?;
let _response = self.transport.send_request(&conn, heartbeat).await?;
Ok(())
}
/// Collect drive health states from the DriveManager, if available.
async fn collect_drive_states(&self) -> Vec<DriveStateInfo> {
let dm = match &self.drive_manager {
Some(dm) => dm,
None => return Vec::new(),
};
let mut manager = dm.lock().await;
let results = manager.check_all_drives().await;
results
.into_iter()
.map(|(idx, status)| {
let status_str = match status {
DriveStatus::Online => "online",
DriveStatus::Degraded => "degraded",
DriveStatus::Offline => "offline",
DriveStatus::Healing => "healing",
};
DriveStateInfo {
drive_index: idx as u32,
status: status_str.to_string(),
}
})
.collect()
}
}
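Heartbeats are fanned out with a per-peer deadline; peers that fail to answer inside `heartbeat_timeout` simply drop out of the responded list that feeds `tick_heartbeats`. A sketch of that timeout-bounded fan-out with a stubbed transport (the `ping` helper here is hypothetical, standing in for `send_heartbeat_to_peer`):

```rust
use std::time::Duration;

async fn ping(peer: &str) -> anyhow::Result<()> {
    // Stand-in for the QUIC request; "node3" never answers.
    if peer == "node3" {
        tokio::time::sleep(Duration::from_secs(60)).await;
    }
    Ok(())
}

#[tokio::main]
async fn main() {
    let peers = ["node1", "node2", "node3"];
    let timeout = Duration::from_millis(100);
    let mut responded = Vec::new();
    for peer in peers {
        match tokio::time::timeout(timeout, ping(peer)).await {
            Ok(Ok(())) => responded.push(peer),
            Ok(Err(e)) => eprintln!("heartbeat to {peer} failed: {e}"),
            Err(_) => eprintln!("heartbeat to {peer} timed out"),
        }
    }
    // Only peers that answered within the deadline count as alive.
    assert_eq!(responded, vec!["node1", "node2"]);
}
```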
+85
@@ -0,0 +1,85 @@
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Full manifest describing how an object is stored across erasure-coded shards.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ObjectManifest {
/// Bucket name
pub bucket: String,
/// Object key
pub key: String,
/// Unique version ID for this write
pub version_id: String,
/// Total object size in bytes
pub size: u64,
/// MD5 hex digest of the complete object
pub content_md5: String,
/// Content type
pub content_type: String,
/// User metadata (x-amz-meta-*, content-type, etc.)
pub metadata: HashMap<String, String>,
/// When the object was created
pub created_at: String,
/// Last modified timestamp
pub last_modified: String,
/// Number of data shards used
pub data_shards: usize,
/// Number of parity shards used
pub parity_shards: usize,
/// Chunk size in bytes (last chunk may be smaller)
pub chunk_size: usize,
/// Per-chunk shard placement info
pub chunks: Vec<ChunkManifest>,
}
/// Describes the shards for a single chunk of an object.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ChunkManifest {
/// Index of this chunk (0-based)
pub chunk_index: u32,
/// Actual data size of this chunk (before erasure coding)
pub data_size: usize,
/// Where each shard was placed
pub shard_placements: Vec<ShardPlacement>,
}
/// Describes where a specific shard is stored.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ShardPlacement {
/// Shard index within the erasure set (0..data_shards+parity_shards)
pub shard_index: u32,
/// Node that holds this shard
pub node_id: String,
/// Drive ID on that node
pub drive_id: String,
/// CRC32C checksum of the shard data
pub checksum: u32,
/// Size of the shard data in bytes
pub shard_size: usize,
}
/// Manifest for a multipart upload in progress.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct MultipartUploadManifest {
pub upload_id: String,
pub bucket: String,
pub key: String,
pub initiated: String,
pub metadata: HashMap<String, String>,
/// Per-part manifests, keyed by part number.
pub parts: HashMap<u32, PartManifest>,
}
/// Manifest for a single part of a multipart upload.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct PartManifest {
pub part_number: u32,
pub size: u64,
pub md5: String,
pub chunks: Vec<ChunkManifest>,
}
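A manifest's chunk list follows directly from the object size and chunk size: ceil(size / chunk_size) chunks, the last possibly short, each carrying one `ShardPlacement` per shard. A worked example with the 4 MB default and 4+2 coding:

```rust
// How many ChunkManifest entries a 10 MiB object produces with 4 MiB chunks
// and 4 data + 2 parity shards.
fn main() {
    let object_size: u64 = 10 * 1024 * 1024;
    let chunk_size: u64 = 4 * 1024 * 1024;
    let (data_shards, parity_shards) = (4u64, 2u64);

    let chunk_count = (object_size + chunk_size - 1) / chunk_size; // ceiling division
    let last_chunk_size = object_size - (chunk_count - 1) * chunk_size;
    let placements_per_chunk = data_shards + parity_shards;

    assert_eq!(chunk_count, 3);
    assert_eq!(last_chunk_size, 2 * 1024 * 1024); // the last chunk is smaller
    assert_eq!(chunk_count * placements_per_chunk, 18); // total ShardPlacement entries
    println!("{chunk_count} chunks, {placements_per_chunk} placements each");
}
```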
+17
@@ -0,0 +1,17 @@
// Cluster modules contain forward-looking public API that is incrementally wired.
// Allow dead_code for methods/structs not yet called from the main server path.
#![allow(dead_code)]
pub mod config;
pub mod coordinator;
pub mod drive_manager;
pub mod erasure;
pub mod healing;
pub mod membership;
pub mod metadata;
pub mod persistence;
pub mod placement;
pub mod protocol;
pub mod quic_transport;
pub mod shard_store;
pub mod state;
+77
View File
@@ -0,0 +1,77 @@
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use tokio::fs;
use super::protocol::ClusterTopology;
const CLUSTER_METADATA_DIR: &str = ".smartstorage/cluster";
const IDENTITY_FILE: &str = "identity.json";
const TOPOLOGY_FILE: &str = "topology.json";
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ClusterIdentity {
pub schema_version: u32,
pub node_id: String,
pub cluster_id: String,
}
impl ClusterIdentity {
pub fn new(node_id: String, cluster_id: String) -> Self {
Self {
schema_version: 1,
node_id,
cluster_id,
}
}
}
pub fn cluster_metadata_dir(storage_directory: &str) -> PathBuf {
PathBuf::from(storage_directory).join(CLUSTER_METADATA_DIR)
}
pub fn identity_path(metadata_dir: &Path) -> PathBuf {
metadata_dir.join(IDENTITY_FILE)
}
pub fn topology_path(metadata_dir: &Path) -> PathBuf {
metadata_dir.join(TOPOLOGY_FILE)
}
pub async fn load_identity(path: &Path) -> Result<Option<ClusterIdentity>> {
match fs::read_to_string(path).await {
Ok(content) => Ok(Some(serde_json::from_str(&content)?)),
Err(error) if error.kind() == std::io::ErrorKind::NotFound => Ok(None),
Err(error) => Err(error.into()),
}
}
pub async fn persist_identity(path: &Path, identity: &ClusterIdentity) -> Result<()> {
write_json_atomic(path, identity).await
}
pub async fn load_topology(path: &Path) -> Result<Option<ClusterTopology>> {
match fs::read_to_string(path).await {
Ok(content) => Ok(Some(serde_json::from_str(&content)?)),
Err(error) if error.kind() == std::io::ErrorKind::NotFound => Ok(None),
Err(error) => Err(error.into()),
}
}
pub async fn persist_topology(path: &Path, topology: &ClusterTopology) -> Result<()> {
write_json_atomic(path, topology).await
}
async fn write_json_atomic<T: Serialize>(path: &Path, value: &T) -> Result<()> {
let parent = path
.parent()
.ok_or_else(|| anyhow::anyhow!("Cluster metadata path has no parent"))?;
fs::create_dir_all(parent).await?;
let temp_path = path.with_extension("json.tmp");
let content = serde_json::to_string_pretty(value)?;
fs::write(&temp_path, content).await?;
fs::rename(&temp_path, path).await?;
Ok(())
}
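Both loaders above treat a missing file as `Ok(None)`, so first boot and every subsequent restart share one code path. A standalone sketch of that NotFound-tolerant pattern, assuming tokio, serde_json, and anyhow; the `Identity` struct and path are simplified stand-ins:

```rust
use std::path::Path;
use tokio::fs;

// A missing file yields Ok(None); any other I/O or parse error propagates.
async fn load_json<T: serde::de::DeserializeOwned>(path: &Path) -> anyhow::Result<Option<T>> {
    match fs::read_to_string(path).await {
        Ok(content) => Ok(Some(serde_json::from_str(&content)?)),
        Err(error) if error.kind() == std::io::ErrorKind::NotFound => Ok(None),
        Err(error) => Err(error.into()),
    }
}

#[derive(serde::Deserialize, Debug)]
struct Identity {
    node_id: String,
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let identity: Option<Identity> = load_json(Path::new("/tmp/identity.json")).await?;
    match identity {
        Some(id) => println!("existing node {}", id.node_id),
        None => println!("first boot: mint a fresh identity and persist it"),
    }
    Ok(())
}
```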
+140
@@ -0,0 +1,140 @@
use xxhash_rust::xxh64::xxh64;
/// Determines which erasure set an object belongs to, based on consistent hashing.
///
/// Uses xxhash64 of "{bucket}/{key}" to deterministically map objects to erasure sets.
/// This is stateless — any node can independently compute the placement.
pub fn erasure_set_for_object(bucket: &str, key: &str, num_erasure_sets: u32) -> u32 {
if num_erasure_sets == 0 {
return 0;
}
let hash_input = format!("{}/{}", bucket, key);
let hash = xxh64(hash_input.as_bytes(), 0);
(hash % num_erasure_sets as u64) as u32
}
/// Represents a drive location within the cluster topology.
#[derive(Debug, Clone)]
pub struct DriveLocation {
pub node_id: String,
pub drive_index: u32,
}
/// An erasure set: a fixed group of drives that together store one complete
/// set of shards for any object placed on them.
#[derive(Debug, Clone)]
pub struct ErasureSet {
pub set_id: u32,
/// Ordered drives: index = shard_index
pub drives: Vec<DriveLocation>,
}
/// Form erasure sets from the available drives across all nodes.
///
/// Interleaves drives from different nodes for fault isolation:
/// e.g., with 3 nodes x 4 drives and total_shards=6:
/// Set 0: N0-D0, N1-D0, N2-D0, N0-D1, N1-D1, N2-D1
/// Set 1: N0-D2, N1-D2, N2-D2, N0-D3, N1-D3, N2-D3
pub fn form_erasure_sets(
nodes: &[(String, u32)], // (node_id, drive_count)
total_shards: usize,
) -> Vec<ErasureSet> {
// Collect all drives as (node_id, drive_index), interleaved by node
let max_drives = nodes.iter().map(|(_, count)| *count).max().unwrap_or(0) as usize;
let mut all_drives: Vec<DriveLocation> = Vec::new();
for drive_idx in 0..max_drives {
for (node_id, drive_count) in nodes {
if (drive_idx as u32) < *drive_count {
all_drives.push(DriveLocation {
node_id: node_id.clone(),
drive_index: drive_idx as u32,
});
}
}
}
// Form sets of total_shards drives each
let num_sets = all_drives.len() / total_shards;
let mut sets = Vec::with_capacity(num_sets);
for set_idx in 0..num_sets {
let start = set_idx * total_shards;
let end = start + total_shards;
let drives = all_drives[start..end].to_vec();
sets.push(ErasureSet {
set_id: set_idx as u32,
drives,
});
}
sets
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_erasure_set_assignment_deterministic() {
let set_a = erasure_set_for_object("mybucket", "mykey", 4);
let set_b = erasure_set_for_object("mybucket", "mykey", 4);
assert_eq!(set_a, set_b);
}
#[test]
fn test_erasure_set_distribution() {
// Check that objects are distributed across sets
let num_sets = 4u32;
let mut counts = vec![0u32; num_sets as usize];
for i in 0..1000 {
let key = format!("key-{}", i);
let set = erasure_set_for_object("bucket", &key, num_sets);
assert!(set < num_sets);
counts[set as usize] += 1;
}
// Each set should have some objects (not all in one set)
for count in &counts {
assert!(*count > 100, "Expected >100, got {}", count);
}
}
#[test]
fn test_form_erasure_sets_3x4() {
// 3 nodes, 4 drives each, 6 shards per set => 2 sets
let nodes = vec![
("node1".to_string(), 4),
("node2".to_string(), 4),
("node3".to_string(), 4),
];
let sets = form_erasure_sets(&nodes, 6);
assert_eq!(sets.len(), 2);
// Set 0 should interleave across nodes
let set0_nodes: Vec<&str> = sets[0].drives.iter().map(|d| d.node_id.as_str()).collect();
assert_eq!(set0_nodes, vec!["node1", "node2", "node3", "node1", "node2", "node3"]);
// Set 1 should also interleave
let set1_nodes: Vec<&str> = sets[1].drives.iter().map(|d| d.node_id.as_str()).collect();
assert_eq!(set1_nodes, vec!["node1", "node2", "node3", "node1", "node2", "node3"]);
// Drive indices should be different between sets
let set0_drives: Vec<u32> = sets[0].drives.iter().map(|d| d.drive_index).collect();
let set1_drives: Vec<u32> = sets[1].drives.iter().map(|d| d.drive_index).collect();
assert_eq!(set0_drives, vec![0, 0, 0, 1, 1, 1]);
assert_eq!(set1_drives, vec![2, 2, 2, 3, 3, 3]);
}
#[test]
fn test_form_erasure_sets_remainder() {
// 2 nodes, 3 drives each, 4 shards => 1 set (2 drives left over)
let nodes = vec![
("a".to_string(), 3),
("b".to_string(), 3),
];
let sets = form_erasure_sets(&nodes, 4);
assert_eq!(sets.len(), 1);
assert_eq!(sets[0].drives.len(), 4);
}
}
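Putting placement together: an object hashes to an erasure set, and shard `i` of every chunk lands on drive `i` of that set. A sketch under the 3-nodes-by-4-drives example from the comments above, with the set layout hard-coded for brevity rather than computed by `form_erasure_sets`; it uses the xxhash-rust dependency added in Cargo.toml:

```rust
use xxhash_rust::xxh64::xxh64;

// Same stateless mapping as erasure_set_for_object above.
fn erasure_set_for_object(bucket: &str, key: &str, num_sets: u32) -> u32 {
    if num_sets == 0 {
        return 0;
    }
    (xxh64(format!("{}/{}", bucket, key).as_bytes(), 0) % num_sets as u64) as u32
}

fn main() {
    // Two interleaved sets of six (node, drive) slots each, matching the
    // 3 nodes x 4 drives, total_shards = 6 layout shown in the tests.
    let sets: Vec<Vec<(&str, u32)>> = vec![
        vec![("node1", 0), ("node2", 0), ("node3", 0), ("node1", 1), ("node2", 1), ("node3", 1)],
        vec![("node1", 2), ("node2", 2), ("node3", 2), ("node1", 3), ("node2", 3), ("node3", 3)],
    ];
    let set = erasure_set_for_object("mybucket", "photos/cat.jpg", sets.len() as u32);
    // Shard index maps positionally onto the set's ordered drives.
    for (shard_index, (node, drive)) in sets[set as usize].iter().enumerate() {
        println!("shard {shard_index} -> {node} drive {drive}");
    }
}
```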
+388
@@ -0,0 +1,388 @@
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use super::metadata::ObjectManifest;
/// All inter-node cluster messages, serialized with bincode over QUIC streams.
///
/// Each message type gets its own bidirectional QUIC stream.
/// For shard data transfers, the header is sent first (bincode),
/// then raw shard bytes follow on the same stream.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ClusterRequest {
// ============================
// Shard operations
// ============================
/// Write a shard to a specific drive on the target node.
/// Shard data follows after this header on the same stream.
ShardWrite(ShardWriteRequest),
/// Read a shard from the target node.
ShardRead(ShardReadRequest),
/// Delete a shard from the target node.
ShardDelete(ShardDeleteRequest),
/// Check if a shard exists and get its metadata.
ShardHead(ShardHeadRequest),
// ============================
// Manifest operations
// ============================
/// Store an object manifest on the target node.
ManifestWrite(ManifestWriteRequest),
/// Retrieve an object manifest from the target node.
ManifestRead(ManifestReadRequest),
/// Delete an object manifest from the target node.
ManifestDelete(ManifestDeleteRequest),
/// List all manifests for a bucket on the target node.
ManifestList(ManifestListRequest),
// ============================
// Cluster management
// ============================
/// Periodic heartbeat.
Heartbeat(HeartbeatMessage),
/// Request to join the cluster.
JoinRequest(JoinRequestMessage),
/// Synchronize cluster topology.
TopologySync(TopologySyncMessage),
// ============================
// Healing
// ============================
/// Request a shard to be reconstructed and placed on a target drive.
HealRequest(HealRequestMessage),
}
/// Responses to cluster requests.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ClusterResponse {
// Shard ops
ShardWriteAck(ShardWriteAck),
ShardReadResponse(ShardReadResponse),
ShardDeleteAck(ShardDeleteAck),
ShardHeadResponse(ShardHeadResponse),
// Manifest ops
ManifestWriteAck(ManifestWriteAck),
ManifestReadResponse(ManifestReadResponse),
ManifestDeleteAck(ManifestDeleteAck),
ManifestListResponse(ManifestListResponse),
// Cluster mgmt
HeartbeatAck(HeartbeatAckMessage),
JoinResponse(JoinResponseMessage),
TopologySyncAck(TopologySyncAckMessage),
// Healing
HealResponse(HealResponseMessage),
// Error
Error(ErrorResponse),
}
// ============================
// Shard operation messages
// ============================
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardWriteRequest {
pub request_id: String,
pub bucket: String,
pub key: String,
pub chunk_index: u32,
pub shard_index: u32,
pub drive_index: u32,
pub shard_data_length: u64,
pub checksum: u32, // crc32c of shard data
pub object_metadata: HashMap<String, String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardWriteAck {
pub request_id: String,
pub success: bool,
pub error: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardReadRequest {
pub request_id: String,
pub bucket: String,
pub key: String,
pub chunk_index: u32,
pub shard_index: u32,
pub drive_index: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardReadResponse {
pub request_id: String,
pub found: bool,
pub shard_data_length: u64,
pub checksum: u32,
// Shard data follows on the stream after this header
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardDeleteRequest {
pub request_id: String,
pub bucket: String,
pub key: String,
pub chunk_index: u32,
pub shard_index: u32,
pub drive_index: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardDeleteAck {
pub request_id: String,
pub success: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardHeadRequest {
pub request_id: String,
pub bucket: String,
pub key: String,
pub chunk_index: u32,
pub shard_index: u32,
pub drive_index: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardHeadResponse {
pub request_id: String,
pub found: bool,
pub data_size: u64,
pub checksum: u32,
}
// ============================
// Manifest operation messages
// ============================
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestWriteRequest {
pub request_id: String,
pub manifest: ObjectManifest,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestWriteAck {
pub request_id: String,
pub success: bool,
pub error: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestReadRequest {
pub request_id: String,
pub bucket: String,
pub key: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestReadResponse {
pub request_id: String,
pub found: bool,
pub manifest: Option<ObjectManifest>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestDeleteRequest {
pub request_id: String,
pub bucket: String,
pub key: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestDeleteAck {
pub request_id: String,
pub success: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestListRequest {
pub request_id: String,
pub bucket: String,
pub prefix: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestListResponse {
pub request_id: String,
pub manifests: Vec<ObjectManifest>,
}
// ============================
// Cluster management messages
// ============================
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DriveStateInfo {
pub drive_index: u32,
pub status: String, // "online", "degraded", "offline", "healing"
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HeartbeatMessage {
pub node_id: String,
pub timestamp: String,
pub drive_states: Vec<DriveStateInfo>,
pub topology_version: u64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HeartbeatAckMessage {
pub node_id: String,
pub timestamp: String,
pub topology_version: u64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeInfo {
pub node_id: String,
pub quic_addr: String,
pub s3_addr: String,
pub drive_count: u32,
pub status: String,
pub version: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JoinRequestMessage {
pub node_info: NodeInfo,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClusterTopology {
pub version: u64,
pub cluster_id: String,
pub nodes: Vec<NodeInfo>,
pub erasure_sets: Vec<ErasureSetInfo>,
pub data_shards: usize,
pub parity_shards: usize,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErasureSetInfo {
pub set_id: u32,
pub drives: Vec<DriveLocationInfo>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DriveLocationInfo {
pub node_id: String,
pub drive_index: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JoinResponseMessage {
pub accepted: bool,
pub topology: Option<ClusterTopology>,
pub error: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopologySyncMessage {
pub topology: ClusterTopology,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopologySyncAckMessage {
pub accepted: bool,
pub current_version: u64,
}
// ============================
// Healing messages
// ============================
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealRequestMessage {
pub request_id: String,
pub bucket: String,
pub key: String,
pub chunk_index: u32,
pub shard_index: u32,
pub target_node_id: String,
pub target_drive_index: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealResponseMessage {
pub request_id: String,
pub success: bool,
pub error: Option<String>,
}
// ============================
// Error response
// ============================
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorResponse {
pub request_id: String,
pub code: String,
pub message: String,
}
// ============================
// Wire format helpers
// ============================
/// Serialize a request to bincode bytes with a 4-byte length prefix.
pub fn encode_request(req: &ClusterRequest) -> anyhow::Result<Vec<u8>> {
let payload = bincode::serialize(req)?;
let mut buf = Vec::with_capacity(4 + payload.len());
buf.extend_from_slice(&(payload.len() as u32).to_le_bytes());
buf.extend_from_slice(&payload);
Ok(buf)
}
/// Serialize a response to bincode bytes with a 4-byte length prefix.
pub fn encode_response(resp: &ClusterResponse) -> anyhow::Result<Vec<u8>> {
let payload = bincode::serialize(resp)?;
let mut buf = Vec::with_capacity(4 + payload.len());
buf.extend_from_slice(&(payload.len() as u32).to_le_bytes());
buf.extend_from_slice(&payload);
Ok(buf)
}
/// Read a length-prefixed bincode message from raw bytes.
/// Returns (decoded message, bytes consumed).
pub fn decode_request(data: &[u8]) -> anyhow::Result<(ClusterRequest, usize)> {
if data.len() < 4 {
anyhow::bail!("Not enough data for length prefix");
}
let len = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
if data.len() < 4 + len {
anyhow::bail!("Not enough data for message body");
}
let msg: ClusterRequest = bincode::deserialize(&data[4..4 + len])?;
Ok((msg, 4 + len))
}
/// Read a length-prefixed bincode response from raw bytes.
pub fn decode_response(data: &[u8]) -> anyhow::Result<(ClusterResponse, usize)> {
if data.len() < 4 {
anyhow::bail!("Not enough data for length prefix");
}
let len = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
if data.len() < 4 + len {
anyhow::bail!("Not enough data for message body");
}
let msg: ClusterResponse = bincode::deserialize(&data[4..4 + len])?;
Ok((msg, 4 + len))
}
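// Illustrative round-trip (a sketch, not part of the diff): encode_request and
// decode_request are inverses, and the returned byte count lets a caller
// consume framed messages from a larger buffer. Bucket/key values are made up.
#[cfg(test)]
mod wire_format_sketch {
    use super::*;

    #[test]
    fn encode_then_decode_round_trips() -> anyhow::Result<()> {
        let req = ClusterRequest::ManifestRead(ManifestReadRequest {
            request_id: "req-1".to_string(),
            bucket: "photos".to_string(),
            key: "2026/cat.jpg".to_string(),
        });
        let bytes = encode_request(&req)?;
        let (decoded, consumed) = decode_request(&bytes)?;
        // The whole buffer is one framed message: 4-byte prefix + payload.
        assert_eq!(consumed, bytes.len());
        assert!(matches!(decoded, ClusterRequest::ManifestRead(_)));
        Ok(())
    }
}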
+645
@@ -0,0 +1,645 @@
use anyhow::Result;
use dashmap::DashMap;
use quinn::{ClientConfig, Endpoint, ServerConfig as QuinnServerConfig};
use rustls::pki_types::{CertificateDer, PrivateKeyDer, PrivatePkcs8KeyDer};
use std::net::SocketAddr;
use std::sync::Arc;
use super::protocol::{
self, ClusterRequest, ClusterResponse, ShardReadResponse, ShardWriteAck, ShardWriteRequest,
};
use super::shard_store::{ShardId, ShardStore};
use super::state::{ClusterState, NodeStatus};
/// QUIC transport layer for inter-node communication.
///
/// Manages a QUIC endpoint for both sending and receiving cluster messages.
/// Uses self-signed TLS certificates generated at init time.
/// Maintains a connection pool to peer nodes.
pub struct QuicTransport {
endpoint: Endpoint,
/// Cached connections to peer nodes: node_id -> Connection
connections: Arc<DashMap<String, quinn::Connection>>,
local_node_id: String,
}
impl QuicTransport {
/// Create a new QUIC transport, binding to the specified address.
pub async fn new(bind_addr: SocketAddr, local_node_id: String) -> Result<Self> {
let (server_config, client_config) = Self::generate_tls_configs()?;
let mut endpoint = Endpoint::server(server_config, bind_addr)?;
endpoint.set_default_client_config(client_config);
Ok(Self {
endpoint,
connections: Arc::new(DashMap::new()),
local_node_id,
})
}
/// Get or establish a connection to a peer node.
pub async fn get_connection(
&self,
node_id: &str,
addr: SocketAddr,
) -> Result<quinn::Connection> {
// Check cache first
if let Some(conn) = self.connections.get(node_id) {
if conn.close_reason().is_none() {
return Ok(conn.clone());
}
// Connection is closed, remove from cache
drop(conn);
self.connections.remove(node_id);
}
// Establish new connection
let conn = self
.endpoint
.connect(addr, "smartstorage")?
.await?;
self.connections
.insert(node_id.to_string(), conn.clone());
Ok(conn)
}
/// Send a cluster request and receive the response.
pub async fn send_request(
&self,
conn: &quinn::Connection,
request: &ClusterRequest,
) -> Result<ClusterResponse> {
let (mut send, mut recv) = conn.open_bi().await?;
// Encode and send request
let encoded = protocol::encode_request(request)?;
send.write_all(&encoded).await?;
send.finish()?;
// Read response
let response_data = recv.read_to_end(64 * 1024 * 1024).await?; // 64MB max
let (response, _) = protocol::decode_response(&response_data)?;
Ok(response)
}
/// Send a shard write request with streaming data.
///
/// Sends the request header first, then streams the shard data bytes.
pub async fn send_shard_write(
&self,
conn: &quinn::Connection,
request: ShardWriteRequest,
shard_data: &[u8],
) -> Result<ShardWriteAck> {
let (mut send, mut recv) = conn.open_bi().await?;
// Send request header
let encoded = protocol::encode_request(&ClusterRequest::ShardWrite(request))?;
send.write_all(&encoded).await?;
// Stream shard data
send.write_all(shard_data).await?;
send.finish()?;
// Read ack
let response_data = recv.read_to_end(1024).await?;
let (response, _) = protocol::decode_response(&response_data)?;
match response {
ClusterResponse::ShardWriteAck(ack) => Ok(ack),
ClusterResponse::Error(e) => {
anyhow::bail!("Shard write error: {} - {}", e.code, e.message)
}
other => anyhow::bail!("Unexpected response to shard write: {:?}", other),
}
}
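// Stream layout for the exchange above (sketch):
//
//   client -> [u32 LE header len][bincode ShardWrite header][raw shard bytes...]
//   server -> [u32 LE header len][bincode ShardWriteAck]
//
// The receiver decodes the length-prefixed header first, then treats the
// remainder of the stream as exactly shard_data_length bytes of shard payload.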
/// Send a shard read request and receive the shard data.
///
/// Returns (shard_data, checksum).
pub async fn send_shard_read(
&self,
conn: &quinn::Connection,
request: &ClusterRequest,
) -> Result<Option<(Vec<u8>, u32)>> {
let (mut send, mut recv) = conn.open_bi().await?;
// Send request
let encoded = protocol::encode_request(request)?;
send.write_all(&encoded).await?;
send.finish()?;
// Read response header
let mut header_len_buf = [0u8; 4];
recv.read_exact(&mut header_len_buf).await?;
let header_len = u32::from_le_bytes(header_len_buf) as usize;
let mut header_buf = vec![0u8; header_len];
recv.read_exact(&mut header_buf).await?;
let response: ClusterResponse = bincode::deserialize(&header_buf)?;
match response {
ClusterResponse::ShardReadResponse(read_resp) => {
if !read_resp.found {
return Ok(None);
}
// Read shard data that follows
let mut shard_data = vec![0u8; read_resp.shard_data_length as usize];
recv.read_exact(&mut shard_data).await?;
Ok(Some((shard_data, read_resp.checksum)))
}
ClusterResponse::Error(e) => {
anyhow::bail!("Shard read error: {} - {}", e.code, e.message)
}
other => anyhow::bail!("Unexpected response to shard read: {:?}", other),
}
}
/// Accept incoming connections and dispatch to the handler.
pub async fn accept_loop(
self: Arc<Self>,
shard_stores: Vec<Arc<ShardStore>>,
cluster_state: Option<Arc<ClusterState>>,
mut shutdown: tokio::sync::watch::Receiver<bool>,
) {
loop {
tokio::select! {
incoming = self.endpoint.accept() => {
match incoming {
Some(incoming_conn) => {
let transport = self.clone();
let stores = shard_stores.clone();
let state = cluster_state.clone();
tokio::spawn(async move {
match incoming_conn.await {
Ok(conn) => {
transport.handle_connection(conn, stores, state).await;
}
Err(e) => {
tracing::error!("Failed to accept QUIC connection: {}", e);
}
}
});
}
None => break,
}
}
_ = shutdown.changed() => break,
}
}
}
/// Handle a single QUIC connection (may have multiple streams).
async fn handle_connection(
self: Arc<Self>,
conn: quinn::Connection,
shard_stores: Vec<Arc<ShardStore>>,
cluster_state: Option<Arc<ClusterState>>,
) {
loop {
match conn.accept_bi().await {
Ok((send, recv)) => {
let stores = shard_stores.clone();
let state = cluster_state.clone();
let transport = self.clone();
tokio::spawn(async move {
if let Err(e) = transport.handle_stream(send, recv, stores, state).await {
tracing::error!("Stream handler error: {}", e);
}
});
}
Err(quinn::ConnectionError::ApplicationClosed(_)) => break,
Err(e) => {
tracing::error!("Connection error: {}", e);
break;
}
}
}
}
/// Handle a single bidirectional stream (one request-response exchange).
async fn handle_stream(
self: Arc<Self>,
mut send: quinn::SendStream,
mut recv: quinn::RecvStream,
shard_stores: Vec<Arc<ShardStore>>,
cluster_state: Option<Arc<ClusterState>>,
) -> Result<()> {
// Read the full request (length-prefixed bincode + optional trailing data)
let raw = recv.read_to_end(64 * 1024 * 1024).await?; // 64MB max
let (request, header_len) = protocol::decode_request(&raw)?;
match request {
ClusterRequest::ShardWrite(write_req) => {
// Shard data follows the header in the raw buffer
let shard_data = &raw[header_len..];
let drive_index = write_req.drive_index;
let shard_id = ShardId {
bucket: write_req.bucket,
key: write_req.key,
chunk_index: write_req.chunk_index,
shard_index: write_req.shard_index,
};
let result = match Self::shard_store_for_drive(&shard_stores, drive_index) {
Ok(store) => store.write_shard(&shard_id, shard_data, write_req.checksum).await,
Err(error) => Err(error),
};
let ack = ShardWriteAck {
request_id: write_req.request_id,
success: result.is_ok(),
error: result.err().map(|e| e.to_string()),
};
let response = protocol::encode_response(&ClusterResponse::ShardWriteAck(ack))?;
send.write_all(&response).await?;
send.finish()?;
}
ClusterRequest::ShardRead(read_req) => {
let drive_index = read_req.drive_index;
let shard_id = ShardId {
bucket: read_req.bucket,
key: read_req.key,
chunk_index: read_req.chunk_index,
shard_index: read_req.shard_index,
};
let store = match Self::shard_store_for_drive(&shard_stores, drive_index) {
Ok(store) => store,
Err(error) => {
Self::send_error_response(&mut send, "InvalidDrive", error.to_string()).await?;
return Ok(());
}
};
match store.read_shard(&shard_id).await {
Ok((data, checksum)) => {
let header = ShardReadResponse {
request_id: read_req.request_id,
found: true,
shard_data_length: data.len() as u64,
checksum,
};
// Send header
let header_bytes = bincode::serialize(&ClusterResponse::ShardReadResponse(header))?;
send.write_all(&(header_bytes.len() as u32).to_le_bytes()).await?;
send.write_all(&header_bytes).await?;
// Send shard data
send.write_all(&data).await?;
send.finish()?;
}
Err(_) => {
let header = ShardReadResponse {
request_id: read_req.request_id,
found: false,
shard_data_length: 0,
checksum: 0,
};
let header_bytes = bincode::serialize(&ClusterResponse::ShardReadResponse(header))?;
send.write_all(&(header_bytes.len() as u32).to_le_bytes()).await?;
send.write_all(&header_bytes).await?;
send.finish()?;
}
}
}
ClusterRequest::ShardDelete(del_req) => {
let drive_index = del_req.drive_index;
let shard_id = ShardId {
bucket: del_req.bucket,
key: del_req.key,
chunk_index: del_req.chunk_index,
shard_index: del_req.shard_index,
};
let result = match Self::shard_store_for_drive(&shard_stores, drive_index) {
Ok(store) => store.delete_shard(&shard_id).await,
Err(error) => Err(error),
};
let ack = protocol::ClusterResponse::ShardDeleteAck(protocol::ShardDeleteAck {
request_id: del_req.request_id,
success: result.is_ok(),
});
let response = protocol::encode_response(&ack)?;
send.write_all(&response).await?;
send.finish()?;
}
ClusterRequest::ShardHead(head_req) => {
let drive_index = head_req.drive_index;
let shard_id = ShardId {
bucket: head_req.bucket,
key: head_req.key,
chunk_index: head_req.chunk_index,
shard_index: head_req.shard_index,
};
let store = match Self::shard_store_for_drive(&shard_stores, drive_index) {
Ok(store) => store,
Err(error) => {
Self::send_error_response(&mut send, "InvalidDrive", error.to_string()).await?;
return Ok(());
}
};
let resp = match store.head_shard(&shard_id).await {
Ok(Some(meta)) => protocol::ShardHeadResponse {
request_id: head_req.request_id,
found: true,
data_size: meta.data_size,
checksum: meta.checksum,
},
_ => protocol::ShardHeadResponse {
request_id: head_req.request_id,
found: false,
data_size: 0,
checksum: 0,
},
};
let response =
protocol::encode_response(&ClusterResponse::ShardHeadResponse(resp))?;
send.write_all(&response).await?;
send.finish()?;
}
ClusterRequest::JoinRequest(join_req) => {
let Some(state) = cluster_state else {
let err = protocol::ErrorResponse {
request_id: String::new(),
code: "ClusterDisabled".to_string(),
message: "Cluster state is not available".to_string(),
};
let response = protocol::encode_response(&ClusterResponse::Error(err))?;
send.write_all(&response).await?;
send.finish()?;
return Ok(());
};
let joining_node_id = join_req.node_info.node_id.clone();
state.add_node(join_req.node_info).await;
let topology = state.to_topology().await;
let node_drives: Vec<(String, u32)> = topology
.nodes
.iter()
.map(|node| (node.node_id.clone(), node.drive_count))
.collect();
let erasure_sets = super::placement::form_erasure_sets(
&node_drives,
topology.data_shards + topology.parity_shards,
);
state.set_erasure_sets(erasure_sets).await;
let response_topology = state.to_topology().await;
let response = protocol::encode_response(&ClusterResponse::JoinResponse(
protocol::JoinResponseMessage {
accepted: true,
topology: Some(response_topology.clone()),
error: None,
},
))?;
send.write_all(&response).await?;
send.finish()?;
self.broadcast_topology(&state, Some(response_topology), None, Some(&joining_node_id)).await;
}
ClusterRequest::Heartbeat(heartbeat) => {
let Some(state) = cluster_state else {
let err = protocol::ErrorResponse {
request_id: String::new(),
code: "ClusterDisabled".to_string(),
message: "Cluster state is not available".to_string(),
};
let response = protocol::encode_response(&ClusterResponse::Error(err))?;
send.write_all(&response).await?;
send.finish()?;
return Ok(());
};
let peer_node_id = heartbeat.node_id.clone();
let peer_topology_version = heartbeat.topology_version;
state.record_heartbeat(&heartbeat.node_id).await;
let local_topology_version = state.version().await;
let response = protocol::encode_response(&ClusterResponse::HeartbeatAck(
protocol::HeartbeatAckMessage {
node_id: state.local_node_id().to_string(),
timestamp: chrono::Utc::now().to_rfc3339(),
topology_version: local_topology_version,
},
))?;
send.write_all(&response).await?;
send.finish()?;
if local_topology_version > peer_topology_version {
self.broadcast_topology(&state, None, Some(&peer_node_id), None).await;
}
}
ClusterRequest::TopologySync(sync) => {
let Some(state) = cluster_state else {
let err = protocol::ErrorResponse {
request_id: String::new(),
code: "ClusterDisabled".to_string(),
message: "Cluster state is not available".to_string(),
};
let response = protocol::encode_response(&ClusterResponse::Error(err))?;
send.write_all(&response).await?;
send.finish()?;
return Ok(());
};
state.apply_topology(&sync.topology).await;
let response = protocol::encode_response(&ClusterResponse::TopologySyncAck(
protocol::TopologySyncAckMessage {
accepted: true,
current_version: state.version().await,
},
))?;
send.write_all(&response).await?;
send.finish()?;
}
_ => {
let err = protocol::ErrorResponse {
request_id: String::new(),
code: "NotImplemented".to_string(),
message: "This cluster operation is not yet implemented".to_string(),
};
let response = protocol::encode_response(&ClusterResponse::Error(err))?;
send.write_all(&response).await?;
send.finish()?;
}
}
Ok(())
}
fn shard_store_for_drive(
shard_stores: &[Arc<ShardStore>],
drive_index: u32,
) -> Result<Arc<ShardStore>> {
shard_stores
.get(drive_index as usize)
.cloned()
.ok_or_else(|| anyhow::anyhow!("Drive {} not found", drive_index))
}
async fn send_error_response(
send: &mut quinn::SendStream,
code: &str,
message: String,
) -> Result<()> {
let err = protocol::ErrorResponse {
request_id: String::new(),
code: code.to_string(),
message,
};
let response = protocol::encode_response(&ClusterResponse::Error(err))?;
send.write_all(&response).await?;
send.finish()?;
Ok(())
}
async fn broadcast_topology(
&self,
state: &Arc<ClusterState>,
topology: Option<protocol::ClusterTopology>,
target_node_id: Option<&str>,
skip_node_id: Option<&str>,
) {
let topology = match topology {
Some(topology) => topology,
None => state.to_topology().await,
};
let nodes = state.all_nodes().await;
for node in nodes {
if node.info.node_id == state.local_node_id() {
continue;
}
if let Some(target_node_id) = target_node_id {
if node.info.node_id != target_node_id {
continue;
}
}
if matches!(skip_node_id, Some(skip_node_id) if node.info.node_id == skip_node_id) {
continue;
}
if node.status != NodeStatus::Online {
continue;
}
let addr = match node.info.quic_addr.parse() {
Ok(addr) => addr,
Err(error) => {
tracing::warn!(node = %node.info.node_id, error = %error, "Skipping topology sync for invalid peer address");
continue;
}
};
let conn = match self.get_connection(&node.info.node_id, addr).await {
Ok(conn) => conn,
Err(error) => {
tracing::warn!(node = %node.info.node_id, error = %error, "Failed to connect for topology sync");
continue;
}
};
let request = ClusterRequest::TopologySync(protocol::TopologySyncMessage {
topology: topology.clone(),
});
if let Err(error) = self.send_request(&conn, &request).await {
tracing::warn!(node = %node.info.node_id, error = %error, "Failed to send topology sync");
}
}
}
/// Generate a self-signed TLS certificate and build the QUIC server/client configs for cluster-internal communication.
fn generate_tls_configs() -> Result<(QuinnServerConfig, ClientConfig)> {
// Generate self-signed certificate
let cert = rcgen::generate_simple_self_signed(vec!["smartstorage".to_string()])?;
let cert_der = CertificateDer::from(cert.cert);
let key_der = PrivateKeyDer::Pkcs8(PrivatePkcs8KeyDer::from(cert.key_pair.serialize_der()));
// Server config
let mut server_crypto = rustls::ServerConfig::builder()
.with_no_client_auth()
.with_single_cert(vec![cert_der.clone()], key_der.clone_key())?;
server_crypto.alpn_protocols = vec![b"smartstorage".to_vec()];
let server_config = QuinnServerConfig::with_crypto(Arc::new(
quinn::crypto::rustls::QuicServerConfig::try_from(server_crypto)?,
));
// Client config: skip server certificate verification (cluster-internal)
let mut client_crypto = rustls::ClientConfig::builder()
.dangerous()
.with_custom_certificate_verifier(Arc::new(SkipServerVerification))
.with_no_client_auth();
client_crypto.alpn_protocols = vec![b"smartstorage".to_vec()];
let client_config = ClientConfig::new(Arc::new(
quinn::crypto::rustls::QuicClientConfig::try_from(client_crypto)?,
));
Ok((server_config, client_config))
}
/// Close the QUIC endpoint gracefully.
pub fn close(&self) {
self.endpoint
.close(quinn::VarInt::from_u32(0), b"shutdown");
}
/// Get the local node ID.
pub fn local_node_id(&self) -> &str {
&self.local_node_id
}
}
/// Certificate verifier that skips verification (for cluster-internal self-signed certs).
#[derive(Debug)]
struct SkipServerVerification;
impl rustls::client::danger::ServerCertVerifier for SkipServerVerification {
fn verify_server_cert(
&self,
_end_entity: &CertificateDer<'_>,
_intermediates: &[CertificateDer<'_>],
_server_name: &rustls::pki_types::ServerName<'_>,
_ocsp_response: &[u8],
_now: rustls::pki_types::UnixTime,
) -> Result<rustls::client::danger::ServerCertVerified, rustls::Error> {
Ok(rustls::client::danger::ServerCertVerified::assertion())
}
fn verify_tls12_signature(
&self,
_message: &[u8],
_cert: &CertificateDer<'_>,
_dss: &rustls::DigitallySignedStruct,
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
Ok(rustls::client::danger::HandshakeSignatureValid::assertion())
}
fn verify_tls13_signature(
&self,
_message: &[u8],
_cert: &CertificateDer<'_>,
_dss: &rustls::DigitallySignedStruct,
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
Ok(rustls::client::danger::HandshakeSignatureValid::assertion())
}
fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
vec![
rustls::SignatureScheme::RSA_PKCS1_SHA256,
rustls::SignatureScheme::RSA_PKCS1_SHA384,
rustls::SignatureScheme::RSA_PKCS1_SHA512,
rustls::SignatureScheme::ECDSA_NISTP256_SHA256,
rustls::SignatureScheme::ECDSA_NISTP384_SHA384,
rustls::SignatureScheme::ED25519,
rustls::SignatureScheme::RSA_PSS_SHA256,
rustls::SignatureScheme::RSA_PSS_SHA384,
rustls::SignatureScheme::RSA_PSS_SHA512,
]
}
}
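// Illustrative usage (a sketch, not part of the diff): ping a peer over the
// transport. The peer ID and address are hypothetical; drive_states is left
// empty for brevity.
#[allow(dead_code)]
async fn example_heartbeat(transport: &QuicTransport, peer_addr: SocketAddr) -> Result<()> {
    let conn = transport.get_connection("node-b", peer_addr).await?;
    let request = ClusterRequest::Heartbeat(protocol::HeartbeatMessage {
        node_id: transport.local_node_id().to_string(),
        timestamp: chrono::Utc::now().to_rfc3339(),
        drive_states: Vec::new(),
        topology_version: 0,
    });
    match transport.send_request(&conn, &request).await? {
        ClusterResponse::HeartbeatAck(ack) => {
            tracing::info!(peer = %ack.node_id, version = ack.topology_version, "peer is alive");
        }
        other => tracing::warn!(?other, "unexpected heartbeat response"),
    }
    Ok(())
}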
+226
@@ -0,0 +1,226 @@
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use tokio::fs;
use tokio::io::AsyncWriteExt;
/// Identifies a specific shard on disk.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct ShardId {
pub bucket: String,
pub key: String,
pub chunk_index: u32,
pub shard_index: u32,
}
/// Per-shard metadata stored alongside shard data.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardMeta {
pub shard_index: u32,
pub chunk_index: u32,
pub data_size: u64,
pub checksum: u32, // crc32c
}
/// Manages shard storage on a single drive.
///
/// Layout on disk:
/// ```text
/// {base_path}/.smartstorage/data/{bucket}/{key_prefix}/{key}/
/// chunk-{N}/shard-{M}.dat (shard data)
/// chunk-{N}/shard-{M}.meta (shard metadata JSON)
/// ```
pub struct ShardStore {
base_path: PathBuf,
}
impl ShardStore {
pub fn new(base_path: PathBuf) -> Self {
Self { base_path }
}
/// Write a shard to disk atomically (write to temp file, then rename).
pub async fn write_shard(
&self,
shard_id: &ShardId,
data: &[u8],
checksum: u32,
) -> Result<()> {
let shard_path = self.shard_data_path(shard_id);
let meta_path = self.shard_meta_path(shard_id);
// Ensure parent directory exists
if let Some(parent) = shard_path.parent() {
fs::create_dir_all(parent).await?;
}
// Write data atomically via temp file + rename
let temp_data_path = shard_path.with_extension("dat.tmp");
{
let mut file = fs::File::create(&temp_data_path).await?;
file.write_all(data).await?;
file.flush().await?;
file.sync_all().await?;
}
fs::rename(&temp_data_path, &shard_path).await?;
// Write metadata
let meta = ShardMeta {
shard_index: shard_id.shard_index,
chunk_index: shard_id.chunk_index,
data_size: data.len() as u64,
checksum,
};
let meta_json = serde_json::to_string(&meta)?;
let temp_meta_path = meta_path.with_extension("meta.tmp");
fs::write(&temp_meta_path, meta_json).await?;
fs::rename(&temp_meta_path, &meta_path).await?;
Ok(())
}
/// Read a shard's data from disk.
pub async fn read_shard(&self, shard_id: &ShardId) -> Result<(Vec<u8>, u32)> {
let shard_path = self.shard_data_path(shard_id);
let meta_path = self.shard_meta_path(shard_id);
let data = fs::read(&shard_path).await?;
let meta_json = fs::read_to_string(&meta_path).await?;
let meta: ShardMeta = serde_json::from_str(&meta_json)?;
Ok((data, meta.checksum))
}
/// Check if a shard exists and return its metadata.
pub async fn head_shard(&self, shard_id: &ShardId) -> Result<Option<ShardMeta>> {
let meta_path = self.shard_meta_path(shard_id);
if !meta_path.exists() {
return Ok(None);
}
let meta_json = fs::read_to_string(&meta_path).await?;
let meta: ShardMeta = serde_json::from_str(&meta_json)?;
Ok(Some(meta))
}
/// Delete a shard and its metadata.
pub async fn delete_shard(&self, shard_id: &ShardId) -> Result<()> {
let shard_path = self.shard_data_path(shard_id);
let meta_path = self.shard_meta_path(shard_id);
let _ = fs::remove_file(&shard_path).await;
let _ = fs::remove_file(&meta_path).await;
// Clean up empty parent directories
self.cleanup_empty_dirs(shard_id).await;
Ok(())
}
/// List all shard IDs for a given bucket and key (across all chunks).
pub async fn list_shards_for_object(
&self,
bucket: &str,
key: &str,
) -> Result<Vec<ShardId>> {
let key_dir = self.key_dir(bucket, key);
if !key_dir.exists() {
return Ok(Vec::new());
}
let mut result = Vec::new();
let mut entries = fs::read_dir(&key_dir).await?;
while let Some(entry) = entries.next_entry().await? {
let name = entry.file_name().to_string_lossy().to_string();
if !name.starts_with("chunk-") || !entry.metadata().await?.is_dir() {
continue;
}
let chunk_index: u32 = match name.strip_prefix("chunk-").and_then(|s| s.parse().ok()) {
Some(idx) => idx,
None => continue,
};
let mut chunk_entries = fs::read_dir(entry.path()).await?;
while let Some(shard_entry) = chunk_entries.next_entry().await? {
let shard_name = shard_entry.file_name().to_string_lossy().to_string();
if shard_name.starts_with("shard-") && shard_name.ends_with(".dat") {
let shard_index: u32 = match shard_name
.strip_prefix("shard-")
.and_then(|s| s.strip_suffix(".dat"))
.and_then(|s| s.parse().ok())
{
Some(idx) => idx,
None => continue,
};
result.push(ShardId {
bucket: bucket.to_string(),
key: key.to_string(),
chunk_index,
shard_index,
});
}
}
}
result.sort_by(|a, b| {
a.chunk_index
.cmp(&b.chunk_index)
.then(a.shard_index.cmp(&b.shard_index))
});
Ok(result)
}
// ============================
// Path helpers
// ============================
fn data_root(&self) -> PathBuf {
self.base_path.join(".smartstorage").join("data")
}
fn key_prefix(key: &str) -> String {
// Use the low byte of an xxh64 hash, rendered as 2 hex chars, for 256-way directory fan-out
let hash = xxhash_rust::xxh64::xxh64(key.as_bytes(), 0);
format!("{:02x}", hash & 0xFF)
}
fn key_dir(&self, bucket: &str, key: &str) -> PathBuf {
self.data_root()
.join(bucket)
.join(Self::key_prefix(key))
.join(key)
}
fn chunk_dir(&self, shard_id: &ShardId) -> PathBuf {
self.key_dir(&shard_id.bucket, &shard_id.key)
.join(format!("chunk-{}", shard_id.chunk_index))
}
fn shard_data_path(&self, shard_id: &ShardId) -> PathBuf {
self.chunk_dir(shard_id)
.join(format!("shard-{}.dat", shard_id.shard_index))
}
fn shard_meta_path(&self, shard_id: &ShardId) -> PathBuf {
self.chunk_dir(shard_id)
.join(format!("shard-{}.meta", shard_id.shard_index))
}
async fn cleanup_empty_dirs(&self, shard_id: &ShardId) {
// Try to remove chunk dir if empty
let chunk_dir = self.chunk_dir(shard_id);
let _ = fs::remove_dir(&chunk_dir).await; // fails silently if not empty
// Try to remove key dir if empty
let key_dir = self.key_dir(&shard_id.bucket, &shard_id.key);
let _ = fs::remove_dir(&key_dir).await;
// Try to remove prefix dir if empty
if let Some(prefix_dir) = key_dir.parent() {
let _ = fs::remove_dir(prefix_dir).await;
}
}
}
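// Sketch (illustration only): round-trip a shard through the store. The
// scratch path is hypothetical, and the crc32c crate is assumed to be
// available for the checksum convention used above.
#[cfg(test)]
mod shard_store_sketch {
    use super::*;

    #[tokio::test]
    async fn write_read_delete_round_trip() -> Result<()> {
        let store = ShardStore::new(PathBuf::from("/tmp/smartstorage-shard-sketch"));
        let id = ShardId {
            bucket: "photos".to_string(),
            key: "2026/cat.jpg".to_string(),
            chunk_index: 0,
            shard_index: 3,
        };
        let data = vec![0xAB_u8; 1024];
        let checksum = crc32c::crc32c(&data);
        store.write_shard(&id, &data, checksum).await?;
        let (read_back, stored_checksum) = store.read_shard(&id).await?;
        assert_eq!(read_back, data);
        assert_eq!(stored_checksum, checksum);
        store.delete_shard(&id).await?;
        assert!(store.head_shard(&id).await?.is_none());
        Ok(())
    }
}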
+343
@@ -0,0 +1,343 @@
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::RwLock;
use super::placement::{DriveLocation, ErasureSet};
use super::persistence;
use super::protocol::{ClusterTopology, ErasureSetInfo, DriveLocationInfo, NodeInfo};
/// Node status for tracking liveness.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NodeStatus {
Online,
Suspect, // missed 2+ heartbeats
Offline, // missed 5+ heartbeats
}
/// Tracked state for a peer node.
#[derive(Debug, Clone)]
pub struct NodeState {
pub info: NodeInfo,
pub status: NodeStatus,
pub missed_heartbeats: u32,
pub last_heartbeat: chrono::DateTime<chrono::Utc>,
}
/// Shared cluster state, protected by RwLock for concurrent access.
pub struct ClusterState {
inner: Arc<RwLock<ClusterStateInner>>,
local_node_id: String,
topology_path: Option<PathBuf>,
}
struct ClusterStateInner {
cluster_id: String,
version: u64,
nodes: HashMap<String, NodeState>,
erasure_sets: Vec<ErasureSet>,
data_shards: usize,
parity_shards: usize,
}
impl ClusterState {
pub fn new(
local_node_id: String,
cluster_id: String,
data_shards: usize,
parity_shards: usize,
topology_path: Option<PathBuf>,
) -> Self {
Self {
inner: Arc::new(RwLock::new(ClusterStateInner {
cluster_id,
version: 0,
nodes: HashMap::new(),
erasure_sets: Vec::new(),
data_shards,
parity_shards,
})),
local_node_id,
topology_path,
}
}
pub fn local_node_id(&self) -> &str {
&self.local_node_id
}
pub async fn cluster_id(&self) -> String {
self.inner.read().await.cluster_id.clone()
}
/// Register a node in the cluster.
pub async fn add_node(&self, info: NodeInfo) {
{
let mut inner = self.inner.write().await;
let node_id = info.node_id.clone();
inner.nodes.insert(
node_id,
NodeState {
info,
status: NodeStatus::Online,
missed_heartbeats: 0,
last_heartbeat: chrono::Utc::now(),
},
);
inner.version += 1;
}
self.persist_topology_snapshot().await;
}
/// Remove a node from the cluster.
pub async fn remove_node(&self, node_id: &str) {
{
let mut inner = self.inner.write().await;
inner.nodes.remove(node_id);
inner.version += 1;
}
self.persist_topology_snapshot().await;
}
/// Update heartbeat for a node (reset missed count).
pub async fn record_heartbeat(&self, node_id: &str) {
let mut inner = self.inner.write().await;
if let Some(node) = inner.nodes.get_mut(node_id) {
node.missed_heartbeats = 0;
node.status = NodeStatus::Online;
node.last_heartbeat = chrono::Utc::now();
}
}
/// Increment missed heartbeat count for all nodes, updating status.
/// Called by the heartbeat checker when a round completes.
pub async fn tick_heartbeats(&self, responded_nodes: &[String]) -> Vec<(String, NodeStatus)> {
let mut inner = self.inner.write().await;
let mut status_changes = Vec::new();
for (node_id, node) in inner.nodes.iter_mut() {
if *node_id == self.local_node_id {
continue; // Don't track self
}
if responded_nodes.contains(node_id) {
node.missed_heartbeats = 0;
if node.status != NodeStatus::Online {
node.status = NodeStatus::Online;
status_changes.push((node_id.clone(), NodeStatus::Online));
}
} else {
node.missed_heartbeats += 1;
let new_status = if node.missed_heartbeats >= 5 {
NodeStatus::Offline
} else if node.missed_heartbeats >= 2 {
NodeStatus::Suspect
} else {
NodeStatus::Online
};
if new_status != node.status {
node.status = new_status.clone();
status_changes.push((node_id.clone(), new_status));
}
}
}
status_changes
}
/// Set erasure sets (typically done once during cluster formation).
pub async fn set_erasure_sets(&self, sets: Vec<ErasureSet>) {
{
let mut inner = self.inner.write().await;
inner.erasure_sets = sets;
inner.version += 1;
}
self.persist_topology_snapshot().await;
}
/// Get the erasure set for a given object based on consistent hashing.
pub async fn get_erasure_set_for_object(&self, bucket: &str, key: &str) -> Option<ErasureSet> {
let inner = self.inner.read().await;
if inner.erasure_sets.is_empty() {
return None;
}
let set_idx = super::placement::erasure_set_for_object(
bucket,
key,
inner.erasure_sets.len() as u32,
);
inner.erasure_sets.get(set_idx as usize).cloned()
}
/// Get all erasure sets.
pub async fn erasure_sets(&self) -> Vec<ErasureSet> {
self.inner.read().await.erasure_sets.clone()
}
/// Get current topology version.
pub async fn version(&self) -> u64 {
self.inner.read().await.version
}
/// Get all online node IDs (excluding self).
pub async fn online_peers(&self) -> Vec<NodeInfo> {
let inner = self.inner.read().await;
inner
.nodes
.values()
.filter(|n| n.status == NodeStatus::Online && n.info.node_id != self.local_node_id)
.map(|n| n.info.clone())
.collect()
}
/// Get all nodes.
pub async fn all_nodes(&self) -> Vec<NodeState> {
self.inner.read().await.nodes.values().cloned().collect()
}
/// Get node info by ID.
pub async fn get_node(&self, node_id: &str) -> Option<NodeInfo> {
self.inner
.read()
.await
.nodes
.get(node_id)
.map(|n| n.info.clone())
}
/// Get offline node IDs.
pub async fn offline_nodes(&self) -> Vec<String> {
self.inner
.read()
.await
.nodes
.values()
.filter(|n| n.status == NodeStatus::Offline)
.map(|n| n.info.node_id.clone())
.collect()
}
/// Check if a majority of nodes are reachable (for split-brain prevention).
pub async fn has_majority(&self) -> bool {
let inner = self.inner.read().await;
let total = inner.nodes.len();
if total == 0 {
return true;
}
let online = inner
.nodes
.values()
.filter(|n| n.status == NodeStatus::Online)
.count();
online > total / 2
}
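// Example for has_majority above: with 5 registered nodes, online > 5 / 2
// requires at least 3 online nodes, so a partitioned minority of 2 sees no
// majority.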
/// Export the current topology as a protocol message.
pub async fn to_topology(&self) -> ClusterTopology {
let inner = self.inner.read().await;
ClusterTopology {
version: inner.version,
cluster_id: inner.cluster_id.clone(),
nodes: inner.nodes.values().map(|n| n.info.clone()).collect(),
erasure_sets: inner
.erasure_sets
.iter()
.map(|set| ErasureSetInfo {
set_id: set.set_id,
drives: set
.drives
.iter()
.map(|d| DriveLocationInfo {
node_id: d.node_id.clone(),
drive_index: d.drive_index,
})
.collect(),
})
.collect(),
data_shards: inner.data_shards,
parity_shards: inner.parity_shards,
}
}
/// Import topology from a protocol message (e.g., received from a peer during join).
pub async fn apply_topology(&self, topology: &ClusterTopology) {
{
let mut inner = self.inner.write().await;
// Only apply if newer and from the same cluster lineage. A node that has not yet
// joined any topology may adopt the seed cluster ID during its first join.
if topology.version <= inner.version {
return;
}
if topology.cluster_id != inner.cluster_id {
if inner.nodes.is_empty() {
inner.cluster_id = topology.cluster_id.clone();
} else {
return;
}
}
inner.version = topology.version;
inner.data_shards = topology.data_shards;
inner.parity_shards = topology.parity_shards;
let now = chrono::Utc::now();
for node_info in &topology.nodes {
let existing_status = inner.nodes.get(&node_info.node_id).map(|node| node.status.clone());
let existing_missed_heartbeats = inner
.nodes
.get(&node_info.node_id)
.map(|node| node.missed_heartbeats);
let existing_last_heartbeat = inner
.nodes
.get(&node_info.node_id)
.map(|node| node.last_heartbeat);
inner.nodes.insert(
node_info.node_id.clone(),
NodeState {
info: node_info.clone(),
status: existing_status.unwrap_or(NodeStatus::Online),
missed_heartbeats: existing_missed_heartbeats.unwrap_or(0),
last_heartbeat: existing_last_heartbeat.unwrap_or(now),
},
);
}
inner.nodes.retain(|node_id, _| topology.nodes.iter().any(|node| &node.node_id == node_id));
// Update erasure sets
inner.erasure_sets = topology
.erasure_sets
.iter()
.map(|set| ErasureSet {
set_id: set.set_id,
drives: set
.drives
.iter()
.map(|d| DriveLocation {
node_id: d.node_id.clone(),
drive_index: d.drive_index,
})
.collect(),
})
.collect();
}
// Persist outside the block above so the write guard is dropped first;
// persist_topology_snapshot re-acquires the lock via to_topology().
self.persist_topology_snapshot().await;
}
async fn persist_topology_snapshot(&self) {
let Some(path) = &self.topology_path else {
return;
};
let topology = self.to_topology().await;
if let Err(error) = persistence::persist_topology(path, &topology).await {
tracing::warn!(error = %error, "Failed to persist cluster topology snapshot");
}
}
}
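// Illustrative usage (a sketch, not part of the diff): register a hypothetical
// peer and walk it through the Suspect/Offline thresholds via tick_heartbeats.
#[allow(dead_code)]
async fn example_liveness(state: &ClusterState) {
    state
        .add_node(NodeInfo {
            node_id: "node-b".to_string(),
            quic_addr: "10.0.0.2:9000".to_string(),
            s3_addr: "10.0.0.2:9100".to_string(),
            drive_count: 4,
            status: "online".to_string(),
            version: "6.5.0".to_string(),
        })
        .await;
    // Five silent heartbeat rounds: Online -> Suspect (2 misses) -> Offline (5).
    for _ in 0..5 {
        for (node_id, status) in state.tick_heartbeats(&[]).await {
            tracing::info!(%node_id, ?status, "node status changed");
        }
    }
}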
+9 -2
@@ -1,5 +1,7 @@
use serde::{Deserialize, Serialize};
use crate::cluster::config::ClusterConfig;
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SmartStorageConfig {
@@ -10,6 +12,8 @@ pub struct SmartStorageConfig {
pub logging: LoggingConfig,
pub limits: LimitsConfig,
pub multipart: MultipartConfig,
#[serde(default)]
pub cluster: Option<ClusterConfig>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -41,11 +45,14 @@ pub struct AuthConfig {
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Credential {
pub access_key_id: String,
pub secret_access_key: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bucket_name: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub region: Option<String>,
}
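// Example credential entry (illustrative values) as serialized with the
// camelCase convention; the optional fields are omitted when absent:
// { "accessKeyId": "AKIAEXAMPLE", "secretAccessKey": "example-secret",
//   "bucketName": "photos", "region": "us-east-1" }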
#[derive(Debug, Clone, Serialize, Deserialize)]
+1
@@ -1,5 +1,6 @@
mod action;
mod auth;
mod cluster;
mod config;
mod management;
mod policy;
+359 -14
@@ -4,8 +4,10 @@ use serde_json::Value;
use std::io::Write;
use tokio::io::{AsyncBufReadExt, BufReader};
use crate::config::Credential;
use crate::config::SmartStorageConfig;
use crate::server::StorageServer;
use crate::storage::BucketExport;
#[derive(Deserialize)]
struct IpcRequest {
@@ -90,17 +92,15 @@ pub async fn management_loop() -> Result<()> {
config: SmartStorageConfig,
}
match serde_json::from_value::<StartParams>(req.params) {
Ok(params) => match StorageServer::start(params.config).await {
Ok(s) => {
server = Some(s);
send_response(id, serde_json::json!({}));
}
Err(e) => {
send_error(id, format!("Failed to start server: {}", e));
}
},
Err(e) => {
send_error(id, format!("Invalid start params: {}", e));
}
}
@@ -125,10 +125,7 @@ pub async fn management_loop() -> Result<()> {
send_response(id, serde_json::json!({}));
}
Err(e) => {
send_error(id, format!("Failed to create bucket: {}", e));
}
}
} else {
@@ -140,6 +137,354 @@ pub async fn management_loop() -> Result<()> {
}
}
}
"createBucketTenant" => {
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct CreateBucketTenantParams {
bucket_name: String,
access_key_id: String,
secret_access_key: String,
region: Option<String>,
}
match serde_json::from_value::<CreateBucketTenantParams>(req.params) {
Ok(params) => {
if let Some(ref s) = server {
let credential = Credential {
access_key_id: params.access_key_id,
secret_access_key: params.secret_access_key,
bucket_name: Some(params.bucket_name.clone()),
region: params.region,
};
match s
.create_bucket_tenant(&params.bucket_name, credential)
.await
{
Ok(credential) => match serde_json::to_value(credential) {
Ok(value) => send_response(id, value),
Err(error) => send_error(
id,
format!("Failed to serialize bucket tenant: {}", error),
),
},
Err(error) => send_error(
id,
format!("Failed to create bucket tenant: {}", error),
),
}
} else {
send_error(id, "Server not started".to_string());
}
}
Err(error) => {
send_error(id, format!("Invalid createBucketTenant params: {}", error));
}
}
}
"deleteBucketTenant" => {
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct DeleteBucketTenantParams {
bucket_name: String,
access_key_id: Option<String>,
}
match serde_json::from_value::<DeleteBucketTenantParams>(req.params) {
Ok(params) => {
if let Some(ref s) = server {
match s
.delete_bucket_tenant(
&params.bucket_name,
params.access_key_id.as_deref(),
)
.await
{
Ok(()) => send_response(id, serde_json::json!({})),
Err(error) => send_error(
id,
format!("Failed to delete bucket tenant: {}", error),
),
}
} else {
send_error(id, "Server not started".to_string());
}
}
Err(error) => {
send_error(id, format!("Invalid deleteBucketTenant params: {}", error));
}
}
}
"rotateBucketTenantCredentials" => {
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct RotateBucketTenantCredentialsParams {
bucket_name: String,
access_key_id: String,
secret_access_key: String,
region: Option<String>,
}
match serde_json::from_value::<RotateBucketTenantCredentialsParams>(req.params) {
Ok(params) => {
if let Some(ref s) = server {
let credential = Credential {
access_key_id: params.access_key_id,
secret_access_key: params.secret_access_key,
bucket_name: Some(params.bucket_name.clone()),
region: params.region,
};
match s
.rotate_bucket_tenant_credentials(&params.bucket_name, credential)
.await
{
Ok(credential) => match serde_json::to_value(credential) {
Ok(value) => send_response(id, value),
Err(error) => send_error(
id,
format!("Failed to serialize bucket tenant: {}", error),
),
},
Err(error) => send_error(
id,
format!(
"Failed to rotate bucket tenant credentials: {}",
error
),
),
}
} else {
send_error(id, "Server not started".to_string());
}
}
Err(error) => {
send_error(
id,
format!("Invalid rotateBucketTenantCredentials params: {}", error),
);
}
}
}
"listBucketTenants" => {
if let Some(ref s) = server {
match serde_json::to_value(s.list_bucket_tenants().await) {
Ok(value) => send_response(id, value),
Err(error) => {
send_error(id, format!("Failed to serialize bucket tenants: {}", error))
}
}
} else {
send_error(id, "Server not started".to_string());
}
}
"getBucketTenantCredential" => {
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct GetBucketTenantCredentialParams {
bucket_name: String,
}
match serde_json::from_value::<GetBucketTenantCredentialParams>(req.params) {
Ok(params) => {
if let Some(ref s) = server {
match s.get_bucket_tenant_credential(&params.bucket_name).await {
Some(credential) => match serde_json::to_value(credential) {
Ok(value) => send_response(id, value),
Err(error) => send_error(
id,
format!("Failed to serialize bucket tenant: {}", error),
),
},
None => send_error(
id,
format!(
"No bucket tenant credential exists for bucket {}",
params.bucket_name
),
),
}
} else {
send_error(id, "Server not started".to_string());
}
}
Err(error) => {
send_error(
id,
format!("Invalid getBucketTenantCredential params: {}", error),
);
}
}
}
"exportBucket" => {
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct ExportBucketParams {
bucket_name: String,
}
match serde_json::from_value::<ExportBucketParams>(req.params) {
Ok(params) => {
if let Some(ref s) = server {
match s.store().export_bucket(&params.bucket_name).await {
Ok(export) => match serde_json::to_value(export) {
Ok(value) => send_response(id, value),
Err(error) => send_error(
id,
format!("Failed to serialize bucket export: {}", error),
),
},
Err(error) => {
send_error(id, format!("Failed to export bucket: {}", error))
}
}
} else {
send_error(id, "Server not started".to_string());
}
}
Err(error) => {
send_error(id, format!("Invalid exportBucket params: {}", error));
}
}
}
"importBucket" => {
#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct ImportBucketParams {
bucket_name: String,
source: BucketExport,
}
match serde_json::from_value::<ImportBucketParams>(req.params) {
Ok(params) => {
if let Some(ref s) = server {
match s
.store()
.import_bucket(&params.bucket_name, params.source)
.await
{
Ok(()) => send_response(id, serde_json::json!({})),
Err(error) => {
send_error(id, format!("Failed to import bucket: {}", error))
}
}
} else {
send_error(id, "Server not started".to_string());
}
}
Err(error) => {
send_error(id, format!("Invalid importBucket params: {}", error));
}
}
}
"getStorageStats" => {
if let Some(ref s) = server {
match s.store().get_storage_stats().await {
Ok(stats) => match serde_json::to_value(stats) {
Ok(value) => send_response(id, value),
Err(error) => {
send_error(
id,
format!("Failed to serialize storage stats: {}", error),
);
}
},
Err(error) => {
send_error(id, format!("Failed to get storage stats: {}", error));
}
}
} else {
send_error(id, "Server not started".to_string());
}
}
"listBucketSummaries" => {
if let Some(ref s) = server {
match s.store().list_bucket_summaries().await {
Ok(summaries) => match serde_json::to_value(summaries) {
Ok(value) => send_response(id, value),
Err(error) => {
send_error(
id,
format!("Failed to serialize bucket summaries: {}", error),
);
}
},
Err(error) => {
send_error(id, format!("Failed to list bucket summaries: {}", error));
}
}
} else {
send_error(id, "Server not started".to_string());
}
}
"listCredentials" => {
if let Some(ref s) = server {
match serde_json::to_value(s.list_credentials().await) {
Ok(value) => send_response(id, value),
Err(error) => {
send_error(id, format!("Failed to serialize credentials: {}", error));
}
}
} else {
send_error(id, "Server not started".to_string());
}
}
"replaceCredentials" => {
#[derive(Deserialize)]
struct ReplaceCredentialsParams {
credentials: Vec<Credential>,
}
match serde_json::from_value::<ReplaceCredentialsParams>(req.params) {
Ok(params) => {
if let Some(ref s) = server {
match s.replace_credentials(params.credentials).await {
Ok(()) => {
send_response(id, serde_json::json!({}));
}
Err(error) => {
send_error(
id,
format!("Failed to replace credentials: {}", error),
);
}
}
} else {
send_error(id, "Server not started".to_string());
}
}
Err(error) => {
send_error(id, format!("Invalid replaceCredentials params: {}", error));
}
}
}
"getClusterHealth" => {
if let Some(ref s) = server {
match s.store().get_cluster_health().await {
Ok(health) => match serde_json::to_value(health) {
Ok(value) => send_response(id, value),
Err(error) => {
send_error(
id,
format!("Failed to serialize cluster health: {}", error),
);
}
},
Err(error) => {
send_error(id, format!("Failed to get cluster health: {}", error));
}
}
} else {
send_error(id, "Server not started".to_string());
}
}
"clusterStatus" => {
send_response(
id,
serde_json::json!({
"status": "ok",
"message": "Cluster status endpoint ready"
}),
);
}
_ => {
send_error(id, format!("Unknown method: {}", method));
}
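// Sketch of one IPC exchange over stdio (request shape per IpcRequest above;
// the response envelope here is an assumption for illustration, and all
// values are hypothetical):
// -> {"id": 7, "method": "createBucketTenant",
//      "params": {"bucketName": "photos", "accessKeyId": "tenant-key", "secretAccessKey": "tenant-secret"}}
// <- {"id": 7, "result": {"accessKeyId": "tenant-key", "secretAccessKey": "tenant-secret", "bucketName": "photos"}}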
+693 -71
File diff suppressed because it is too large
+817 -21
@@ -8,8 +8,10 @@ use std::collections::HashMap;
use std::path::{Path, PathBuf};
use tokio::fs;
use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, BufWriter};
use tokio::sync::RwLock;
use uuid::Uuid;
use crate::cluster::coordinator::DistributedStore;
use crate::error::StorageError;
// ============================
@@ -63,6 +65,152 @@ pub struct BucketInfo {
pub creation_date: DateTime<Utc>,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct BucketSummary {
pub name: String,
pub object_count: u64,
pub total_size_bytes: u64,
#[serde(skip_serializing_if = "Option::is_none")]
pub creation_date: Option<i64>,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct StorageLocationSummary {
pub path: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub total_bytes: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub available_bytes: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub used_bytes: Option<u64>,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct StorageStats {
pub bucket_count: u64,
pub total_object_count: u64,
pub total_storage_bytes: u64,
pub buckets: Vec<BucketSummary>,
pub storage_directory: String,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub storage_locations: Vec<StorageLocationSummary>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct BucketExport {
pub format: String,
pub bucket_name: String,
pub exported_at: i64,
pub objects: Vec<BucketExportObject>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct BucketExportObject {
pub key: String,
pub size: u64,
pub md5: String,
pub metadata: HashMap<String, String>,
pub data_hex: String,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ClusterPeerHealth {
pub node_id: String,
pub status: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub quic_address: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub s3_address: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub drive_count: Option<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub last_heartbeat: Option<i64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub missed_heartbeats: Option<u32>,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ClusterDriveHealth {
pub index: u32,
pub path: String,
pub status: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub total_bytes: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub used_bytes: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub available_bytes: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub error_count: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub last_error: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub last_check: Option<i64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub erasure_set_id: Option<u32>,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ClusterErasureHealth {
pub data_shards: usize,
pub parity_shards: usize,
pub chunk_size_bytes: usize,
pub total_shards: usize,
pub read_quorum: usize,
pub write_quorum: usize,
pub erasure_set_count: usize,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ClusterRepairHealth {
pub active: bool,
#[serde(skip_serializing_if = "Option::is_none")]
pub scan_interval_ms: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub last_run_started_at: Option<i64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub last_run_completed_at: Option<i64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub last_duration_ms: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub shards_checked: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub shards_healed: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub failed: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub last_error: Option<String>,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ClusterHealth {
pub enabled: bool,
#[serde(skip_serializing_if = "Option::is_none")]
pub node_id: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub quorum_healthy: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub majority_healthy: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub peers: Option<Vec<ClusterPeerHealth>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub drives: Option<Vec<ClusterDriveHealth>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub erasure: Option<ClusterErasureHealth>,
#[serde(skip_serializing_if = "Option::is_none")]
pub repairs: Option<ClusterRepairHealth>,
}
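// Illustrative serialized form (hypothetical single-node values): with
// rename_all = "camelCase" plus skip_serializing_if, absent Option fields are
// omitted entirely rather than emitted as null, e.g.
// { "enabled": true, "nodeId": "node-a", "quorumHealthy": true, "majorityHealthy": true }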
pub struct MultipartUploadInfo {
pub upload_id: String,
pub key: String,
@@ -97,22 +245,186 @@ struct PartMetadata {
last_modified: String,
}
#[derive(Debug, Clone, Default)]
pub(crate) struct RuntimeBucketStats {
pub object_count: u64,
pub total_size_bytes: u64,
pub creation_date: Option<DateTime<Utc>>,
}
#[derive(Debug, Clone, Default)]
pub(crate) struct RuntimeStatsState {
buckets: HashMap<String, RuntimeBucketStats>,
total_object_count: u64,
total_storage_bytes: u64,
}
impl RuntimeStatsState {
pub(crate) fn replace_buckets(&mut self, buckets: HashMap<String, RuntimeBucketStats>) {
self.total_object_count = buckets.values().map(|bucket| bucket.object_count).sum();
self.total_storage_bytes = buckets.values().map(|bucket| bucket.total_size_bytes).sum();
self.buckets = buckets;
}
pub(crate) fn ensure_bucket(&mut self, name: &str, creation_date: Option<DateTime<Utc>>) {
let bucket = self.buckets.entry(name.to_string()).or_default();
if bucket.creation_date.is_none() {
bucket.creation_date = creation_date;
}
}
pub(crate) fn remove_bucket(&mut self, name: &str) {
if let Some(bucket) = self.buckets.remove(name) {
self.total_object_count = self.total_object_count.saturating_sub(bucket.object_count);
self.total_storage_bytes = self
.total_storage_bytes
.saturating_sub(bucket.total_size_bytes);
}
}
pub(crate) fn upsert_object(
&mut self,
bucket_name: &str,
previous_size: Option<u64>,
new_size: u64,
) {
let bucket_was_present = self.buckets.contains_key(bucket_name);
let bucket = self.buckets.entry(bucket_name.to_string()).or_default();
if let Some(previous_size) = previous_size {
if !bucket_was_present {
bucket.object_count = 1;
self.total_object_count += 1;
}
bucket.total_size_bytes =
bucket.total_size_bytes.saturating_sub(previous_size) + new_size;
self.total_storage_bytes =
self.total_storage_bytes.saturating_sub(previous_size) + new_size;
} else {
bucket.object_count += 1;
bucket.total_size_bytes += new_size;
self.total_object_count += 1;
self.total_storage_bytes += new_size;
}
}
pub(crate) fn remove_object(&mut self, bucket_name: &str, existing_size: Option<u64>) {
let Some(existing_size) = existing_size else {
return;
};
let Some(bucket) = self.buckets.get_mut(bucket_name) else {
return;
};
bucket.object_count = bucket.object_count.saturating_sub(1);
bucket.total_size_bytes = bucket.total_size_bytes.saturating_sub(existing_size);
self.total_object_count = self.total_object_count.saturating_sub(1);
self.total_storage_bytes = self.total_storage_bytes.saturating_sub(existing_size);
}
pub(crate) fn bucket_summaries(&self) -> Vec<BucketSummary> {
let mut buckets: Vec<BucketSummary> = self
.buckets
.iter()
.map(|(name, stats)| BucketSummary {
name: name.clone(),
object_count: stats.object_count,
total_size_bytes: stats.total_size_bytes,
creation_date: stats
.creation_date
.as_ref()
.map(|creation_date| creation_date.timestamp_millis()),
})
.collect();
buckets.sort_by(|a, b| a.name.cmp(&b.name));
buckets
}
pub(crate) fn snapshot(
&self,
storage_directory: &Path,
storage_locations: Vec<StorageLocationSummary>,
) -> StorageStats {
StorageStats {
bucket_count: self.buckets.len() as u64,
total_object_count: self.total_object_count,
total_storage_bytes: self.total_storage_bytes,
buckets: self.bucket_summaries(),
storage_directory: storage_directory.to_string_lossy().to_string(),
storage_locations,
}
}
}
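Because upsert_object folds a replaced object's old size out of the totals before adding the new one, overwrites never inflate the object count. A minimal test sketch of that accounting, assuming it sits in the same module as RuntimeStatsState:

#[cfg(test)]
mod runtime_stats_tests {
    use super::*;

    #[test]
    fn upsert_replaces_without_double_counting() {
        let mut stats = RuntimeStatsState::default();
        // First write: a brand-new object bumps both counters.
        stats.upsert_object("photos", None, 100);
        // Overwrite: the previous size is subtracted before the new size is
        // added, so the count stays at 1 while bytes move from 100 to 250.
        stats.upsert_object("photos", Some(100), 250);
        let summary = &stats.bucket_summaries()[0];
        assert_eq!(summary.object_count, 1);
        assert_eq!(summary.total_size_bytes, 250);
        // Deletion subtracts the recorded size and decrements the count.
        stats.remove_object("photos", Some(250));
        assert_eq!(stats.bucket_summaries()[0].total_size_bytes, 0);
    }
}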
#[derive(Debug, Clone, Copy)]
struct FilesystemUsage {
total_bytes: u64,
available_bytes: u64,
used_bytes: u64,
}
pub(crate) fn storage_location_summary(path: &Path) -> StorageLocationSummary {
let usage = filesystem_usage(path);
StorageLocationSummary {
path: path.to_string_lossy().to_string(),
total_bytes: usage.map(|usage| usage.total_bytes),
available_bytes: usage.map(|usage| usage.available_bytes),
used_bytes: usage.map(|usage| usage.used_bytes),
}
}
#[cfg(unix)]
fn filesystem_usage(path: &Path) -> Option<FilesystemUsage> {
use std::ffi::CString;
use std::os::unix::ffi::OsStrExt;
let path_bytes = path.as_os_str().as_bytes();
let c_path = CString::new(path_bytes).ok()?;
let mut stat: libc::statvfs = unsafe { std::mem::zeroed() };
if unsafe { libc::statvfs(c_path.as_ptr(), &mut stat) } != 0 {
return None;
}
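// statvfs reports sizes in units of f_frsize fragments. f_bavail counts the
// blocks available to unprivileged callers, while f_bfree counts all free
// blocks, so "used" is derived from f_bfree rather than f_bavail.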
let block_size = stat.f_frsize as u64;
let total_bytes = stat.f_blocks as u64 * block_size;
let available_bytes = stat.f_bavail as u64 * block_size;
let free_bytes = stat.f_bfree as u64 * block_size;
Some(FilesystemUsage {
total_bytes,
available_bytes,
used_bytes: total_bytes.saturating_sub(free_bytes),
})
}
#[cfg(not(unix))]
fn filesystem_usage(_path: &Path) -> Option<FilesystemUsage> {
None
}
// ============================
// FileStore
// ============================
pub struct FileStore {
root_dir: PathBuf,
runtime_stats: RwLock<RuntimeStatsState>,
}
impl FileStore {
pub fn new(root_dir: PathBuf) -> Self {
Self {
root_dir,
runtime_stats: RwLock::new(RuntimeStatsState::default()),
}
}
pub async fn initialize(&self) -> Result<()> {
fs::create_dir_all(&self.root_dir).await?;
fs::create_dir_all(self.policies_dir()).await?;
self.refresh_runtime_stats().await;
Ok(())
}
@@ -126,9 +438,56 @@ impl FileStore {
}
fs::create_dir_all(&self.root_dir).await?;
fs::create_dir_all(self.policies_dir()).await?;
self.refresh_runtime_stats().await;
Ok(())
}
pub async fn get_storage_stats(&self) -> Result<StorageStats> {
let runtime_stats = self.runtime_stats.read().await;
Ok(runtime_stats.snapshot(
&self.root_dir,
vec![storage_location_summary(&self.root_dir)],
))
}
pub async fn list_bucket_summaries(&self) -> Result<Vec<BucketSummary>> {
let runtime_stats = self.runtime_stats.read().await;
Ok(runtime_stats.bucket_summaries())
}
async fn refresh_runtime_stats(&self) {
let buckets = match self.list_buckets().await {
Ok(buckets) => buckets,
Err(error) => {
tracing::warn!(path = %self.root_dir.display(), error = %error, "Failed to initialize runtime stats");
return;
}
};
let mut runtime_buckets = HashMap::new();
for bucket in buckets {
let bucket_path = self.root_dir.join(&bucket.name);
match Self::scan_bucket_objects(&bucket_path).await {
Ok((object_count, total_size_bytes)) => {
runtime_buckets.insert(
bucket.name,
RuntimeBucketStats {
object_count,
total_size_bytes,
creation_date: Some(bucket.creation_date),
},
);
}
Err(error) => {
tracing::warn!(bucket = %bucket.name, error = %error, "Failed to scan bucket for runtime stats");
}
}
}
let mut runtime_stats = self.runtime_stats.write().await;
runtime_stats.replace_buckets(runtime_buckets);
}
// ============================
// Bucket operations
// ============================
@@ -167,6 +526,7 @@ impl FileStore {
pub async fn create_bucket(&self, bucket: &str) -> Result<()> {
let bucket_path = self.root_dir.join(bucket);
fs::create_dir_all(&bucket_path).await?;
self.track_bucket_created(bucket).await;
Ok(())
}
@@ -184,6 +544,7 @@ impl FileStore {
}
fs::remove_dir_all(&bucket_path).await?;
self.track_bucket_deleted(bucket).await;
Ok(())
}
@@ -202,6 +563,8 @@ impl FileStore {
return Err(StorageError::no_such_bucket().into());
}
let previous_size = self.object_size_if_exists(bucket, key).await;
let object_path = self.object_path(bucket, key);
if let Some(parent) = object_path.parent() {
fs::create_dir_all(parent).await?;
@@ -242,9 +605,45 @@ impl FileStore {
let metadata_json = serde_json::to_string_pretty(&metadata)?;
fs::write(&metadata_path, metadata_json).await?;
let object_size = fs::metadata(&object_path).await?.len();
self.track_object_upsert(bucket, previous_size, object_size)
.await;
Ok(PutResult { md5: md5_hex })
}
pub async fn put_object_bytes(
&self,
bucket: &str,
key: &str,
data: &[u8],
metadata: HashMap<String, String>,
) -> Result<PutResult> {
if !self.bucket_exists(bucket).await {
return Err(StorageError::no_such_bucket().into());
}
let previous_size = self.object_size_if_exists(bucket, key).await;
let object_path = self.object_path(bucket, key);
if let Some(parent) = object_path.parent() {
fs::create_dir_all(parent).await?;
}
fs::write(&object_path, data).await?;
let md5_hex = format!("{:x}", Md5::digest(data));
fs::write(format!("{}.md5", object_path.display()), &md5_hex).await?;
let metadata_json = serde_json::to_string_pretty(&metadata)?;
fs::write(
format!("{}.metadata.json", object_path.display()),
metadata_json,
)
.await?;
self.track_object_upsert(bucket, previous_size, data.len() as u64)
.await;
Ok(PutResult { md5: md5_hex })
}
pub async fn get_object(
@@ -309,6 +708,7 @@ impl FileStore {
}
pub async fn delete_object(&self, bucket: &str, key: &str) -> Result<()> {
let existing_size = self.object_size_if_exists(bucket, key).await;
let object_path = self.object_path(bucket, key);
let md5_path = format!("{}.md5", object_path.display());
let metadata_path = format!("{}.metadata.json", object_path.display());
@@ -336,6 +736,8 @@ impl FileStore {
current = dir.parent().map(|p| p.to_path_buf());
}
self.track_object_deleted(bucket, existing_size).await;
Ok(())
}
@@ -359,6 +761,8 @@ impl FileStore {
return Err(StorageError::no_such_bucket().into());
}
let previous_size = self.object_size_if_exists(dest_bucket, dest_key).await;
if let Some(parent) = dest_path.parent() {
fs::create_dir_all(parent).await?;
}
@@ -386,10 +790,10 @@ impl FileStore {
let md5 = self.read_md5(&dest_path).await;
let last_modified: DateTime<Utc> = file_meta.modified()?.into();
self.track_object_upsert(dest_bucket, previous_size, file_meta.len())
.await;
Ok(CopyResult { md5, last_modified })
}
pub async fn list_objects(
@@ -437,11 +841,7 @@ impl FileStore {
if !delimiter.is_empty() {
let remaining = &key[prefix.len()..];
if let Some(delim_idx) = remaining.find(delimiter) {
let cp = format!("{}{}", prefix, &remaining[..delim_idx + delimiter.len()]);
if common_prefix_set.insert(cp.clone()) {
common_prefixes.push(cp);
}
@@ -457,7 +857,10 @@ impl FileStore {
let object_path = self.object_path(bucket, key);
if let Ok(meta) = fs::metadata(&object_path).await {
let md5 = self.read_md5(&object_path).await;
let last_modified: DateTime<Utc> = meta
.modified()
.unwrap_or(std::time::SystemTime::UNIX_EPOCH)
.into();
contents.push(ListObjectEntry {
key: key.clone(),
size: meta.len(),
@@ -610,6 +1013,8 @@ impl FileStore {
let content = fs::read_to_string(&meta_path).await?;
let meta: MultipartMetadata = serde_json::from_str(&content)?;
let previous_size = self.object_size_if_exists(&meta.bucket, &meta.key).await;
let object_path = self.object_path(&meta.bucket, &meta.key);
if let Some(parent) = object_path.parent() {
fs::create_dir_all(parent).await?;
@@ -652,12 +1057,14 @@ impl FileStore {
let metadata_json = serde_json::to_string_pretty(&meta.metadata)?;
fs::write(&metadata_path, metadata_json).await?;
let object_size = fs::metadata(&object_path).await?.len();
self.track_object_upsert(&meta.bucket, previous_size, object_size)
.await;
// Clean up multipart directory
let _ = fs::remove_dir_all(&upload_dir).await;
Ok(CompleteMultipartResult { etag })
}
pub async fn abort_multipart(&self, upload_id: &str) -> Result<()> {
@@ -669,10 +1076,7 @@ impl FileStore {
Ok(())
}
pub async fn list_multipart_uploads(&self, bucket: &str) -> Result<Vec<MultipartUploadInfo>> {
let multipart_dir = self.multipart_dir();
if !multipart_dir.is_dir() {
return Ok(Vec::new());
@@ -711,6 +1115,75 @@ impl FileStore {
// Helpers
// ============================
async fn scan_bucket_objects(bucket_path: &Path) -> Result<(u64, u64)> {
let mut object_count = 0u64;
let mut total_size_bytes = 0u64;
let mut directories = vec![bucket_path.to_path_buf()];
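// Walk the bucket tree iteratively with an explicit stack (async fns cannot
// recurse without boxing), counting only primary object files and skipping
// the .md5 / .metadata.json sidecars.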
while let Some(directory) = directories.pop() {
let mut entries = match fs::read_dir(&directory).await {
Ok(entries) => entries,
Err(_) => continue,
};
while let Some(entry) = entries.next_entry().await? {
let metadata = entry.metadata().await?;
if metadata.is_dir() {
directories.push(entry.path());
continue;
}
let name = entry.file_name().to_string_lossy().to_string();
if name.ends_with("._storage_object") {
object_count += 1;
total_size_bytes += metadata.len();
}
}
}
Ok((object_count, total_size_bytes))
}
async fn bucket_creation_date(&self, bucket: &str) -> Option<DateTime<Utc>> {
let metadata = fs::metadata(self.root_dir.join(bucket)).await.ok()?;
let created_or_modified = metadata.created().unwrap_or(
metadata
.modified()
.unwrap_or(std::time::SystemTime::UNIX_EPOCH),
);
Some(created_or_modified.into())
}
async fn object_size_if_exists(&self, bucket: &str, key: &str) -> Option<u64> {
fs::metadata(self.object_path(bucket, key))
.await
.ok()
.map(|metadata| metadata.len())
}
async fn track_bucket_created(&self, bucket: &str) {
let creation_date = self.bucket_creation_date(bucket).await;
let mut runtime_stats = self.runtime_stats.write().await;
runtime_stats.ensure_bucket(bucket, creation_date);
}
async fn track_bucket_deleted(&self, bucket: &str) {
let mut runtime_stats = self.runtime_stats.write().await;
runtime_stats.remove_bucket(bucket);
}
async fn track_object_upsert(&self, bucket: &str, previous_size: Option<u64>, new_size: u64) {
let creation_date = self.bucket_creation_date(bucket).await;
let mut runtime_stats = self.runtime_stats.write().await;
runtime_stats.ensure_bucket(bucket, creation_date);
runtime_stats.upsert_object(bucket, previous_size, new_size);
}
async fn track_object_deleted(&self, bucket: &str, existing_size: Option<u64>) {
let mut runtime_stats = self.runtime_stats.write().await;
runtime_stats.remove_object(bucket, existing_size);
}
fn object_path(&self, bucket: &str, key: &str) -> PathBuf {
let encoded = encode_key(key);
self.root_dir
@@ -795,6 +1268,329 @@ impl FileStore {
}
}
// ============================
// StorageBackend enum
// ============================
/// Unified storage backend that dispatches to either standalone (FileStore)
/// or clustered (DistributedStore) storage.
pub enum StorageBackend {
Standalone(FileStore),
Clustered(DistributedStore),
}
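Callers interact with the enum only through the uniform async methods defined below. A minimal usage sketch; the "logs" bucket name is purely illustrative:

// Minimal dispatch sketch: the same call sites serve standalone and
// clustered deployments because every method matches on the variant.
async fn startup(backend: &StorageBackend) -> Result<()> {
    backend.initialize().await?;
    if !backend.bucket_exists("logs").await {
        backend.create_bucket("logs").await?;
    }
    Ok(())
}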
impl StorageBackend {
pub fn policies_dir(&self) -> std::path::PathBuf {
match self {
StorageBackend::Standalone(fs) => fs.policies_dir(),
StorageBackend::Clustered(ds) => ds.policies_dir(),
}
}
pub async fn get_cluster_health(&self) -> Result<ClusterHealth> {
match self {
StorageBackend::Standalone(_) => Ok(ClusterHealth {
enabled: false,
node_id: None,
quorum_healthy: None,
majority_healthy: None,
peers: None,
drives: None,
erasure: None,
repairs: None,
}),
StorageBackend::Clustered(ds) => ds.get_cluster_health().await,
}
}
pub async fn get_storage_stats(&self) -> Result<StorageStats> {
match self {
StorageBackend::Standalone(fs) => fs.get_storage_stats().await,
StorageBackend::Clustered(ds) => ds.get_storage_stats().await,
}
}
pub async fn list_bucket_summaries(&self) -> Result<Vec<BucketSummary>> {
match self {
StorageBackend::Standalone(fs) => fs.list_bucket_summaries().await,
StorageBackend::Clustered(ds) => ds.list_bucket_summaries().await,
}
}
pub async fn initialize(&self) -> Result<()> {
match self {
StorageBackend::Standalone(fs) => fs.initialize().await,
StorageBackend::Clustered(ds) => {
// Ensure policies directory exists
tokio::fs::create_dir_all(ds.policies_dir()).await?;
ds.initialize_runtime_stats().await;
Ok(())
}
}
}
pub async fn reset(&self) -> Result<()> {
match self {
StorageBackend::Standalone(fs) => fs.reset().await,
StorageBackend::Clustered(_) => Ok(()), // TODO: cluster reset
}
}
pub async fn list_buckets(&self) -> Result<Vec<BucketInfo>> {
match self {
StorageBackend::Standalone(fs) => fs.list_buckets().await,
StorageBackend::Clustered(ds) => ds.list_buckets().await,
}
}
pub async fn bucket_exists(&self, bucket: &str) -> bool {
match self {
StorageBackend::Standalone(fs) => fs.bucket_exists(bucket).await,
StorageBackend::Clustered(ds) => ds.bucket_exists(bucket).await,
}
}
pub async fn create_bucket(&self, bucket: &str) -> Result<()> {
match self {
StorageBackend::Standalone(fs) => fs.create_bucket(bucket).await,
StorageBackend::Clustered(ds) => ds.create_bucket(bucket).await,
}
}
pub async fn delete_bucket(&self, bucket: &str) -> Result<()> {
match self {
StorageBackend::Standalone(fs) => fs.delete_bucket(bucket).await,
StorageBackend::Clustered(ds) => ds.delete_bucket(bucket).await,
}
}
pub async fn delete_bucket_recursive(&self, bucket: &str) -> Result<()> {
if !self.bucket_exists(bucket).await {
return Err(StorageError::no_such_bucket().into());
}
loop {
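// Drain the bucket page by page (up to 1000 keys per listing) before the
// final delete_bucket call; any delete error aborts the loop early.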
let objects = self.list_objects(bucket, "", "", 1000, None).await?;
if objects.contents.is_empty() {
break;
}
for object in objects.contents {
self.delete_object(bucket, &object.key).await?;
}
}
self.delete_bucket(bucket).await
}
pub async fn put_object(
&self,
bucket: &str,
key: &str,
body: Incoming,
metadata: HashMap<String, String>,
) -> Result<PutResult> {
match self {
StorageBackend::Standalone(fs) => fs.put_object(bucket, key, body, metadata).await,
StorageBackend::Clustered(ds) => ds.put_object(bucket, key, body, metadata).await,
}
}
pub async fn put_object_bytes(
&self,
bucket: &str,
key: &str,
data: &[u8],
metadata: HashMap<String, String>,
) -> Result<PutResult> {
match self {
StorageBackend::Standalone(fs) => {
fs.put_object_bytes(bucket, key, data, metadata).await
}
StorageBackend::Clustered(ds) => ds.put_object_bytes(bucket, key, data, metadata).await,
}
}
pub async fn get_object(
&self,
bucket: &str,
key: &str,
range: Option<(u64, u64)>,
) -> Result<GetResult> {
match self {
StorageBackend::Standalone(fs) => fs.get_object(bucket, key, range).await,
StorageBackend::Clustered(ds) => ds.get_object(bucket, key, range).await,
}
}
pub async fn head_object(&self, bucket: &str, key: &str) -> Result<HeadResult> {
match self {
StorageBackend::Standalone(fs) => fs.head_object(bucket, key).await,
StorageBackend::Clustered(ds) => ds.head_object(bucket, key).await,
}
}
pub async fn delete_object(&self, bucket: &str, key: &str) -> Result<()> {
match self {
StorageBackend::Standalone(fs) => fs.delete_object(bucket, key).await,
StorageBackend::Clustered(ds) => ds.delete_object(bucket, key).await,
}
}
pub async fn copy_object(
&self,
src_bucket: &str,
src_key: &str,
dest_bucket: &str,
dest_key: &str,
metadata_directive: &str,
new_metadata: Option<HashMap<String, String>>,
) -> Result<CopyResult> {
match self {
StorageBackend::Standalone(fs) => {
fs.copy_object(
src_bucket,
src_key,
dest_bucket,
dest_key,
metadata_directive,
new_metadata,
)
.await
}
StorageBackend::Clustered(ds) => {
ds.copy_object(
src_bucket,
src_key,
dest_bucket,
dest_key,
metadata_directive,
new_metadata,
)
.await
}
}
}
pub async fn list_objects(
&self,
bucket: &str,
prefix: &str,
delimiter: &str,
max_keys: usize,
continuation_token: Option<&str>,
) -> Result<ListObjectsResult> {
match self {
StorageBackend::Standalone(fs) => {
fs.list_objects(bucket, prefix, delimiter, max_keys, continuation_token)
.await
}
StorageBackend::Clustered(ds) => {
ds.list_objects(bucket, prefix, delimiter, max_keys, continuation_token)
.await
}
}
}
pub async fn initiate_multipart(
&self,
bucket: &str,
key: &str,
metadata: HashMap<String, String>,
) -> Result<String> {
match self {
StorageBackend::Standalone(fs) => fs.initiate_multipart(bucket, key, metadata).await,
StorageBackend::Clustered(ds) => ds.initiate_multipart(bucket, key, metadata).await,
}
}
pub async fn upload_part(
&self,
upload_id: &str,
part_number: u32,
body: Incoming,
) -> Result<(String, u64)> {
match self {
StorageBackend::Standalone(fs) => fs.upload_part(upload_id, part_number, body).await,
StorageBackend::Clustered(ds) => ds.upload_part(upload_id, part_number, body).await,
}
}
pub async fn complete_multipart(
&self,
upload_id: &str,
parts: &[(u32, String)],
) -> Result<CompleteMultipartResult> {
match self {
StorageBackend::Standalone(fs) => fs.complete_multipart(upload_id, parts).await,
StorageBackend::Clustered(ds) => ds.complete_multipart(upload_id, parts).await,
}
}
pub async fn abort_multipart(&self, upload_id: &str) -> Result<()> {
match self {
StorageBackend::Standalone(fs) => fs.abort_multipart(upload_id).await,
StorageBackend::Clustered(ds) => ds.abort_multipart(upload_id).await,
}
}
pub async fn list_multipart_uploads(&self, bucket: &str) -> Result<Vec<MultipartUploadInfo>> {
match self {
StorageBackend::Standalone(fs) => fs.list_multipart_uploads(bucket).await,
StorageBackend::Clustered(ds) => ds.list_multipart_uploads(bucket).await,
}
}
pub async fn export_bucket(&self, bucket: &str) -> Result<BucketExport> {
if !self.bucket_exists(bucket).await {
return Err(StorageError::no_such_bucket().into());
}
let objects = self.list_objects(bucket, "", "", usize::MAX, None).await?;
let mut exported_objects = Vec::with_capacity(objects.contents.len());
for object in objects.contents {
let result = self.get_object(bucket, &object.key, None).await?;
let mut file = result.body;
let mut data = Vec::with_capacity(result.size as usize);
file.read_to_end(&mut data).await?;
exported_objects.push(BucketExportObject {
key: object.key,
size: result.size,
md5: result.md5,
metadata: result.metadata,
data_hex: hex::encode(data),
});
}
Ok(BucketExport {
format: "smartstorage.bucket.v1".to_string(),
bucket_name: bucket.to_string(),
exported_at: Utc::now().timestamp_millis(),
objects: exported_objects,
})
}
pub async fn import_bucket(&self, bucket: &str, source: BucketExport) -> Result<()> {
if source.format != "smartstorage.bucket.v1" {
return Err(StorageError::invalid_request("Unsupported bucket export format.").into());
}
if !self.bucket_exists(bucket).await {
self.create_bucket(bucket).await?;
}
for object in source.objects {
let data = hex::decode(&object.data_hex)
.map_err(|error| StorageError::invalid_request(&error.to_string()))?;
self.put_object_bytes(bucket, &object.key, &data, object.metadata)
.await?;
}
Ok(())
}
}
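A round-trip sketch using only the APIs above: snapshot one bucket and replay it into another on the same backend. Here backend is assumed to be an initialized StorageBackend, and error handling is left to the caller:

async fn clone_bucket(backend: &StorageBackend, from: &str, to: &str) -> Result<()> {
    let snapshot = backend.export_bucket(from).await?;
    // Object bytes travel hex-encoded in data_hex, so the export is plain
    // JSON and safe to persist or ship to another deployment.
    backend.import_bucket(to, snapshot).await?;
    Ok(())
}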
// ============================
// Key encoding (identity on Linux)
// ============================
@@ -1,16 +1,28 @@
/// <reference types="node" />
import { expect, tap } from '@git.zone/tstest/tapbundle';
import { S3Client, CreateBucketCommand, ListBucketsCommand, PutObjectCommand, GetObjectCommand, DeleteObjectCommand, DeleteBucketCommand } from '@aws-sdk/client-s3';
import { Buffer } from 'buffer';
import { Readable } from 'stream';
import * as smartstorage from '../ts/index.js';
let testSmartStorageInstance: smartstorage.SmartStorage;
let s3Client: S3Client;
const testObjectBody = 'Hello from AWS SDK!';
const testObjectSize = Buffer.byteLength(testObjectBody);
function getBucketSummary(
summaries: smartstorage.IBucketSummary[],
bucketName: string,
): smartstorage.IBucketSummary | undefined {
return summaries.find((summary) => summary.name === bucketName);
}
// Helper to convert stream to string
async function streamToString(stream: Readable): Promise<string> {
const chunks: Buffer[] = [];
return new Promise((resolve, reject) => {
stream.on('data', (chunk: string | Buffer | Uint8Array) => chunks.push(Buffer.from(chunk)));
stream.on('error', reject);
stream.on('end', () => resolve(Buffer.concat(chunks).toString('utf8')));
});
@@ -46,28 +58,82 @@ tap.test('should list buckets (empty)', async () => {
expect(response.Buckets!.length).toEqual(0);
});
tap.test('should expose empty runtime stats after startup', async () => {
const stats = await testSmartStorageInstance.getStorageStats();
expect(stats.bucketCount).toEqual(0);
expect(stats.totalObjectCount).toEqual(0);
expect(stats.totalStorageBytes).toEqual(0);
expect(stats.buckets.length).toEqual(0);
expect(stats.storageDirectory.length > 0).toEqual(true);
});
tap.test('should expose disabled cluster health in standalone mode', async () => {
const clusterHealth = await testSmartStorageInstance.getClusterHealth();
expect(clusterHealth.enabled).toEqual(false);
expect(clusterHealth.nodeId).toEqual(undefined);
expect(clusterHealth.quorumHealthy).toEqual(undefined);
expect(clusterHealth.drives).toEqual(undefined);
});
tap.test('should create a bucket', async () => {
const response = await s3Client.send(new CreateBucketCommand({ Bucket: 'test-bucket' }));
expect(response.$metadata.httpStatusCode).toEqual(200);
});
tap.test('should create an empty bucket through the bridge', async () => {
const response = await testSmartStorageInstance.createBucket('empty-bucket');
expect(response.name).toEqual('empty-bucket');
});
tap.test('should list buckets (showing created buckets)', async () => {
const response = await s3Client.send(new ListBucketsCommand({}));
expect(response.Buckets!.length).toEqual(2);
expect(response.Buckets!.some((bucket) => bucket.Name === 'test-bucket')).toEqual(true);
expect(response.Buckets!.some((bucket) => bucket.Name === 'empty-bucket')).toEqual(true);
});
tap.test('should expose runtime bucket summaries after bucket creation', async () => {
const stats = await testSmartStorageInstance.getStorageStats();
const summaries = await testSmartStorageInstance.listBucketSummaries();
const testBucketSummary = getBucketSummary(stats.buckets, 'test-bucket');
const emptyBucketSummary = getBucketSummary(summaries, 'empty-bucket');
expect(stats.bucketCount).toEqual(2);
expect(stats.totalObjectCount).toEqual(0);
expect(stats.totalStorageBytes).toEqual(0);
expect(summaries.length).toEqual(2);
expect(testBucketSummary?.objectCount).toEqual(0);
expect(testBucketSummary?.totalSizeBytes).toEqual(0);
expect(typeof testBucketSummary?.creationDate).toEqual('number');
expect(emptyBucketSummary?.objectCount).toEqual(0);
expect(emptyBucketSummary?.totalSizeBytes).toEqual(0);
});
tap.test('should upload an object', async () => {
const response = await s3Client.send(new PutObjectCommand({
Bucket: 'test-bucket',
Key: 'test-file.txt',
Body: testObjectBody,
ContentType: 'text/plain',
}));
expect(response.$metadata.httpStatusCode).toEqual(200);
expect(response.ETag).toBeTypeofString();
});
tap.test('should reflect uploaded object in runtime stats', async () => {
const stats = await testSmartStorageInstance.getStorageStats();
const testBucketSummary = getBucketSummary(stats.buckets, 'test-bucket');
const emptyBucketSummary = getBucketSummary(stats.buckets, 'empty-bucket');
expect(stats.bucketCount).toEqual(2);
expect(stats.totalObjectCount).toEqual(1);
expect(stats.totalStorageBytes).toEqual(testObjectSize);
expect(testBucketSummary?.objectCount).toEqual(1);
expect(testBucketSummary?.totalSizeBytes).toEqual(testObjectSize);
expect(emptyBucketSummary?.objectCount).toEqual(0);
expect(emptyBucketSummary?.totalSizeBytes).toEqual(0);
});
tap.test('should download the object', async () => {
const response = await s3Client.send(new GetObjectCommand({
Bucket: 'test-bucket',
@@ -76,7 +142,7 @@ tap.test('should download the object', async () => {
expect(response.$metadata.httpStatusCode).toEqual(200);
const content = await streamToString(response.Body as Readable);
expect(content).toEqual(testObjectBody);
});
tap.test('should delete the object', async () => {
@@ -87,6 +153,20 @@ tap.test('should delete the object', async () => {
expect(response.$metadata.httpStatusCode).toEqual(204);
});
tap.test('should reflect object deletion in runtime stats', async () => {
const stats = await testSmartStorageInstance.getStorageStats();
const testBucketSummary = getBucketSummary(stats.buckets, 'test-bucket');
const emptyBucketSummary = getBucketSummary(stats.buckets, 'empty-bucket');
expect(stats.bucketCount).toEqual(2);
expect(stats.totalObjectCount).toEqual(0);
expect(stats.totalStorageBytes).toEqual(0);
expect(testBucketSummary?.objectCount).toEqual(0);
expect(testBucketSummary?.totalSizeBytes).toEqual(0);
expect(emptyBucketSummary?.objectCount).toEqual(0);
expect(emptyBucketSummary?.totalSizeBytes).toEqual(0);
});
tap.test('should fail to get deleted object', async () => {
await expect(
s3Client.send(new GetObjectCommand({
@@ -96,11 +176,37 @@ tap.test('should fail to get deleted object', async () => {
).rejects.toThrow();
});
tap.test('should delete the empty bucket', async () => {
const response = await s3Client.send(new DeleteBucketCommand({ Bucket: 'empty-bucket' }));
expect(response.$metadata.httpStatusCode).toEqual(204);
});
tap.test('should reflect bucket deletion in runtime stats', async () => {
const stats = await testSmartStorageInstance.getStorageStats();
const testBucketSummary = getBucketSummary(stats.buckets, 'test-bucket');
const emptyBucketSummary = getBucketSummary(stats.buckets, 'empty-bucket');
expect(stats.bucketCount).toEqual(1);
expect(stats.totalObjectCount).toEqual(0);
expect(stats.totalStorageBytes).toEqual(0);
expect(testBucketSummary?.objectCount).toEqual(0);
expect(testBucketSummary?.totalSizeBytes).toEqual(0);
expect(emptyBucketSummary).toEqual(undefined);
});
tap.test('should delete the bucket', async () => {
const response = await s3Client.send(new DeleteBucketCommand({ Bucket: 'test-bucket' }));
expect(response.$metadata.httpStatusCode).toEqual(204);
});
tap.test('should expose empty runtime stats after deleting all buckets', async () => {
const stats = await testSmartStorageInstance.getStorageStats();
expect(stats.bucketCount).toEqual(0);
expect(stats.totalObjectCount).toEqual(0);
expect(stats.totalStorageBytes).toEqual(0);
expect(stats.buckets.length).toEqual(0);
});
tap.test('should stop the storage server', async () => {
await testSmartStorageInstance.stop();
});
@@ -0,0 +1,335 @@
/// <reference types="node" />
import { expect, tap } from '@git.zone/tstest/tapbundle';
import {
CopyObjectCommand,
GetBucketPolicyCommand,
GetObjectCommand,
HeadBucketCommand,
ListBucketsCommand,
ListObjectsV2Command,
PutBucketPolicyCommand,
PutObjectCommand,
DeleteObjectCommand,
S3Client,
} from '@aws-sdk/client-s3';
import { rm } from 'fs/promises';
import { fileURLToPath } from 'url';
import { Readable } from 'stream';
import * as smartstorage from '../ts/index.js';
const TEST_PORT = 3361;
const STORAGE_DIR = fileURLToPath(new URL('../.nogit/bucket-tenant-tests', import.meta.url));
const WORKAPP_A_BUCKET = 'workapp-a-bucket';
const WORKAPP_B_BUCKET = 'workapp-b-bucket';
const RESTORE_BUCKET = 'workapp-a-restore-bucket';
const POLICY_BUCKET = 'workapp-policy-bucket';
const ADMIN_CREDENTIAL: smartstorage.IStorageCredential = {
accessKeyId: 'TENANTADMIN',
secretAccessKey: 'TENANTADMINSECRET123',
};
let testSmartStorageInstance: smartstorage.SmartStorage;
let adminClient: S3Client;
let tenantA: smartstorage.IBucketTenantDescriptor;
let tenantB: smartstorage.IBucketTenantDescriptor;
let tenantAClient: S3Client;
let tenantBClient: S3Client;
let oldTenantAClient: S3Client;
function createS3Client(
credential: smartstorage.IStorageCredential,
region = 'us-east-1',
): S3Client {
return new S3Client({
endpoint: `http://localhost:${TEST_PORT}`,
region,
credentials: {
accessKeyId: credential.accessKeyId,
secretAccessKey: credential.secretAccessKey,
},
forcePathStyle: true,
});
}
function createS3ClientFromDescriptor(
descriptor: smartstorage.IBucketTenantDescriptor,
): S3Client {
return new S3Client({
endpoint: `http://${descriptor.endpoint}:${descriptor.port}`,
region: descriptor.region,
credentials: {
accessKeyId: descriptor.accessKeyId,
secretAccessKey: descriptor.secretAccessKey,
},
forcePathStyle: true,
});
}
async function streamToString(stream: Readable): Promise<string> {
const chunks: Buffer[] = [];
return new Promise((resolve, reject) => {
stream.on('data', (chunk: string | Buffer | Uint8Array) => chunks.push(Buffer.from(chunk)));
stream.on('error', reject);
stream.on('end', () => resolve(Buffer.concat(chunks).toString('utf8')));
});
}
async function startStorage() {
testSmartStorageInstance = await smartstorage.SmartStorage.createAndStart({
server: {
port: TEST_PORT,
silent: true,
region: 'us-east-1',
},
storage: {
directory: STORAGE_DIR,
cleanSlate: false,
},
auth: {
enabled: true,
credentials: [ADMIN_CREDENTIAL],
},
});
adminClient = createS3Client(ADMIN_CREDENTIAL);
}
tap.test('setup: start storage and provision bucket tenants', async () => {
await rm(STORAGE_DIR, { recursive: true, force: true });
await startStorage();
tenantA = await testSmartStorageInstance.createBucketTenant({
bucketName: WORKAPP_A_BUCKET,
});
tenantB = await testSmartStorageInstance.createBucketTenant({
bucketName: WORKAPP_B_BUCKET,
});
tenantAClient = createS3ClientFromDescriptor(tenantA);
tenantBClient = createS3ClientFromDescriptor(tenantB);
});
tap.test('tenant descriptors expose app-ready S3 connection data', async () => {
expect(tenantA.endpoint).toEqual('localhost');
expect(tenantA.port).toEqual(TEST_PORT);
expect(tenantA.region).toEqual('us-east-1');
expect(tenantA.bucket).toEqual(WORKAPP_A_BUCKET);
expect(tenantA.bucketName).toEqual(WORKAPP_A_BUCKET);
expect(tenantA.accessKeyId).toBeTypeofString();
expect(tenantA.secretAccessKey).toBeTypeofString();
expect(tenantA.useSsl).toEqual(false);
expect(tenantA.env.S3_BUCKET).toEqual(WORKAPP_A_BUCKET);
expect(tenantA.env.AWS_ACCESS_KEY_ID).toEqual(tenantA.accessKeyId);
});
tap.test('listBucketTenants returns scoped credential metadata without secrets', async () => {
const tenants = await testSmartStorageInstance.listBucketTenants();
expect(tenants.length).toEqual(2);
expect(tenants.some((tenant) => tenant.bucketName === WORKAPP_A_BUCKET)).toEqual(true);
expect(tenants.some((tenant) => tenant.bucketName === WORKAPP_B_BUCKET)).toEqual(true);
expect((tenants[0] as any).secretAccessKey).toEqual(undefined);
});
tap.test('tenant credentials work with AWS SDK v3 for their assigned bucket', async () => {
const putA = await tenantAClient.send(new PutObjectCommand({
Bucket: WORKAPP_A_BUCKET,
Key: 'hello.txt',
Body: 'hello from tenant a',
ContentType: 'text/plain',
}));
expect(putA.$metadata.httpStatusCode).toEqual(200);
const putB = await tenantBClient.send(new PutObjectCommand({
Bucket: WORKAPP_B_BUCKET,
Key: 'other.txt',
Body: 'hello from tenant b',
ContentType: 'text/plain',
}));
expect(putB.$metadata.httpStatusCode).toEqual(200);
const getA = await tenantAClient.send(new GetObjectCommand({
Bucket: WORKAPP_A_BUCKET,
Key: 'hello.txt',
}));
expect(await streamToString(getA.Body as Readable)).toEqual('hello from tenant a');
const listA = await tenantAClient.send(new ListObjectsV2Command({
Bucket: WORKAPP_A_BUCKET,
}));
expect(listA.Contents?.some((object) => object.Key === 'hello.txt')).toEqual(true);
});
tap.test('tenant credentials cannot access unrelated buckets', async () => {
await expect(tenantAClient.send(new ListBucketsCommand({}))).rejects.toThrow();
await expect(tenantAClient.send(new HeadBucketCommand({
Bucket: WORKAPP_B_BUCKET,
}))).rejects.toThrow();
await expect(tenantAClient.send(new PutObjectCommand({
Bucket: WORKAPP_B_BUCKET,
Key: 'blocked-write.txt',
Body: 'blocked',
}))).rejects.toThrow();
await expect(tenantAClient.send(new GetObjectCommand({
Bucket: WORKAPP_B_BUCKET,
Key: 'other.txt',
}))).rejects.toThrow();
await expect(tenantAClient.send(new DeleteObjectCommand({
Bucket: WORKAPP_B_BUCKET,
Key: 'other.txt',
}))).rejects.toThrow();
await expect(tenantAClient.send(new CopyObjectCommand({
Bucket: WORKAPP_A_BUCKET,
Key: 'copy-from-other-bucket.txt',
CopySource: `/${WORKAPP_B_BUCKET}/other.txt`,
}))).rejects.toThrow();
await expect(tenantBClient.send(new GetObjectCommand({
Bucket: WORKAPP_A_BUCKET,
Key: 'hello.txt',
}))).rejects.toThrow();
});
tap.test('health and metrics expose running storage state', async () => {
const health = await testSmartStorageInstance.getHealth();
expect(health.running).toEqual(true);
expect(health.ok).toEqual(true);
expect(health.storageDirectory).toEqual(STORAGE_DIR);
expect(health.auth.enabled).toEqual(true);
expect(health.auth.tenantCredentialCount).toEqual(2);
expect(health.bucketCount >= 2).toEqual(true);
expect(health.objectCount >= 2).toEqual(true);
expect(health.totalBytes > 0).toEqual(true);
const metrics = await testSmartStorageInstance.getMetrics();
expect(metrics.tenantCredentialCount).toEqual(2);
expect(metrics.prometheusText).toMatch(/smartstorage_tenant_credentials_total 2/);
});
tap.test('export/import targets one bucket without unrelated tenant data', async () => {
const bucketExport = await testSmartStorageInstance.exportBucket({
bucketName: WORKAPP_A_BUCKET,
});
expect(bucketExport.format).toEqual('smartstorage.bucket.v1');
expect(bucketExport.bucketName).toEqual(WORKAPP_A_BUCKET);
expect(bucketExport.objects.some((object) => object.key === 'hello.txt')).toEqual(true);
expect(bucketExport.objects.some((object) => object.key === 'other.txt')).toEqual(false);
await testSmartStorageInstance.importBucket({
bucketName: RESTORE_BUCKET,
source: bucketExport,
});
const restoredObject = await adminClient.send(new GetObjectCommand({
Bucket: RESTORE_BUCKET,
Key: 'hello.txt',
}));
expect(await streamToString(restoredObject.Body as Readable)).toEqual('hello from tenant a');
const restoredObjects = await adminClient.send(new ListObjectsV2Command({
Bucket: RESTORE_BUCKET,
}));
expect(restoredObjects.Contents?.some((object) => object.Key === 'other.txt')).toEqual(false);
});
tap.test('bucket policies persist across restart', async () => {
await testSmartStorageInstance.createBucket(POLICY_BUCKET);
const policy = JSON.stringify({
Version: '2012-10-17',
Statement: [{
Sid: 'TenantPolicyPersistence',
Effect: 'Allow',
Principal: { AWS: ADMIN_CREDENTIAL.accessKeyId },
Action: ['s3:GetBucketPolicy', 's3:PutBucketPolicy', 's3:ListBucket'],
Resource: `arn:aws:s3:::${POLICY_BUCKET}`,
}],
});
const response = await adminClient.send(new PutBucketPolicyCommand({
Bucket: POLICY_BUCKET,
Policy: policy,
}));
expect(response.$metadata.httpStatusCode).toEqual(204);
});
tap.test('credential rotation replaces the active tenant credential', async () => {
oldTenantAClient = tenantAClient;
tenantA = await testSmartStorageInstance.rotateBucketTenantCredentials({
bucketName: WORKAPP_A_BUCKET,
});
tenantAClient = createS3ClientFromDescriptor(tenantA);
await expect(oldTenantAClient.send(new GetObjectCommand({
Bucket: WORKAPP_A_BUCKET,
Key: 'hello.txt',
}))).rejects.toThrow();
const getA = await tenantAClient.send(new GetObjectCommand({
Bucket: WORKAPP_A_BUCKET,
Key: 'hello.txt',
}));
expect(await streamToString(getA.Body as Readable)).toEqual('hello from tenant a');
const descriptor = await testSmartStorageInstance.getBucketTenantDescriptor({
bucketName: WORKAPP_A_BUCKET,
});
expect(descriptor.accessKeyId).toEqual(tenantA.accessKeyId);
expect(descriptor.secretAccessKey).toEqual(tenantA.secretAccessKey);
});
tap.test('runtime credentials survive restart', async () => {
await testSmartStorageInstance.stop();
await startStorage();
const persistedTenantA = await testSmartStorageInstance.getBucketTenantDescriptor({
bucketName: WORKAPP_A_BUCKET,
});
expect(persistedTenantA.accessKeyId).toEqual(tenantA.accessKeyId);
expect(persistedTenantA.secretAccessKey).toEqual(tenantA.secretAccessKey);
tenantAClient = createS3ClientFromDescriptor(persistedTenantA);
const getA = await tenantAClient.send(new GetObjectCommand({
Bucket: WORKAPP_A_BUCKET,
Key: 'hello.txt',
}));
expect(await streamToString(getA.Body as Readable)).toEqual('hello from tenant a');
const tenants = await testSmartStorageInstance.listBucketTenants();
expect(tenants.some((tenant) => tenant.bucketName === WORKAPP_A_BUCKET)).toEqual(true);
expect(tenants.some((tenant) => tenant.bucketName === WORKAPP_B_BUCKET)).toEqual(true);
const policyResponse = await adminClient.send(new GetBucketPolicyCommand({
Bucket: POLICY_BUCKET,
}));
expect(policyResponse.Policy?.includes('TenantPolicyPersistence')).toEqual(true);
});
tap.test('deleteBucketTenant can revoke credentials and delete tenant buckets', async () => {
await testSmartStorageInstance.deleteBucketTenant({
bucketName: WORKAPP_B_BUCKET,
accessKeyId: tenantB.accessKeyId,
});
await expect(tenantBClient.send(new GetObjectCommand({
Bucket: WORKAPP_B_BUCKET,
Key: 'other.txt',
}))).rejects.toThrow();
const headAfterRevoke = await adminClient.send(new HeadBucketCommand({
Bucket: WORKAPP_B_BUCKET,
}));
expect(headAfterRevoke.$metadata.httpStatusCode).toEqual(200);
await testSmartStorageInstance.deleteBucketTenant({
bucketName: WORKAPP_B_BUCKET,
});
await expect(adminClient.send(new HeadBucketCommand({
Bucket: WORKAPP_B_BUCKET,
}))).rejects.toThrow();
const tenants = await testSmartStorageInstance.listBucketTenants();
expect(tenants.some((tenant) => tenant.bucketName === WORKAPP_B_BUCKET)).toEqual(false);
});
tap.test('teardown: stop storage server', async () => {
await testSmartStorageInstance.stop();
});
export default tap.start();
@@ -0,0 +1,84 @@
/// <reference types="node" />
import { rm } from 'fs/promises';
import { join } from 'path';
import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as smartstorage from '../ts/index.js';
let clusterStorage: smartstorage.SmartStorage;
const baseDir = join(process.cwd(), '.nogit', `cluster-health-${Date.now()}`);
const drivePaths = Array.from({ length: 6 }, (_value, index) => {
return join(baseDir, `drive-${index + 1}`);
});
const storageDir = join(baseDir, 'storage');
tap.test('setup: start clustered storage server', async () => {
clusterStorage = await smartstorage.SmartStorage.createAndStart({
server: {
port: 3348,
silent: true,
},
storage: {
directory: storageDir,
},
cluster: {
enabled: true,
nodeId: 'cluster-health-node',
quicPort: 4348,
seedNodes: [],
erasure: {
dataShards: 4,
parityShards: 2,
chunkSizeBytes: 1024 * 1024,
},
drives: {
paths: drivePaths,
},
},
});
});
tap.test('should expose clustered runtime health', async () => {
const health = await clusterStorage.getClusterHealth();
expect(health.enabled).toEqual(true);
expect(health.nodeId).toEqual('cluster-health-node');
expect(health.quorumHealthy).toEqual(true);
expect(health.majorityHealthy).toEqual(true);
expect(Array.isArray(health.peers)).toEqual(true);
expect(health.peers!.length).toEqual(0);
expect(Array.isArray(health.drives)).toEqual(true);
expect(health.drives!.length).toEqual(6);
expect(health.drives!.every((drive) => drive.status === 'online')).toEqual(true);
expect(health.drives!.every((drive) => drivePaths.includes(drive.path))).toEqual(true);
expect(health.drives!.every((drive) => drive.totalBytes !== undefined)).toEqual(true);
expect(health.drives!.every((drive) => drive.usedBytes !== undefined)).toEqual(true);
expect(health.drives!.every((drive) => drive.lastCheck !== undefined)).toEqual(true);
expect(health.drives!.every((drive) => drive.erasureSetId === 0)).toEqual(true);
expect(health.erasure?.dataShards).toEqual(4);
expect(health.erasure?.parityShards).toEqual(2);
expect(health.erasure?.chunkSizeBytes).toEqual(1024 * 1024);
expect(health.erasure?.totalShards).toEqual(6);
expect(health.erasure?.readQuorum).toEqual(4);
expect(health.erasure?.writeQuorum).toEqual(5);
expect(health.erasure?.erasureSetCount).toEqual(1);
expect(health.repairs?.active).toEqual(false);
expect(health.repairs?.scanIntervalMs).toEqual(24 * 60 * 60 * 1000);
});
tap.test('should expose cluster health after bucket creation', async () => {
const bucket = await clusterStorage.createBucket('cluster-health-bucket');
const health = await clusterStorage.getClusterHealth();
expect(bucket.name).toEqual('cluster-health-bucket');
expect(health.enabled).toEqual(true);
expect(health.quorumHealthy).toEqual(true);
expect(health.drives!.length).toEqual(6);
});
tap.test('teardown: stop clustered server and clean files', async () => {
await clusterStorage.stop();
await rm(baseDir, { recursive: true, force: true });
});
export default tap.start();
@@ -0,0 +1,317 @@
/// <reference types="node" />
import { readFile, readdir, rm } from 'fs/promises';
import { join } from 'path';
import { expect, tap } from '@git.zone/tstest/tapbundle';
import { CreateBucketCommand, GetObjectCommand, PutObjectCommand, S3Client } from '@aws-sdk/client-s3';
import { Readable } from 'stream';
import * as smartstorage from '../ts/index.js';
const baseDir = join(process.cwd(), '.nogit', `cluster-multinode-${Date.now()}`);
const nodes: smartstorage.SmartStorage[] = [];
const makeDrivePaths = (nodeId: string) => {
return [1, 2].map((driveIndex) => join(baseDir, nodeId, `drive-${driveIndex}`));
};
const streamToString = async (stream: Readable): Promise<string> => {
const chunks: Buffer[] = [];
return new Promise((resolve, reject) => {
stream.on('data', (chunk: string | Buffer | Uint8Array) => chunks.push(Buffer.from(chunk)));
stream.on('error', reject);
stream.on('end', () => resolve(Buffer.concat(chunks).toString('utf8')));
});
};
const fileExistsBelow = async (directory: string, fileName: string): Promise<boolean> => {
let entries;
try {
entries = await readdir(directory, { withFileTypes: true });
} catch {
return false;
}
for (const entry of entries) {
const entryPath = join(directory, entry.name);
if (entry.isFile() && entry.name === fileName) {
return true;
}
if (entry.isDirectory() && await fileExistsBelow(entryPath, fileName)) {
return true;
}
}
return false;
};
const waitFor = async (check: () => Promise<boolean>, timeoutMs = 10000) => {
const deadline = Date.now() + timeoutMs;
let lastError = '';
while (Date.now() < deadline) {
try {
if (await check()) {
return;
}
} catch (error) {
lastError = error instanceof Error ? error.message : String(error);
}
await new Promise((resolve) => setTimeout(resolve, 250));
}
throw new Error(`Timed out waiting for cluster condition${lastError ? `: ${lastError}` : ''}`);
};
tap.test('setup: start three clustered storage nodes', async () => {
await rm(baseDir, { recursive: true, force: true });
const node1 = await smartstorage.SmartStorage.createAndStart({
server: {
address: '127.0.0.1',
port: 3350,
silent: true,
},
storage: {
directory: join(baseDir, 'node-1', 'storage'),
},
cluster: {
enabled: true,
nodeId: 'node-1',
quicPort: 4350,
seedNodes: [],
erasure: {
dataShards: 4,
parityShards: 2,
chunkSizeBytes: 1024 * 1024,
},
drives: {
paths: makeDrivePaths('node-1'),
},
heartbeatIntervalMs: 500,
heartbeatTimeoutMs: 3000,
},
});
nodes.push(node1);
await new Promise((resolve) => setTimeout(resolve, 500));
const node2 = await smartstorage.SmartStorage.createAndStart({
server: {
address: '127.0.0.1',
port: 3351,
silent: true,
},
storage: {
directory: join(baseDir, 'node-2', 'storage'),
},
cluster: {
enabled: true,
nodeId: 'node-2',
quicPort: 4351,
seedNodes: ['127.0.0.1:4350'],
erasure: {
dataShards: 4,
parityShards: 2,
chunkSizeBytes: 1024 * 1024,
},
drives: {
paths: makeDrivePaths('node-2'),
},
heartbeatIntervalMs: 500,
heartbeatTimeoutMs: 3000,
},
});
nodes.push(node2);
await new Promise((resolve) => setTimeout(resolve, 500));
const node3 = await smartstorage.SmartStorage.createAndStart({
server: {
address: '127.0.0.1',
port: 3352,
silent: true,
},
storage: {
directory: join(baseDir, 'node-3', 'storage'),
},
cluster: {
enabled: true,
nodeId: 'node-3',
quicPort: 4352,
seedNodes: ['127.0.0.1:4350'],
erasure: {
dataShards: 4,
parityShards: 2,
chunkSizeBytes: 1024 * 1024,
},
drives: {
paths: makeDrivePaths('node-3'),
},
heartbeatIntervalMs: 500,
heartbeatTimeoutMs: 3000,
},
});
nodes.push(node3);
});
tap.test('seed node should report joined peers and multi-node erasure topology', async () => {
const seed = nodes[0];
await waitFor(async () => {
const health = await seed.getClusterHealth();
if (health.peers?.length !== 2 || health.erasure?.erasureSetCount !== 1) {
throw new Error(JSON.stringify(health));
}
return health.peers?.length === 2 && health.erasure?.erasureSetCount === 1;
});
const health = await seed.getClusterHealth();
const peerIds = health.peers!.map((peer) => peer.nodeId).sort();
expect(health.enabled).toEqual(true);
expect(health.nodeId).toEqual('node-1');
expect(health.quorumHealthy).toEqual(true);
expect(health.majorityHealthy).toEqual(true);
expect(peerIds).toEqual(['node-2', 'node-3']);
expect(health.erasure?.totalShards).toEqual(6);
expect(health.erasure?.erasureSetCount).toEqual(1);
});
tap.test('all nodes should converge to the same multi-node topology', async () => {
for (const node of nodes) {
await waitFor(async () => {
const health = await node.getClusterHealth();
if (health.peers?.length !== 2 || health.erasure?.erasureSetCount !== 1) {
throw new Error(JSON.stringify(health));
}
return true;
});
}
});
tap.test('seed node should write shards to the declared remote drives', async () => {
const seed = nodes[0];
const descriptor = await seed.getStorageDescriptor();
const client = new S3Client({
endpoint: `http://${descriptor.endpoint}:${descriptor.port}`,
region: 'us-east-1',
credentials: {
accessKeyId: descriptor.accessKey,
secretAccessKey: descriptor.accessSecret,
},
forcePathStyle: true,
});
const bucket = 'multinode-bucket';
const key = 'distributed.txt';
const body = 'hello distributed shards';
await client.send(new CreateBucketCommand({ Bucket: bucket }));
await client.send(new PutObjectCommand({ Bucket: bucket, Key: key, Body: body }));
const getResponse = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
expect(await streamToString(getResponse.Body as Readable)).toEqual(body);
const manifestPath = join(
baseDir,
'node-1',
'storage',
'.manifests',
bucket,
`${key}.manifest.json`,
);
const manifest = JSON.parse(await readFile(manifestPath, 'utf8')) as {
chunks: Array<{
shardPlacements: Array<{ shardIndex: number; nodeId: string; driveId: string }>;
}>;
};
const placements = manifest.chunks[0].shardPlacements;
expect(placements.length).toEqual(6);
expect(placements.some((placement) => placement.nodeId === 'node-2' && placement.driveId === '1'))
.toEqual(true);
expect(placements.some((placement) => placement.nodeId === 'node-3' && placement.driveId === '1'))
.toEqual(true);
for (const placement of placements) {
const drivePath = makeDrivePaths(placement.nodeId)[Number(placement.driveId)];
const shardFile = `shard-${placement.shardIndex}.dat`;
expect(await fileExistsBelow(join(drivePath, '.smartstorage', 'data'), shardFile)).toEqual(true);
}
});
tap.test('restarted peer should keep durable identity and rejoin topology', async () => {
await nodes[1].stop();
await new Promise((resolve) => setTimeout(resolve, 500));
nodes[1] = await smartstorage.SmartStorage.createAndStart({
server: {
address: '127.0.0.1',
port: 3351,
silent: true,
},
storage: {
directory: join(baseDir, 'node-2', 'storage'),
},
cluster: {
enabled: true,
nodeId: 'node-2',
quicPort: 4351,
seedNodes: ['127.0.0.1:4350'],
erasure: {
dataShards: 4,
parityShards: 2,
chunkSizeBytes: 1024 * 1024,
},
drives: {
paths: makeDrivePaths('node-2'),
},
heartbeatIntervalMs: 500,
heartbeatTimeoutMs: 3000,
},
});
await waitFor(async () => {
const health = await nodes[1].getClusterHealth();
if (health.nodeId !== 'node-2' || health.peers?.length !== 2) {
throw new Error(JSON.stringify(health));
}
return true;
});
const identityPath = join(
baseDir,
'node-2',
'storage',
'.smartstorage',
'cluster',
'identity.json',
);
const topologyPath = join(
baseDir,
'node-2',
'storage',
'.smartstorage',
'cluster',
'topology.json',
);
const identity = JSON.parse(await readFile(identityPath, 'utf8')) as {
nodeId: string;
clusterId: string;
};
const topology = JSON.parse(await readFile(topologyPath, 'utf8')) as {
cluster_id: string;
nodes: Array<{ node_id: string }>;
};
expect(identity.nodeId).toEqual('node-2');
expect(identity.clusterId).toEqual(topology.cluster_id);
expect(topology.nodes.some((node) => node.node_id === 'node-1')).toEqual(true);
expect(topology.nodes.some((node) => node.node_id === 'node-3')).toEqual(true);
});
tap.test('teardown: stop clustered nodes and clean files', async () => {
for (const node of nodes.reverse()) {
await node.stop();
}
await rm(baseDir, { recursive: true, force: true });
});
export default tap.start();
@@ -0,0 +1,150 @@
import { expect, tap } from '@git.zone/tstest/tapbundle';
import {
CreateBucketCommand,
DeleteBucketCommand,
ListBucketsCommand,
S3Client,
} from '@aws-sdk/client-s3';
import * as smartstorage from '../ts/index.js';
const TEST_PORT = 3349;
const INITIAL_CREDENTIAL: smartstorage.IStorageCredential = {
accessKeyId: 'RUNTIMEINITIAL',
secretAccessKey: 'RUNTIMEINITIALSECRET123',
};
const ROTATED_CREDENTIAL_A: smartstorage.IStorageCredential = {
accessKeyId: 'RUNTIMEA',
secretAccessKey: 'RUNTIMEASECRET123',
};
const ROTATED_CREDENTIAL_B: smartstorage.IStorageCredential = {
accessKeyId: 'RUNTIMEB',
secretAccessKey: 'RUNTIMEBSECRET123',
};
const TEST_BUCKET = 'runtime-credentials-bucket';
let testSmartStorageInstance: smartstorage.SmartStorage;
let initialClient: S3Client;
let rotatedClientA: S3Client;
let rotatedClientB: S3Client;
function createS3Client(credential: smartstorage.IStorageCredential): S3Client {
return new S3Client({
endpoint: `http://localhost:${TEST_PORT}`,
region: 'us-east-1',
credentials: {
accessKeyId: credential.accessKeyId,
secretAccessKey: credential.secretAccessKey,
},
forcePathStyle: true,
});
}
tap.test('setup: start storage server with runtime-managed credentials', async () => {
testSmartStorageInstance = await smartstorage.SmartStorage.createAndStart({
server: {
port: TEST_PORT,
silent: true,
region: 'us-east-1',
},
storage: {
cleanSlate: true,
},
auth: {
enabled: true,
credentials: [INITIAL_CREDENTIAL],
},
});
initialClient = createS3Client(INITIAL_CREDENTIAL);
rotatedClientA = createS3Client(ROTATED_CREDENTIAL_A);
rotatedClientB = createS3Client(ROTATED_CREDENTIAL_B);
});
tap.test('startup credentials authenticate successfully', async () => {
const response = await initialClient.send(new ListBucketsCommand({}));
expect(response.$metadata.httpStatusCode).toEqual(200);
});
tap.test('listCredentials returns active credential metadata without secrets', async () => {
const credentials = await testSmartStorageInstance.listCredentials();
expect(credentials.length).toEqual(1);
expect(credentials[0].accessKeyId).toEqual(INITIAL_CREDENTIAL.accessKeyId);
expect((credentials[0] as any).secretAccessKey).toEqual(undefined);
});
tap.test('invalid replacement input fails cleanly and leaves old credentials active', async () => {
await expect(
testSmartStorageInstance.replaceCredentials([
{
accessKeyId: '',
secretAccessKey: 'invalid-secret',
},
]),
).rejects.toThrow();
const credentials = await testSmartStorageInstance.listCredentials();
expect(credentials.length).toEqual(1);
expect(credentials[0].accessKeyId).toEqual(INITIAL_CREDENTIAL.accessKeyId);
const response = await initialClient.send(new ListBucketsCommand({}));
expect(response.$metadata.httpStatusCode).toEqual(200);
});
tap.test('replacing credentials swaps the active set atomically', async () => {
await testSmartStorageInstance.replaceCredentials([
ROTATED_CREDENTIAL_A,
ROTATED_CREDENTIAL_B,
]);
const credentials = await testSmartStorageInstance.listCredentials();
expect(credentials.length).toEqual(2);
expect(credentials[0].accessKeyId).toEqual(ROTATED_CREDENTIAL_A.accessKeyId);
expect(credentials[1].accessKeyId).toEqual(ROTATED_CREDENTIAL_B.accessKeyId);
});
tap.test('old credentials stop working immediately for new requests', async () => {
await expect(initialClient.send(new ListBucketsCommand({}))).rejects.toThrow();
});
tap.test('first rotated credential authenticates successfully', async () => {
const response = await rotatedClientA.send(
new CreateBucketCommand({ Bucket: TEST_BUCKET }),
);
expect(response.$metadata.httpStatusCode).toEqual(200);
});
tap.test('multiple rotated credentials remain active', async () => {
const response = await rotatedClientB.send(new ListBucketsCommand({}));
expect(response.$metadata.httpStatusCode).toEqual(200);
expect(response.Buckets?.some((bucket) => bucket.Name === TEST_BUCKET)).toEqual(true);
});
tap.test('duplicate replacement input fails cleanly without changing the active set', async () => {
await expect(
testSmartStorageInstance.replaceCredentials([
ROTATED_CREDENTIAL_A,
{
accessKeyId: ROTATED_CREDENTIAL_A.accessKeyId,
secretAccessKey: 'another-secret',
},
]),
).rejects.toThrow();
const credentials = await testSmartStorageInstance.listCredentials();
expect(credentials.length).toEqual(2);
expect(credentials[0].accessKeyId).toEqual(ROTATED_CREDENTIAL_A.accessKeyId);
expect(credentials[1].accessKeyId).toEqual(ROTATED_CREDENTIAL_B.accessKeyId);
const response = await rotatedClientA.send(new ListBucketsCommand({}));
expect(response.$metadata.httpStatusCode).toEqual(200);
});
tap.test('teardown: clean up bucket and stop the storage server', async () => {
const response = await rotatedClientA.send(
new DeleteBucketCommand({ Bucket: TEST_BUCKET }),
);
expect(response.$metadata.httpStatusCode).toEqual(204);
await testSmartStorageInstance.stop();
});
export default tap.start();
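// A hedged sketch of a rotation helper built on the two APIs exercised above.
// replaceCredentials() swaps the whole active set atomically and rejects
// invalid or duplicate input without touching the current set; the helper
// name below is illustrative.
async function rotateCredentials(
  storage: smartstorage.SmartStorage,
  next: smartstorage.IStorageCredential[],
): Promise<void> {
  await storage.replaceCredentials(next); // old keys stop working for new requests immediately
  const active = await storage.listCredentials(); // metadata only; secrets are never echoed back
  console.log('active access key ids:', active.map((c) => c.accessKeyId).join(', '));
}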
+50
@@ -0,0 +1,50 @@
import { expect, tap } from '@git.zone/tstest/tapbundle';
import * as smartstorage from '../ts/index.js';
const TEST_PORT = 3353;
let testSmartStorageInstance: smartstorage.SmartStorage;
tap.test('setup: start storage server for operational endpoint checks', async () => {
testSmartStorageInstance = await smartstorage.SmartStorage.createAndStart({
server: {
port: TEST_PORT,
silent: true,
region: 'us-east-1',
},
storage: {
cleanSlate: true,
},
auth: {
enabled: false,
credentials: [],
},
});
});
tap.test('operational endpoints expose live, ready, health, and metrics', async () => {
const live = await fetch(`http://localhost:${TEST_PORT}/-/live`);
expect(live.status).toEqual(200);
expect((await live.json()).status).toEqual('alive');
const ready = await fetch(`http://localhost:${TEST_PORT}/-/ready`);
expect(ready.status).toEqual(200);
expect((await ready.json()).status).toEqual('ready');
const health = await fetch(`http://localhost:${TEST_PORT}/-/health`);
expect(health.status).toEqual(200);
const healthBody = await health.json();
expect(healthBody.ok).toEqual(true);
expect(healthBody.cluster.enabled).toEqual(false);
const metrics = await fetch(`http://localhost:${TEST_PORT}/-/metrics`);
expect(metrics.status).toEqual(200);
const metricsBody = await metrics.text();
expect(metricsBody.includes('smartstorage_requests_total')).toEqual(true);
expect(metricsBody.includes('smartstorage_cluster_enabled 0')).toEqual(true);
});
tap.test('teardown: stop storage server', async () => {
await testSmartStorageInstance.stop();
});
export default tap.start();
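// The endpoints checked above are plain HTTP and require no S3 signing, so
// they map directly onto container liveness/readiness probes. A hedged
// polling sketch; waitUntilReady and its retry parameters are illustrative.
async function waitUntilReady(baseUrl: string, attempts = 20): Promise<void> {
  for (let attempt = 0; attempt < attempts; attempt++) {
    try {
      const response = await fetch(`${baseUrl}/-/ready`);
      if (response.status === 200) return; // server reports {"status":"ready"}
    } catch {
      // connection refused while the server is still starting
    }
    await new Promise((resolve) => setTimeout(resolve, 250));
  }
  throw new Error(`${baseUrl} did not become ready in time`);
}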
+1 -1
@@ -3,6 +3,6 @@
*/
export const commitinfo = {
name: '@push.rocks/smartstorage',
-version: '6.0.1',
+version: '6.5.0',
description: 'A Node.js TypeScript package to create a local S3-compatible storage server using mapped local directories for development and testing purposes.'
}
+466 -5
@@ -1,15 +1,28 @@
import * as plugins from './plugins.js';
import * as paths from './paths.js';
/**
* Authentication configuration
*/
export interface IStorageCredential {
accessKeyId: string;
secretAccessKey: string;
bucketName?: string;
region?: string;
}
export interface IStorageCredentialMetadata {
accessKeyId: string;
bucketName?: string;
region?: string;
}
/**
* Authentication configuration
*/
export interface IAuthConfig {
enabled: boolean;
-credentials: Array<{
-accessKeyId: string;
-secretAccessKey: string;
-}>;
+credentials: IStorageCredential[];
}
/**
@@ -69,6 +82,36 @@ export interface IStorageConfig {
cleanSlate?: boolean;
}
/**
* Erasure coding configuration
*/
export interface IErasureConfig {
dataShards?: number;
parityShards?: number;
chunkSizeBytes?: number;
}
/**
* Drive configuration for multi-drive support
*/
export interface IDriveConfig {
paths: string[];
}
/**
* Cluster configuration for distributed mode
*/
export interface IClusterConfig {
enabled: boolean;
nodeId?: string;
quicPort?: number;
seedNodes?: string[];
erasure?: IErasureConfig;
drives?: IDriveConfig;
heartbeatIntervalMs?: number;
heartbeatTimeoutMs?: number;
}
/**
* Complete smartstorage configuration
*/
@@ -80,6 +123,188 @@ export interface ISmartStorageConfig {
logging?: ILoggingConfig;
limits?: ILimitsConfig;
multipart?: IMultipartConfig;
cluster?: IClusterConfig;
}
/**
* Logical bucket stats maintained by the Rust runtime.
* Values are initialized from native storage on startup and updated on smartstorage mutations.
*/
export interface IBucketSummary {
name: string;
objectCount: number;
totalSizeBytes: number;
creationDate?: number;
}
/**
* Filesystem-level capacity snapshot for the storage directory or configured drive path.
*/
export interface IStorageLocationSummary {
path: string;
totalBytes?: number;
availableBytes?: number;
usedBytes?: number;
}
/**
* Runtime storage stats served by the Rust core without issuing S3 list calls.
*/
export interface IStorageStats {
bucketCount: number;
totalObjectCount: number;
totalStorageBytes: number;
buckets: IBucketSummary[];
storageDirectory: string;
storageLocations?: IStorageLocationSummary[];
}
export interface IBucketTenantInput {
bucketName: string;
accessKeyId?: string;
secretAccessKey?: string;
region?: string;
}
export interface IDeleteBucketTenantInput {
bucketName: string;
accessKeyId?: string;
}
export interface IBucketTenantMetadata {
bucketName: string;
accessKeyId: string;
region?: string;
}
export interface IBucketTenantDescriptor extends plugins.tsclass.storage.IS3Descriptor {
endpoint: string;
port: number;
region: string;
bucket: string;
bucketName: string;
accessKeyId: string;
secretAccessKey: string;
accessKey: string;
accessSecret: string;
useSsl: boolean;
ssl: boolean;
env: Record<string, string>;
}
export interface IBucketExportObject {
key: string;
size: number;
md5: string;
metadata: Record<string, string>;
dataHex: string;
}
export interface IBucketExport {
format: 'smartstorage.bucket.v1';
bucketName: string;
exportedAt: number;
objects: IBucketExportObject[];
}
export interface IExportBucketInput {
bucketName: string;
}
export interface IImportBucketInput {
bucketName: string;
source: IBucketExport;
}
export interface ISmartStorageHealth {
ok: boolean;
running: boolean;
storageDirectory: string;
auth: {
enabled: boolean;
credentialCount: number;
tenantCredentialCount: number;
};
bucketCount: number;
objectCount: number;
totalBytes: number;
cluster: IClusterHealth;
}
export interface ISmartStorageMetrics {
bucketCount: number;
objectCount: number;
totalBytes: number;
authCredentialCount: number;
tenantCredentialCount: number;
clusterEnabled: boolean;
prometheusText: string;
}
/**
* Known peer status from the local node's current cluster view.
*/
export interface IClusterPeerHealth {
nodeId: string;
status: 'online' | 'suspect' | 'offline';
quicAddress?: string;
s3Address?: string;
driveCount?: number;
lastHeartbeat?: number;
missedHeartbeats?: number;
}
/**
* Local drive health as measured by smartstorage's runtime probes.
*/
export interface IClusterDriveHealth {
index: number;
path: string;
status: 'online' | 'degraded' | 'offline' | 'healing';
totalBytes?: number;
usedBytes?: number;
availableBytes?: number;
errorCount?: number;
lastError?: string;
lastCheck?: number;
erasureSetId?: number;
}
export interface IClusterErasureHealth {
dataShards: number;
parityShards: number;
chunkSizeBytes: number;
totalShards: number;
readQuorum: number;
writeQuorum: number;
erasureSetCount: number;
}
export interface IClusterRepairHealth {
active: boolean;
scanIntervalMs?: number;
lastRunStartedAt?: number;
lastRunCompletedAt?: number;
lastDurationMs?: number;
shardsChecked?: number;
shardsHealed?: number;
failed?: number;
lastError?: string;
}
/**
* Cluster runtime health from the Rust core.
* When clustering is disabled, the response is `{ enabled: false }`.
*/
export interface IClusterHealth {
enabled: boolean;
nodeId?: string;
quorumHealthy?: boolean;
majorityHealthy?: boolean;
peers?: IClusterPeerHealth[];
drives?: IClusterDriveHealth[];
erasure?: IClusterErasureHealth;
repairs?: IClusterRepairHealth;
}
/**
@@ -163,7 +388,16 @@ function mergeConfig(userConfig: ISmartStorageConfig): Required<ISmartStorageCon
...DEFAULT_CONFIG.multipart!,
...(userConfig.multipart || {}),
},
-};
+...(userConfig.cluster ? { cluster: userConfig.cluster } : {}),
} as Required<ISmartStorageConfig>;
}
function createAccessKeyId(): string {
return `SS${plugins.crypto.randomBytes(10).toString('hex').toUpperCase()}`;
}
function createSecretAccessKey(): string {
return plugins.crypto.randomBytes(32).toString('hex');
}
/**
@@ -173,6 +407,40 @@ type TRustStorageCommands = {
start: { params: { config: Required<ISmartStorageConfig> }; result: {} };
stop: { params: {}; result: {} };
createBucket: { params: { name: string }; result: {} };
createBucketTenant: {
params: {
bucketName: string;
accessKeyId: string;
secretAccessKey: string;
region?: string;
};
result: IStorageCredential;
};
deleteBucketTenant: {
params: { bucketName: string; accessKeyId?: string };
result: {};
};
rotateBucketTenantCredentials: {
params: {
bucketName: string;
accessKeyId: string;
secretAccessKey: string;
region?: string;
};
result: IStorageCredential;
};
listBucketTenants: { params: {}; result: IBucketTenantMetadata[] };
getBucketTenantCredential: {
params: { bucketName: string };
result: IStorageCredential;
};
exportBucket: { params: { bucketName: string }; result: IBucketExport };
importBucket: { params: { bucketName: string; source: IBucketExport }; result: {} };
getStorageStats: { params: {}; result: IStorageStats };
listBucketSummaries: { params: {}; result: IBucketSummary[] };
listCredentials: { params: {}; result: IStorageCredentialMetadata[] };
replaceCredentials: { params: { credentials: IStorageCredential[] }; result: {} };
getClusterHealth: { params: {}; result: IClusterHealth };
};
/**
@@ -189,6 +457,7 @@ export class SmartStorage {
// INSTANCE
public config: Required<ISmartStorageConfig>;
private bridge: InstanceType<typeof plugins.RustBridge<TRustStorageCommands>>;
private running = false;
constructor(configArg: ISmartStorageConfig = {}) {
this.config = mergeConfig(configArg);
@@ -208,6 +477,7 @@ export class SmartStorage {
throw new Error('Failed to spawn ruststorage binary. Make sure it is compiled (pnpm build).');
}
await this.bridge.sendCommand('start', { config: this.config });
this.running = true;
if (!this.config.server.silent) {
console.log('storage server is running');
@@ -237,13 +507,204 @@ export class SmartStorage {
};
}
private getEndpoint(): string {
return this.config.server.address === '0.0.0.0' ? 'localhost' : this.config.server.address!;
}
private buildBucketTenantDescriptor(
credential: IStorageCredential,
bucketNameArg: string,
): IBucketTenantDescriptor {
const bucketName = credential.bucketName || bucketNameArg;
const region = credential.region || this.config.server.region || 'us-east-1';
const endpoint = this.getEndpoint();
const port = this.config.server.port!;
const useSsl = false;
return {
endpoint,
port,
region,
bucket: bucketName,
bucketName,
accessKeyId: credential.accessKeyId,
secretAccessKey: credential.secretAccessKey,
accessKey: credential.accessKeyId,
accessSecret: credential.secretAccessKey,
useSsl,
ssl: useSsl,
env: {
S3_ENDPOINT: endpoint,
S3_PORT: String(port),
S3_REGION: region,
S3_BUCKET: bucketName,
S3_ACCESS_KEY_ID: credential.accessKeyId,
S3_SECRET_ACCESS_KEY: credential.secretAccessKey,
S3_USE_SSL: String(useSsl),
AWS_ACCESS_KEY_ID: credential.accessKeyId,
AWS_SECRET_ACCESS_KEY: credential.secretAccessKey,
AWS_REGION: region,
},
};
}
private assertTenantAuthEnabled(): void {
if (!this.config.auth.enabled) {
throw new Error('Bucket tenant APIs require auth.enabled=true.');
}
}
public async createBucket(bucketNameArg: string) {
await this.bridge.sendCommand('createBucket', { name: bucketNameArg });
return { name: bucketNameArg };
}
public async createBucketTenant(
tenantArg: IBucketTenantInput,
): Promise<IBucketTenantDescriptor> {
this.assertTenantAuthEnabled();
const credential = await this.bridge.sendCommand('createBucketTenant', {
bucketName: tenantArg.bucketName,
accessKeyId: tenantArg.accessKeyId || createAccessKeyId(),
secretAccessKey: tenantArg.secretAccessKey || createSecretAccessKey(),
region: tenantArg.region || this.config.server.region,
});
return this.buildBucketTenantDescriptor(credential, tenantArg.bucketName);
}
public async deleteBucketTenant(tenantArg: IDeleteBucketTenantInput): Promise<void> {
this.assertTenantAuthEnabled();
await this.bridge.sendCommand('deleteBucketTenant', tenantArg);
}
public async rotateBucketTenantCredentials(
tenantArg: IBucketTenantInput,
): Promise<IBucketTenantDescriptor> {
this.assertTenantAuthEnabled();
const credential = await this.bridge.sendCommand('rotateBucketTenantCredentials', {
bucketName: tenantArg.bucketName,
accessKeyId: tenantArg.accessKeyId || createAccessKeyId(),
secretAccessKey: tenantArg.secretAccessKey || createSecretAccessKey(),
region: tenantArg.region || this.config.server.region,
});
return this.buildBucketTenantDescriptor(credential, tenantArg.bucketName);
}
public async listBucketTenants(): Promise<IBucketTenantMetadata[]> {
return this.bridge.sendCommand('listBucketTenants', {});
}
public async getBucketTenantDescriptor(optionsArg: {
bucketName: string;
}): Promise<IBucketTenantDescriptor> {
const credential = await this.bridge.sendCommand('getBucketTenantCredential', {
bucketName: optionsArg.bucketName,
});
return this.buildBucketTenantDescriptor(credential, optionsArg.bucketName);
}
public async exportBucket(optionsArg: IExportBucketInput): Promise<IBucketExport> {
return this.bridge.sendCommand('exportBucket', { bucketName: optionsArg.bucketName });
}
public async importBucket(optionsArg: IImportBucketInput): Promise<void> {
await this.bridge.sendCommand('importBucket', optionsArg);
}
public async getStorageStats(): Promise<IStorageStats> {
return this.bridge.sendCommand('getStorageStats', {});
}
public async listBucketSummaries(): Promise<IBucketSummary[]> {
return this.bridge.sendCommand('listBucketSummaries', {});
}
public async listCredentials(): Promise<IStorageCredentialMetadata[]> {
return this.bridge.sendCommand('listCredentials', {});
}
public async replaceCredentials(credentials: IStorageCredential[]): Promise<void> {
await this.bridge.sendCommand('replaceCredentials', { credentials });
this.config.auth.credentials = credentials.map((credential) => ({ ...credential }));
}
public async getClusterHealth(): Promise<IClusterHealth> {
return this.bridge.sendCommand('getClusterHealth', {});
}
public async getHealth(): Promise<ISmartStorageHealth> {
if (!this.running) {
return {
ok: false,
running: false,
storageDirectory: this.config.storage.directory || paths.bucketsDir,
auth: {
enabled: this.config.auth.enabled,
credentialCount: this.config.auth.credentials.length,
tenantCredentialCount: 0,
},
bucketCount: 0,
objectCount: 0,
totalBytes: 0,
cluster: { enabled: false },
};
}
const [stats, credentials, tenants, cluster] = await Promise.all([
this.getStorageStats(),
this.listCredentials(),
this.listBucketTenants(),
this.getClusterHealth(),
]);
return {
ok: true,
running: true,
storageDirectory: stats.storageDirectory,
auth: {
enabled: this.config.auth.enabled,
credentialCount: credentials.length,
tenantCredentialCount: tenants.length,
},
bucketCount: stats.bucketCount,
objectCount: stats.totalObjectCount,
totalBytes: stats.totalStorageBytes,
cluster,
};
}
public async getMetrics(): Promise<ISmartStorageMetrics> {
const health = await this.getHealth();
const clusterEnabled = health.cluster.enabled;
return {
bucketCount: health.bucketCount,
objectCount: health.objectCount,
totalBytes: health.totalBytes,
authCredentialCount: health.auth.credentialCount,
tenantCredentialCount: health.auth.tenantCredentialCount,
clusterEnabled,
prometheusText: [
'# HELP smartstorage_buckets_total Runtime bucket count.',
'# TYPE smartstorage_buckets_total gauge',
`smartstorage_buckets_total ${health.bucketCount}`,
'# HELP smartstorage_objects_total Runtime object count.',
'# TYPE smartstorage_objects_total gauge',
`smartstorage_objects_total ${health.objectCount}`,
'# HELP smartstorage_storage_bytes_total Runtime storage bytes.',
'# TYPE smartstorage_storage_bytes_total gauge',
`smartstorage_storage_bytes_total ${health.totalBytes}`,
'# HELP smartstorage_tenant_credentials_total Scoped bucket tenant credential count.',
'# TYPE smartstorage_tenant_credentials_total gauge',
`smartstorage_tenant_credentials_total ${health.auth.tenantCredentialCount}`,
'# HELP smartstorage_cluster_enabled Cluster mode enabled.',
'# TYPE smartstorage_cluster_enabled gauge',
`smartstorage_cluster_enabled ${clusterEnabled ? 1 : 0}`,
].join('\n'),
};
}
public async stop() {
await this.bridge.sendCommand('stop', {});
this.bridge.kill();
this.running = false;
}
}
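// Hedged usage sketch for the tenant APIs defined above: createBucketTenant()
// requires auth.enabled=true and returns a descriptor whose env block can be
// handed to a tenant process as-is. `storage` is a stand-in for a started
// SmartStorage instance, not a name from this diff.
import * as smartstorage from './index.js';
declare const storage: smartstorage.SmartStorage;

const tenant = await storage.createBucketTenant({ bucketName: 'tenant-a' });
for (const [key, value] of Object.entries(tenant.env)) {
  process.env[key] = value; // S3_ENDPOINT, AWS_ACCESS_KEY_ID, ... as listed in IBucketTenantDescriptor
}
// Later, rotate the tenant's keys without tracking the old access key id:
const rotated = await storage.rotateBucketTenantCredentials({ bucketName: 'tenant-a' });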
+2 -1
@@ -1,7 +1,8 @@
// node native
import * as crypto from 'crypto';
import * as path from 'path';
-export { path };
+export { crypto, path };
// @push.rocks scope
import * as smartpath from '@push.rocks/smartpath';
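// The crypto re-export above backs the auto-generated tenant keys; a hedged
// sketch of the key-material shape produced by createAccessKeyId() and
// createSecretAccessKey() in classes.smartstorage.ts:
import * as nodeCrypto from 'crypto';
const accessKeyId = `SS${nodeCrypto.randomBytes(10).toString('hex').toUpperCase()}`; // "SS" + 20 uppercase hex chars
const secretAccessKey = nodeCrypto.randomBytes(32).toString('hex'); // 64 hex chars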
+3
@@ -5,6 +5,9 @@
"moduleResolution": "NodeNext", "moduleResolution": "NodeNext",
"esModuleInterop": true, "esModuleInterop": true,
"verbatimModuleSyntax": true, "verbatimModuleSyntax": true,
"types": ["node"],
"noImplicitAny": true,
"ignoreDeprecations": "6.0",
"baseUrl": ".", "baseUrl": ".",
"paths": {} "paths": {}
}, },