Monorepo consolidation: workspace, shared types, transport plans, docker/swarm assets
This commit is contained in:
2
.cargo/config.toml
Normal file
2
.cargo/config.toml
Normal file
@@ -0,0 +1,2 @@
|
||||
[registries.madapes]
|
||||
index = "sparse+https://git.madapes.com/api/packages/madapes/cargo/"
|
||||
15
.dockerignore
Normal file
15
.dockerignore
Normal file
@@ -0,0 +1,15 @@
|
||||
target/
|
||||
**/target/
|
||||
|
||||
node_modules/
|
||||
**/node_modules/
|
||||
|
||||
dist/
|
||||
**/dist/
|
||||
|
||||
.git/
|
||||
**/.git/
|
||||
|
||||
.DS_Store
|
||||
|
||||
control/ui/.vite/
|
||||
47
.github/workflows/ci.yml
vendored
Normal file
47
.github/workflows/ci.yml
vendored
Normal file
@@ -0,0 +1,47 @@
|
||||
name: ci
|
||||
|
||||
on:
|
||||
push:
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
ui:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 10
|
||||
defaults:
|
||||
run:
|
||||
working-directory: control/ui
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: npm
|
||||
cache-dependency-path: control/ui/package-lock.json
|
||||
|
||||
- run: npm config set registry https://registry.npmjs.org
|
||||
- run: npm ci
|
||||
- run: npm run lint
|
||||
- run: npm run typecheck
|
||||
- run: npm run test
|
||||
- run: npm run build
|
||||
|
||||
rust:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
defaults:
|
||||
run:
|
||||
working-directory: .
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
with:
|
||||
components: rustfmt, clippy
|
||||
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
|
||||
- run: cargo fmt --check
|
||||
- run: cargo clippy --workspace --all-targets -- -D warnings
|
||||
- run: cargo test --workspace
|
||||
12
.gitignore
vendored
Normal file
12
.gitignore
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
target/
|
||||
/target-*/
|
||||
**/target/
|
||||
**/target-*/
|
||||
|
||||
node_modules/
|
||||
**/node_modules/
|
||||
|
||||
dist/
|
||||
**/dist/
|
||||
|
||||
.DS_Store
|
||||
6050
Cargo.lock
generated
Normal file
6050
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
15
Cargo.toml
Normal file
15
Cargo.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
[workspace]
|
||||
resolver = "2"
|
||||
members = [
|
||||
"aggregate",
|
||||
"gateway",
|
||||
"projection",
|
||||
"runner",
|
||||
"shared",
|
||||
"control/api",
|
||||
]
|
||||
|
||||
[profile.release]
|
||||
lto = true
|
||||
codegen-units = 1
|
||||
strip = "symbols"
|
||||
56
DOCKER.md
Normal file
56
DOCKER.md
Normal file
@@ -0,0 +1,56 @@
|
||||
# Docker
|
||||
|
||||
## Local Dev (Compose)
|
||||
|
||||
```bash
|
||||
docker compose up -d --build
|
||||
docker compose ps
|
||||
docker compose down -v
|
||||
```
|
||||
|
||||
To include the observability stack (Grafana/Loki/Tempo/VictoriaMetrics) with the local compose:
|
||||
|
||||
```bash
|
||||
docker compose -f docker-compose.yml -f observability/docker-compose.yml up -d --build
|
||||
docker compose -f docker-compose.yml -f observability/docker-compose.yml down -v
|
||||
```
|
||||
|
||||
Service ports in the default compose:
|
||||
- Gateway HTTP: `http://localhost:8080`
|
||||
- Gateway gRPC: `localhost:8081`
|
||||
- Aggregate gRPC: `localhost:50051`
|
||||
- Aggregate HTTP: `http://localhost:18080`
|
||||
- Runner HTTP: `http://localhost:28080`
|
||||
- Control API: `http://localhost:38080`
|
||||
- Control UI: `http://localhost:8082`
|
||||
- NATS: `nats://localhost:4222`, monitoring `http://localhost:8222`
|
||||
|
||||
## Swarm (Dev)
|
||||
|
||||
Build images:
|
||||
|
||||
```bash
|
||||
sh docker/scripts/build_images.sh all
|
||||
```
|
||||
|
||||
Create dev secrets required by the observability stack:
|
||||
|
||||
```bash
|
||||
sh docker/scripts/swarm_dev_secrets.sh
|
||||
```
|
||||
|
||||
Deploy:
|
||||
|
||||
```bash
|
||||
docker stack deploy -c swarm/stacks/platform.yml cloudlysis
|
||||
docker stack deploy -c swarm/stacks/control-plane.yml cloudlysis_control
|
||||
docker stack deploy -c swarm/stacks/observability.yml cloudlysis_obs
|
||||
```
|
||||
|
||||
Remove:
|
||||
|
||||
```bash
|
||||
docker stack rm cloudlysis_obs
|
||||
docker stack rm cloudlysis_control
|
||||
docker stack rm cloudlysis
|
||||
```
|
||||
216
GATEWAY_TRANSPORT_PLAN.md
Normal file
216
GATEWAY_TRANSPORT_PLAN.md
Normal file
@@ -0,0 +1,216 @@
|
||||
# Gateway Transport Plan
|
||||
|
||||
## Purpose
|
||||
Standardize and optimize how the Gateway communicates with Aggregate, Projection, and Runner, and how nodes communicate via NATS JetStream, under these principles:
|
||||
- Simplicity (few patterns, minimal bespoke conventions)
|
||||
- Ease of operation (consistent health/ready/metrics, consistent failure modes)
|
||||
- Frugality (bounded connections, bounded fanout, low overhead)
|
||||
- High performance (low tail latency, backpressure-aware, predictable routing)
|
||||
- Safety (tenant isolation, deny-by-default authz, consistent context propagation)
|
||||
|
||||
## Non-Negotiable Rules (Global)
|
||||
- Every cross-service request MUST carry tenant + trace context.
|
||||
- Every transport path MUST have explicit timeouts/deadlines and bounded retries.
|
||||
- Every milestone below is “stop-the-line” gated:
|
||||
- All tasks completed
|
||||
- All tests passing
|
||||
- Workspace lint/format/type checks passing
|
||||
- Required integration tests for the milestone passing (when gated by env, they must be runnable and documented)
|
||||
|
||||
## Current State (Baseline)
|
||||
- Gateway → Aggregate: gRPC command submission
|
||||
- Gateway → Projection: HTTP query proxy (`/v1/query/*`)
|
||||
- Gateway → Runner: HTTP proxy for admin endpoints (`/admin/runner/*`)
|
||||
- Nodes ↔ NATS JetStream: events/workflow streams with headers for tenant/correlation/trace (now more consistent)
|
||||
|
||||
## Target Architecture (End State)
|
||||
- Edge contract (clients ↔ Gateway): HTTP/JSON (stable, debuggable, browser + ops friendly)
|
||||
- Internal RPC (Gateway ↔ services): gRPC for Aggregate + Projection + Runner (single internal RPC stack)
|
||||
- Async/event backbone: NATS JetStream remains for event/work distribution
|
||||
- `shared` is the single source of truth for:
|
||||
- Header names and propagation rules
|
||||
- Trace parsing/validation rules (`traceparent`, `trace-id`)
|
||||
- Request context representation (tenant/correlation/trace)
|
||||
|
||||
## Definitions
|
||||
### Request Context
|
||||
Fields that must be consistently propagated:
|
||||
- `tenant_id` (HTTP: `x-tenant-id`, NATS: `tenant-id`)
|
||||
- `correlation_id` (HTTP: `x-correlation-id`, NATS: `x-correlation-id` and `correlation-id`)
|
||||
- `traceparent` (HTTP: `traceparent`, NATS: `traceparent`)
|
||||
- `trace_id` (derived from `traceparent` or provided explicitly; NATS: `trace-id`)
|
||||
- `request_id` (HTTP: `x-request-id`, optional for NATS)
|
||||
|
||||
### Standard Health Endpoints (per service)
|
||||
- `GET /health` liveness
|
||||
- `GET /ready` readiness (includes tenant gating if applicable)
|
||||
- `GET /metrics` Prometheus
|
||||
|
||||
## Milestone 0: Transport Contract Lock-in (Context + Headers Everywhere)
|
||||
|
||||
### Goal
|
||||
Make context propagation and header naming consistent and enforceable across HTTP, gRPC, and NATS, including “background” Gateway calls (health checks, rebalance probes).
|
||||
|
||||
### Exit Criteria
|
||||
- A single shared contract exists for header names and trace parsing.
|
||||
- Gateway injects context into all upstream calls (including rebalance/health probes).
|
||||
- Aggregate/Projection/Runner consistently emit/consume the standard context on all transport paths they own.
|
||||
- Unit tests prove propagation behavior for each transport.
|
||||
- `cargo fmt --check`, `cargo clippy --workspace --all-targets -- -D warnings`, `cargo test --workspace` all pass.
|
||||
|
||||
### Tasks
|
||||
- [ ] Standardize header constants in `shared` and remove string literals from Gateway and nodes where feasible.
|
||||
- [ ] Add `shared` helpers for:
|
||||
- HTTP extract/inject
|
||||
- gRPC metadata extract/inject
|
||||
- NATS header extract/inject
|
||||
- [ ] Gateway: ensure context is injected into:
|
||||
- gRPC upstream requests to Aggregate
|
||||
- HTTP upstream requests to Projection
|
||||
- Runner admin proxy requests
|
||||
- Any “probe” calls (rebalance gates, fleet snapshots, health checks)
|
||||
- [ ] Projection/Runner/Aggregate: ensure NATS published messages include:
|
||||
- `tenant-id`
|
||||
- `x-correlation-id` + `correlation-id`
|
||||
- `traceparent`
|
||||
- `trace-id` (derived when possible)
|
||||
- [ ] Add transport-level tests:
|
||||
- [ ] Gateway gRPC path: incoming context → upstream metadata → response metadata preserved
|
||||
- [ ] Gateway HTTP proxy path: incoming context → upstream headers preserved
|
||||
- [ ] NATS publish path: produced headers contain expected keys/values
|
||||
|
||||
### Required Tests
|
||||
- Unit tests for shared parsing/derivation utilities
|
||||
- Existing per-crate test suites
|
||||
- At least one per-service “transport contract” test verifying headers are present and correct
|
||||
|
||||
## Milestone 1: Internal RPC Standardization (Projection via gRPC)
|
||||
|
||||
### Goal
|
||||
Eliminate Gateway → Projection HTTP proxy as the default path by introducing an internal gRPC Query service, keeping HTTP optional for human/debug use.
|
||||
|
||||
### Exit Criteria
|
||||
- A Projection gRPC service exists for query execution.
|
||||
- Gateway routes queries to Projection via gRPC by default.
|
||||
- Authorization semantics remain enforced in Gateway (deny-by-default).
|
||||
- Response shapes are stable and match the existing UI expectations.
|
||||
- All tests pass, including new gRPC query integration tests.
|
||||
|
||||
### Tasks
|
||||
- [ ] Define protobuf API: `projection.gateway.v1.QueryService`
|
||||
- [ ] Request includes tenant + view + query payload and metadata
|
||||
- [ ] Response includes result payload and standard context propagation
|
||||
- [ ] Implement Projection gRPC server:
|
||||
- [ ] Parse tenant/view/query
|
||||
- [ ] Execute query against current projection storage/query engine
|
||||
- [ ] Enforce tenant scope
|
||||
- [ ] Implement Gateway gRPC client path for queries:
|
||||
- [ ] Routing by tenant to Projection endpoint
|
||||
- [ ] Deadlines, bounded retries (idempotent only)
|
||||
- [ ] Context propagation (tenant/correlation/trace)
|
||||
- [ ] Keep HTTP `/v1/query/*`:
|
||||
- [ ] Either route to internal gRPC implementation or keep as legacy/debug endpoint
|
||||
- [ ] Add tests:
|
||||
- [ ] Gateway query authz + forwarding via gRPC
|
||||
- [ ] Projection gRPC query contract tests for tenant isolation
|
||||
|
||||
### Required Tests
|
||||
- New gRPC QueryService tests (unit + integration)
|
||||
- Existing query/authz tests in Gateway
|
||||
- Workspace fmt/clippy/test
|
||||
|
||||
## Milestone 2: Internal RPC Standardization (Runner Admin via gRPC)
|
||||
|
||||
### Goal
|
||||
Replace `/admin/runner/*` HTTP proxying with a first-class gRPC admin service for Runner operations.
|
||||
|
||||
### Exit Criteria
|
||||
- Runner exposes a gRPC admin service for the admin surface required by Control/Gateway.
|
||||
- Gateway uses gRPC to call Runner admin APIs.
|
||||
- Authentication/authorization remains in Gateway; Runner trusts Gateway boundary.
|
||||
- Admin operations are idempotent where appropriate and include audit hooks where required.
|
||||
- All tests pass and include negative/tenant-spoof cases.
|
||||
|
||||
### Tasks
|
||||
- [ ] Define protobuf API: `runner.admin.v1.RunnerAdmin`
|
||||
- [ ] Drain/resume/status/reload/tenant-scoped controls
|
||||
- [ ] Standard error mapping
|
||||
- [ ] Implement Runner gRPC admin server:
|
||||
- [ ] Tenant gating enforced for tenant-scoped operations
|
||||
- [ ] Readiness/drain semantics aligned with platform contracts
|
||||
- [ ] Implement Gateway gRPC client integration:
|
||||
- [ ] Route to Runner endpoint via routing table
|
||||
- [ ] Enforce authz rights (e.g. `runner.admin`)
|
||||
- [ ] Context propagation
|
||||
- [ ] Keep HTTP `/admin/*` in Runner optional:
|
||||
- [ ] Either remove Gateway proxy usage or keep for direct debugging behind secure network
|
||||
- [ ] Tests:
|
||||
- [ ] Gateway: admin calls rejected without rights
|
||||
- [ ] Gateway: tenant spoof attempts rejected
|
||||
- [ ] Runner: idempotency and drain semantics validated
|
||||
|
||||
### Required Tests
|
||||
- gRPC RunnerAdmin unit/integration tests
|
||||
- Gateway proxy-to-gRPC tests
|
||||
- Workspace fmt/clippy/test
|
||||
|
||||
## Milestone 3: Connection + Retry Policy Unification (Performance + Frugality)
|
||||
|
||||
### Goal
|
||||
Make upstream connection management and retry behavior consistent and bounded across Gateway and nodes.
|
||||
|
||||
### Exit Criteria
|
||||
- Gateway maintains bounded upstream connection pools for gRPC endpoints.
|
||||
- All gRPC calls have deadlines; retries are only for idempotent operations.
|
||||
- All probe/fanout calls are bounded and do not cause thundering herds.
|
||||
- Load/soak tests show stable behavior under partial failure.
|
||||
|
||||
### Tasks
|
||||
- [ ] Implement a Gateway upstream channel pool:
|
||||
- [ ] LRU bounded by max endpoints
|
||||
- [ ] TTL/eviction strategy
|
||||
- [ ] Fast path reuse under load
|
||||
- [ ] Standardize retry profiles:
|
||||
- [ ] Read-only: short retry with jitter
|
||||
- [ ] Mutations: no automatic retry unless idempotency key present
|
||||
- [ ] Standardize timeouts:
|
||||
- [ ] Edge timeout limits
|
||||
- [ ] Internal per-service deadlines
|
||||
- [ ] Fanout controls:
|
||||
- [ ] Concurrency limiters for fleet snapshot/probes
|
||||
- [ ] Cache results where safe (short TTL)
|
||||
|
||||
### Required Tests
|
||||
- Unit tests for pool eviction/TTL
|
||||
- Gateway integration tests for deadline propagation
|
||||
- Gated load tests (document env + how to run)
|
||||
|
||||
## Milestone 4: Transport Simplification Cleanup (Remove Legacy Paths)
|
||||
|
||||
### Goal
|
||||
Remove or de-prioritize legacy HTTP internal paths so the “happy path” uses: HTTP edge → Gateway → gRPC internal → NATS async.
|
||||
|
||||
### Exit Criteria
|
||||
- Gateway no longer depends on HTTP for Projection queries or Runner admin.
|
||||
- Legacy endpoints are either removed or explicitly marked “debug-only” and not used by Gateway/Control.
|
||||
- All operational playbooks rely on standardized endpoints.
|
||||
|
||||
### Tasks
|
||||
- [ ] Remove Gateway’s HTTP query proxy usage (or keep only as compatibility shim).
|
||||
- [ ] Remove Gateway’s runner admin HTTP proxy usage (or keep only as compatibility shim).
|
||||
- [ ] Ensure Control UI + Control API use the standardized Gateway surfaces.
|
||||
- [ ] Harden metrics and health probes to always carry context.
|
||||
|
||||
### Required Tests
|
||||
- End-to-end smoke tests (gated)
|
||||
- Workspace fmt/clippy/test
|
||||
|
||||
## Verification Commands (Required at Each Milestone)
|
||||
- `cargo fmt --check`
|
||||
- `cargo clippy --workspace --all-targets -- -D warnings`
|
||||
- `cargo test --workspace`
|
||||
- `npm ci && npm run lint && npm run typecheck && npm run test && npm run build` (in `control/ui`)
|
||||
|
||||
## Notes / Constraints
|
||||
- Do not break wire compatibility for NATS subjects or event payloads; evolve via optional fields and tolerant decoding.
|
||||
- Keep tenant isolation rules enforced at the Gateway boundary and re-validated at nodes where it is safety-critical.
|
||||
58
Makefile
Normal file
58
Makefile
Normal file
@@ -0,0 +1,58 @@
|
||||
.PHONY: docker-build-platform docker-build-control docker-build-observability docker-build-all
|
||||
.PHONY: compose-up compose-down compose-ps compose-up-observability compose-down-observability
|
||||
.PHONY: swarm-dev-secrets swarm-deploy-platform swarm-deploy-control swarm-deploy-observability swarm-deploy-all
|
||||
.PHONY: swarm-rm-platform swarm-rm-control swarm-rm-observability swarm-rm-all
|
||||
|
||||
docker-build-platform:
|
||||
sh docker/scripts/build_images.sh platform
|
||||
|
||||
docker-build-control:
|
||||
sh docker/scripts/build_images.sh control
|
||||
|
||||
docker-build-observability:
|
||||
true
|
||||
|
||||
docker-build-all:
|
||||
sh docker/scripts/build_images.sh all
|
||||
|
||||
compose-up:
|
||||
docker compose up -d --build
|
||||
|
||||
compose-up-observability:
|
||||
docker compose -f docker-compose.yml -f observability/docker-compose.yml up -d --build
|
||||
|
||||
compose-down:
|
||||
docker compose down -v
|
||||
|
||||
compose-down-observability:
|
||||
docker compose -f docker-compose.yml -f observability/docker-compose.yml down -v
|
||||
|
||||
compose-ps:
|
||||
docker compose ps
|
||||
|
||||
swarm-dev-secrets:
|
||||
sh docker/scripts/swarm_dev_secrets.sh
|
||||
|
||||
swarm-deploy-platform:
|
||||
docker stack deploy -c swarm/stacks/platform.yml cloudlysis
|
||||
|
||||
swarm-deploy-control:
|
||||
docker stack deploy -c swarm/stacks/control-plane.yml cloudlysis_control
|
||||
|
||||
swarm-deploy-observability:
|
||||
docker stack deploy -c swarm/stacks/observability.yml cloudlysis_obs
|
||||
|
||||
swarm-deploy-all: swarm-dev-secrets swarm-deploy-platform swarm-deploy-control swarm-deploy-observability
|
||||
true
|
||||
|
||||
swarm-rm-platform:
|
||||
docker stack rm cloudlysis
|
||||
|
||||
swarm-rm-control:
|
||||
docker stack rm cloudlysis_control
|
||||
|
||||
swarm-rm-observability:
|
||||
docker stack rm cloudlysis_obs
|
||||
|
||||
swarm-rm-all: swarm-rm-observability swarm-rm-control swarm-rm-platform
|
||||
true
|
||||
246
NATS_TRANSPORT_PLAN.md
Normal file
246
NATS_TRANSPORT_PLAN.md
Normal file
@@ -0,0 +1,246 @@
|
||||
# NATS Transport Plan
|
||||
|
||||
## Purpose
|
||||
Standardize and optimize how nodes (Aggregate, Projection, Runner, Gateway where applicable) use NATS JetStream and NATS KV, under these principles:
|
||||
- Simplicity (few primitives, consistent naming, minimal per-service divergence)
|
||||
- Ease of operation (predictable streams/consumers, clear runbooks, easy debugging)
|
||||
- Frugality (bounded consumers, bounded in-flight work, minimal churn, minimal storage)
|
||||
- Low resource usage (stable durable consumers, controlled ack waits, limited fanout)
|
||||
- High performance (high throughput, low tail latency, reliable backpressure)
|
||||
- Safety (tenant isolation, idempotency, deterministic replay, poison handling)
|
||||
|
||||
## Non-Negotiable Rules (Global)
|
||||
- Every JetStream stream/consumer MUST have an explicit contract:
|
||||
- name, subjects, retention, storage, replication, max sizes
|
||||
- ack policy, ack wait, max deliver, max in flight
|
||||
- Every node MUST run with bounded work:
|
||||
- bounded pull batch sizes
|
||||
- bounded concurrency
|
||||
- bounded retry/backoff
|
||||
- Every message MUST be tenant-scoped in subject and/or headers.
|
||||
- Every milestone below is “stop-the-line” gated:
|
||||
- all tasks completed
|
||||
- all tests passing
|
||||
- workspace lint/format checks passing
|
||||
- required NATS-gated integration tests for the milestone passing (when gated by env)
|
||||
|
||||
## Current State (Baseline)
|
||||
- Streams:
|
||||
- `AGGREGATE_EVENTS` (Aggregate publishes, Projection/Runner consume)
|
||||
- `WORKFLOW_COMMANDS`, `WORKFLOW_EVENTS` (Runner)
|
||||
- Subject conventions:
|
||||
- Aggregate events: `tenant.<tenant_id>.aggregate.<aggregate_type>.<aggregate_id>`
|
||||
- Defaults often use filters like `tenant.*.aggregate.*.*`
|
||||
- Durable consumers:
|
||||
- Projection uses a durable name (configurable)
|
||||
- Runner uses configurable durable prefix per role
|
||||
- Aggregate had ad-hoc fetch consumer risks; now mitigated with unique consumer names per fetch
|
||||
- Headers:
|
||||
- Tenant + correlation + trace headers exist but were historically inconsistent; shared utilities now exist
|
||||
|
||||
## Target Architecture (End State)
|
||||
- A single “NATS wire protocol” contract shared across services:
|
||||
- subject naming
|
||||
- required headers (tenant/correlation/trace)
|
||||
- message envelope compatibility rules (tolerant decoding, optional fields)
|
||||
- Stable, minimal set of JetStream streams:
|
||||
- one stream per message class (aggregate events, workflow commands, workflow events)
|
||||
- no per-tenant streams unless there is a strong operational reason
|
||||
- Stable, limited consumers:
|
||||
- durable consumers for long-lived processors (Projection, Runner)
|
||||
- ephemeral consumers only for bounded ad-hoc operations (Aggregate fetch), always unique + best-effort deletion
|
||||
- Uniform backpressure + reliability defaults:
|
||||
- explicit ack
|
||||
- bounded `max_ack_pending` and application-level concurrency
|
||||
- bounded redelivery via `max_deliver` + poison policy
|
||||
|
||||
## Definitions
|
||||
### Message Context (Headers)
|
||||
Standard headers for NATS published messages:
|
||||
- `tenant-id` (required)
|
||||
- `x-correlation-id` and `correlation-id` (required for any request-derived message; generated if missing)
|
||||
- `traceparent` (optional but recommended; generated/propagated if present upstream)
|
||||
- `trace-id` (optional; derived from traceparent when possible)
|
||||
- `Nats-Msg-Id` (required for idempotent publish/dedupe when applicable)
|
||||
|
||||
### Subject Naming Rules
|
||||
- Tenant-first prefix: `tenant.<tenant_id>.…`
|
||||
- Stable message class token:
|
||||
- `aggregate` for domain events
|
||||
- `effect`, `effect_result`, `workflow`, `workflow_event` for Runner
|
||||
- No ambiguous wildcard publishing:
|
||||
- producers publish concrete subjects only
|
||||
- consumers may filter with wildcards
|
||||
|
||||
### Consumer Naming Rules
|
||||
- Durable consumer names must be stable and collision-free:
|
||||
- include role + mode + optional view/saga name + shard/group
|
||||
- Ephemeral consumer names must be unique per operation:
|
||||
- include tenant + purpose + uuid
|
||||
- must be deleted best-effort when operation completes
|
||||
|
||||
## Milestone 0: NATS Wire Contract Lock-in (Names, Headers, Envelopes)
|
||||
|
||||
### Goal
|
||||
Make the NATS/JetStream wire contract explicit and enforced in code so all producers/consumers interoperate safely across scale-out and rolling restarts.
|
||||
|
||||
### Exit Criteria
|
||||
- `shared` exposes NATS header constants and helpers for inject/extract/derive.
|
||||
- All producers set required headers consistently.
|
||||
- All consumers tolerate unknown fields and missing optional fields.
|
||||
- A single, documented subject naming convention is enforced in code (builder functions).
|
||||
- Workspace fmt/clippy/tests pass.
|
||||
|
||||
### Tasks
|
||||
- [ ] Centralize NATS header constants and helpers in `shared`:
|
||||
- [ ] inject headers for publish (tenant, correlation, trace)
|
||||
- [ ] extract headers on receive (best-effort)
|
||||
- [ ] derive `trace-id` from `traceparent`
|
||||
- [ ] Aggregate:
|
||||
- [ ] Ensure event publishing always sets `tenant-id`, correlation headers, trace headers
|
||||
- [ ] Ensure `Nats-Msg-Id` strategy is correct for idempotency/dedupe (document and test)
|
||||
- [ ] Projection:
|
||||
- [ ] Ensure EventEnvelope decoding remains tolerant (unknown fields ignored, optional IDs supported)
|
||||
- [ ] Ensure correlation/trace context is carried into spans/metrics consistently
|
||||
- [ ] Runner:
|
||||
- [ ] Ensure publish paths include correlation/trace headers consistently for commands and results
|
||||
- [ ] Ensure outbox metadata → NATS headers mapping is consistent and tested
|
||||
- [ ] Tests:
|
||||
- [ ] Unit tests for header injection/extraction in `shared`
|
||||
- [ ] Per-service unit tests asserting produced headers include required keys
|
||||
|
||||
### Required Tests
|
||||
- `cargo fmt --check`
|
||||
- `cargo clippy --workspace --all-targets -- -D warnings`
|
||||
- `cargo test --workspace`
|
||||
|
||||
## Milestone 1: Stream Configuration Standardization (Retention, Limits, Storage)
|
||||
|
||||
### Goal
|
||||
Make stream configs consistent, explicit, and operationally sane across environments (dev → prod), minimizing surprise and preventing runaway resource usage.
|
||||
|
||||
### Exit Criteria
|
||||
- Stream config for each stream is explicitly defined and validated at startup.
|
||||
- Limits (max messages/bytes/age) are explicit and have defaults.
|
||||
- Duplicate windows and dedupe behavior are explicit and tested.
|
||||
- A “no destructive changes on startup” policy is enforced (create if missing; do not silently replace).
|
||||
|
||||
### Tasks
|
||||
- [ ] Define a single “stream config policy” module per service (or shared helper):
|
||||
- [ ] `AGGREGATE_EVENTS` subjects + retention policy
|
||||
- [ ] `WORKFLOW_COMMANDS` subjects + retention policy
|
||||
- [ ] `WORKFLOW_EVENTS` subjects + retention policy
|
||||
- [ ] Standardize defaults:
|
||||
- [ ] retention: limits appropriate for replay + rebuild
|
||||
- [ ] `duplicate_window` aligned with producer idempotency strategy
|
||||
- [ ] storage type and replication policy documented and configurable
|
||||
- [ ] Add startup validations:
|
||||
- [ ] verify stream exists and matches required subject set (compatible superset allowed)
|
||||
- [ ] verify required ack/dedupe assumptions hold
|
||||
- [ ] Add tests that parse and validate configs without NATS.
|
||||
|
||||
### Required Tests
|
||||
- Unit tests for stream config builders
|
||||
- Existing crate tests
|
||||
|
||||
## Milestone 2: Consumer Policy Standardization (Ack, Backpressure, Poison)
|
||||
|
||||
### Goal
|
||||
Make consumption reliable and cheap under load by standardizing ack policy, concurrency, and poison/deadletter handling.
|
||||
|
||||
### Exit Criteria
|
||||
- All long-lived consumers use explicit ack with consistent `ack_wait`, `max_deliver`, `max_ack_pending`.
|
||||
- Application concurrency is bounded and tied to `max_in_flight`.
|
||||
- Poison policy is consistent:
|
||||
- after `max_deliver`, term + deadletter/quarantine record is written
|
||||
- Replay behavior is deterministic on restart (checkpoint-based where applicable).
|
||||
|
||||
### Tasks
|
||||
- [ ] Define standard consumer config defaults:
|
||||
- [ ] `AckPolicy::Explicit`
|
||||
- [ ] `ack_wait` default + env override
|
||||
- [ ] `max_deliver` default + env override
|
||||
- [ ] `max_ack_pending` tied to application concurrency
|
||||
- [ ] Projection:
|
||||
- [ ] Ensure durable consumer naming is collision-free in all modes (Single vs PerView)
|
||||
- [ ] Ensure checkpoint gates ack correctly (skip still acks)
|
||||
- [ ] Ensure poison policy writes durable records and terminates reliably
|
||||
- [ ] Runner:
|
||||
- [ ] Ensure saga/effect consumers use consistent durable naming + deliver groups when scaling out
|
||||
- [ ] Ensure outbox relay preserves exactly-once semantics via dedupe keys + idempotent publish
|
||||
- [ ] Aggregate:
|
||||
- [ ] Ensure ad-hoc fetch consumer is bounded (timeouts) and unique per operation (already required)
|
||||
- [ ] Ensure best-effort cleanup is performed and cannot delete unrelated consumers
|
||||
- [ ] Tests:
|
||||
- [ ] Unit tests for consumer name generation (sanitization + uniqueness)
|
||||
- [ ] NATS-gated tests for ack/redelivery/poison behavior (must be runnable with env flag)
|
||||
|
||||
### Required Tests
|
||||
- Workspace fmt/clippy/tests
|
||||
- NATS-gated integration tests for:
|
||||
- redelivery idempotency
|
||||
- poison termination behavior
|
||||
- scale-out with deliver group (where supported)
|
||||
|
||||
## Milestone 3: Connection Management + Failure Semantics (Operational Frugality)
|
||||
|
||||
### Goal
|
||||
Make NATS connection handling stable under partial failure while minimizing resource churn and cascading outages.
|
||||
|
||||
### Exit Criteria
|
||||
- One NATS connection per process (or bounded pool only if justified).
|
||||
- Reconnect/backoff policy is explicit and consistent.
|
||||
- Circuit breaker behavior is consistent (when used), and health/ready reflect NATS state correctly.
|
||||
- No busy-looping on NATS outages.
|
||||
|
||||
### Tasks
|
||||
- [ ] Standardize connection options:
|
||||
- [ ] reconnect delays/backoff
|
||||
- [ ] max reconnect attempts or “infinite with backoff” strategy (explicit)
|
||||
- [ ] request timeouts around JetStream operations
|
||||
- [ ] Standardize readiness semantics:
|
||||
- [ ] `ready=false` when NATS is unavailable and the node depends on it
|
||||
- [ ] `health` stays “process alive” but reports NATS connectivity in payload
|
||||
- [ ] Add “fast fail” mode for tests and dev (avoid 30x retries when env not set).
|
||||
- [ ] Tests:
|
||||
- [ ] unit tests for backoff behavior (where possible)
|
||||
- [ ] gated integration test: temporary NATS outage does not crash-loop and recovers
|
||||
|
||||
## Milestone 4: Multi-Tenant Scale-Out Guarantees (Collision-Free + Predictable)
|
||||
|
||||
### Goal
|
||||
Guarantee safe multi-replica behavior: no consumer collisions, no duplicate side effects, predictable throughput with bounded resource usage.
|
||||
|
||||
### Exit Criteria
|
||||
- Durable names are deterministic and collision-free across replicas.
|
||||
- Deliver groups are used where appropriate to share work across replicas.
|
||||
- Exactly-once side effects are enforced via idempotency + dedupe keys (not wishful thinking).
|
||||
- A scale-out test suite exists and is gated but runnable.
|
||||
|
||||
### Tasks
|
||||
- [ ] Establish consumer naming scheme per service role:
|
||||
- [ ] Projection: per-view durable option uses sanitized names and stable mapping
|
||||
- [ ] Runner: durable prefix includes role + shard + optional group
|
||||
- [ ] Establish deliver group usage rules:
|
||||
- [ ] when to enable (scale-out consumers)
|
||||
- [ ] how to roll without duplication
|
||||
- [ ] Strengthen dedupe keys:
|
||||
- [ ] event-driven sagas: checkpoint + dedupe marker strategy tested under redelivery
|
||||
- [ ] outbox relay: verify publish idempotency with `Nats-Msg-Id`
|
||||
- [ ] Add gated tests:
|
||||
- [ ] two replicas, same tenant, no duplicate publishes
|
||||
- [ ] rolling restart preserves checkpoint correctness
|
||||
|
||||
## Verification Commands (Required at Each Milestone)
|
||||
- `cargo fmt --check`
|
||||
- `cargo clippy --workspace --all-targets -- -D warnings`
|
||||
- `cargo test --workspace`
|
||||
- Gated NATS integration tests:
|
||||
- Runner: `RUNNER_TEST_NATS_URL=... cargo test -p runner -- --ignored`
|
||||
- Projection: `PROJECTION_TEST_NATS_URL=... cargo test -p projection -- --ignored`
|
||||
- Control API (if it runs NATS-gated tests): set documented env flags and run ignored tests
|
||||
|
||||
## Notes / Constraints
|
||||
- Do not create per-tenant streams unless scaling evidence requires it; prefer subject partitioning and consumer groups.
|
||||
- Prefer backward-compatible envelope changes (optional fields, tolerant decoding).
|
||||
- Prefer stable durable consumers; ephemeral consumers must be unique and bounded and must clean up best-effort.
|
||||
38
README.md
38
README.md
@@ -0,0 +1,38 @@
|
||||
# cloudlysis (monorepo)
|
||||
|
||||
## Layout
|
||||
- Rust services (Cargo workspace): `aggregate/`, `gateway/`, `projection/`, `runner/`, `control/api/`, `shared/`
|
||||
- Control UI: `control/ui/`
|
||||
- Docker + Swarm + Compose: `docker/`, `docker-compose.yml`, `swarm/`, `observability/`
|
||||
- Transport plans:
|
||||
- `TRANSPORT_DEVELOPMENT_PLAN.md`
|
||||
- `GATEWAY_TRANSPORT_PLAN.md`
|
||||
- `NATS_TRANSPORT_PLAN.md`
|
||||
|
||||
## Quick Start (Docker Compose)
|
||||
|
||||
```bash
|
||||
docker compose up -d --build
|
||||
```
|
||||
|
||||
Full local stack with observability:
|
||||
|
||||
```bash
|
||||
docker compose -f docker-compose.yml -f observability/docker-compose.yml up -d --build
|
||||
```
|
||||
|
||||
## Commands
|
||||
- `make compose-up`, `make compose-down`
|
||||
- `make compose-up-observability`, `make compose-down-observability`
|
||||
- `make docker-build-all`
|
||||
- `make swarm-deploy-all`, `make swarm-rm-all`
|
||||
|
||||
More details: `DOCKER.md`
|
||||
|
||||
## Workspace Verification
|
||||
|
||||
```bash
|
||||
cargo fmt --check
|
||||
cargo clippy --workspace --all-targets -- -D warnings
|
||||
cargo test --workspace
|
||||
```
|
||||
|
||||
333
TRANSPORT_DEVELOPMENT_PLAN.md
Normal file
333
TRANSPORT_DEVELOPMENT_PLAN.md
Normal file
@@ -0,0 +1,333 @@
|
||||
# Transport Development Plan
|
||||
|
||||
## Purpose
|
||||
Unify and optimize the platform transport layer end-to-end:
|
||||
- Gateway ↔ nodes (Aggregate, Projection, Runner): routing + RPC/proxying + probes
|
||||
- Node ↔ NATS JetStream/KV: event/work distribution + configuration substrate
|
||||
|
||||
This plan merges and supersedes:
|
||||
- `GATEWAY_TRANSPORT_PLAN.md`
|
||||
- `NATS_TRANSPORT_PLAN.md`
|
||||
|
||||
## Current Status (Codebase Reality)
|
||||
- Monorepo workspace exists; `shared` crate exists and is used by Aggregate/Projection/Runner/Gateway.
|
||||
- Request context pieces are partially standardized:
|
||||
- `shared` provides `TenantId`, `CorrelationId`, `TraceId`
|
||||
- `shared` provides `trace_id_from_traceparent(...)` and `traceparent_from_trace_id(...)`
|
||||
- Some header names are centralized in `shared` but not all call sites use constants yet.
|
||||
- Gateway → Aggregate is already HTTP(edge) → gRPC(internal) and propagates `x-tenant-id`, `x-correlation-id`, and `traceparent`.
|
||||
- Gateway → Projection remains HTTP proxy (`/v1/query/...`) and Gateway → Runner remains HTTP admin proxy (`/admin/runner/...`).
|
||||
- Node → NATS header propagation is improved and closer to consistent:
|
||||
- Runner publishes `x-correlation-id` and `correlation-id`, and ensures `traceparent`/`trace-id` are present/derived when possible.
|
||||
- Aggregate publishes `trace-id` when `traceparent` is present.
|
||||
- Many “hard” NATS tests already exist but are gated/ignored by default; they should be treated as milestone gates when enabling changes.
|
||||
|
||||
## Principles
|
||||
- Simplicity: minimize distinct patterns; prefer one internal RPC stack + one async backbone.
|
||||
- Ease of operation: consistent health/ready/metrics; consistent naming; predictable failure modes.
|
||||
- Frugality: bounded connections, bounded consumers, bounded in-flight work; no churny resources.
|
||||
- Low resource usage: stable durables; avoid per-request reconnects; avoid unbounded loops.
|
||||
- High performance: multiplexing, backpressure, low tail latency, predictable routing.
|
||||
- Safety: tenant isolation, deny-by-default authz at the edge, idempotency, deterministic replay.
|
||||
|
||||
## Non-Negotiable Rules (Global)
|
||||
- Every cross-component hop MUST carry tenant + correlation + trace context.
|
||||
- Every transport path MUST have explicit timeouts/deadlines and bounded retries.
|
||||
- Every JetStream stream/consumer MUST have an explicit contract (name/subjects/retention/ack policy).
|
||||
- Every milestone is stop-the-line gated:
|
||||
- All tasks completed
|
||||
- All tests required by the milestone pass
|
||||
- Workspace verification commands pass
|
||||
- Gated integration tests for the milestone are runnable and documented
|
||||
|
||||
## Baseline (Today)
|
||||
- Gateway → Aggregate: gRPC (command submission)
|
||||
- Gateway → Projection: HTTP (query proxy)
|
||||
- Gateway → Runner: HTTP (admin proxy)
|
||||
- Node ↔ NATS JetStream: `AGGREGATE_EVENTS`, `WORKFLOW_COMMANDS`, `WORKFLOW_EVENTS`
|
||||
|
||||
## End State (Target Architecture)
|
||||
- Edge contract (clients ↔ Gateway): HTTP/JSON
|
||||
- Internal RPC (Gateway ↔ nodes): gRPC for Aggregate + Projection + Runner admin
|
||||
- Async backbone: NATS JetStream for events/work distribution; NATS KV for routing/placement
|
||||
- `shared` is the single source of truth for:
|
||||
- header names and injection/extraction rules
|
||||
- trace parsing/validation (`traceparent`, `trace-id`)
|
||||
- context object model (tenant/correlation/trace/request ids)
|
||||
- NATS subject + consumer naming helpers
|
||||
|
||||
## Standard Contracts
|
||||
### Context Fields
|
||||
- Tenant: HTTP `x-tenant-id`, NATS `tenant-id`
|
||||
- Correlation: HTTP `x-correlation-id`, NATS `x-correlation-id` and `correlation-id`
|
||||
- Trace: HTTP `traceparent`, NATS `traceparent` and `trace-id` (derived when possible)
|
||||
- Request id: HTTP `x-request-id` (optional for NATS)
|
||||
|
||||
### Standard Service Endpoints (every service)
|
||||
- `GET /health` liveness
|
||||
- `GET /ready` readiness (includes tenant gating if relevant)
|
||||
- `GET /metrics` Prometheus
|
||||
|
||||
## Milestone 0: Shared Transport Contract (Headers + Context + Trace)
|
||||
|
||||
### Goal
|
||||
Make propagation rules consistent and enforceable across HTTP, gRPC, and NATS so every later milestone builds on one contract.
|
||||
|
||||
### Exit Criteria
|
||||
- `shared` contains canonical constants for header names and NATS header names.
|
||||
- `shared` contains canonical trace parsing/validation and trace derivation helpers.
|
||||
- Library-level unit tests cover parsing/derivation behavior.
|
||||
- All crates build and tests pass for the workspace.
|
||||
|
||||
### Tasks
|
||||
- [x] Add shared ID types in `shared`:
|
||||
- [x] `TenantId`
|
||||
- [x] `CorrelationId`
|
||||
- [x] `TraceId`
|
||||
- [~] Consolidate header constants in `shared`:
|
||||
- [x] HTTP: `x-correlation-id`, `traceparent`, `trace-id` (for NATS/interop)
|
||||
- [ ] HTTP: `x-tenant-id`, `x-request-id` (missing constants)
|
||||
- [x] NATS: `correlation-id` (used in Runner), `trace-id` (now emitted where possible)
|
||||
- [ ] NATS: `tenant-id` constant, `Nats-Msg-Id` constant (missing constants)
|
||||
- [x] Add shared helpers:
|
||||
- [x] derive `trace-id` from `traceparent`
|
||||
- [x] derive `traceparent` from `trace-id` when valid
|
||||
- [ ] normalize/generate correlation id when missing across all transports (helper exists for `CorrelationId::generate()`; adoption incomplete)
|
||||
- [x] Add unit tests in `shared` for:
|
||||
- [x] traceparent parsing validity
|
||||
- [x] serialization shape for correlation/trace id newtypes
|
||||
- [ ] additional validation cases (invalid traceparents, invalid trace-id lengths) if needed for stricter enforcement
|
||||
|
||||
### Required Tests
|
||||
- `cargo fmt --check`
|
||||
- `cargo clippy --workspace --all-targets -- -D warnings`
|
||||
- `cargo test --workspace`
|
||||
|
||||
## Milestone 1: NATS Wire Protocol Lock-In (Subjects + Headers + Envelopes)
|
||||
|
||||
### Dependencies
|
||||
- Milestone 0
|
||||
|
||||
### Goal
|
||||
Make the JetStream/NATS “wire protocol” explicit and uniform so interop is safe across scale-out and rolling restarts.
|
||||
|
||||
### Exit Criteria
|
||||
- Subject naming is standardized and enforced via builder functions (producers publish concrete subjects only).
|
||||
- All NATS producers set required headers consistently.
|
||||
- All NATS consumers tolerate unknown fields and missing optional fields.
|
||||
- “Contract tests” exist per service to verify produced headers and subject formats.
|
||||
|
||||
### Tasks
|
||||
- [ ] Create/standardize subject builder helpers (prefer `shared`):
|
||||
- [ ] Aggregate event subject builder (`tenant.<tenant>.aggregate.<type>.<id>`)
|
||||
- [ ] Runner effect/effect_result/workflow subject builders
|
||||
- [~] Aggregate publishes:
|
||||
- [ ] `tenant-id` header always present (still needs enforcement everywhere)
|
||||
- [ ] correlation + trace headers always present when available, generated when required
|
||||
- [x] `trace-id` is derived when `traceparent` is present (now emitted in publish path)
|
||||
- [ ] `Nats-Msg-Id` strategy explicitly defined and tested
|
||||
- [~] Runner publishes (commands/results):
|
||||
- [x] correlation headers emitted consistently (`x-correlation-id` + `correlation-id`)
|
||||
- [x] trace headers derived consistently when possible (`traceparent` from `trace-id`, `trace-id` from `traceparent`)
|
||||
- [ ] outbox metadata → NATS headers mapping standardized via shared helpers (adoption incomplete)
|
||||
- [~] Projection consumption:
|
||||
- [x] envelope decoding remains tolerant (unknown fields ignored)
|
||||
- [~] correlation/trace context flows into spans/metrics consistently (types are shared; header extraction remains best-effort and should be unified)
|
||||
- [ ] Add unit tests:
|
||||
- [ ] subject formatting tests per service (once builders exist)
|
||||
- [ ] required header presence tests per publisher (enforce required keys)
|
||||
|
||||
### Required Tests
|
||||
- Workspace verification commands
|
||||
|
||||
## Milestone 2: JetStream Stream Policy (Create/Validate, No Destructive Startup)
|
||||
|
||||
### Dependencies
|
||||
- Milestone 1
|
||||
|
||||
### Goal
|
||||
Make stream definitions explicit, validated, and safe in all environments, preventing resource runaway and accidental destructive changes.
|
||||
|
||||
### Exit Criteria
|
||||
- Each stream has a single authoritative config policy (name/subjects/retention/limits/duplicate window).
|
||||
- Services create streams if missing, and validate compatibility on startup.
|
||||
- Startup does not silently replace or destructively mutate existing streams.
|
||||
- Config-only tests validate stream config builders without requiring NATS.
|
||||
|
||||
### Tasks
|
||||
- [ ] Define stream policies:
|
||||
- [ ] `AGGREGATE_EVENTS` (subjects, retention, duplicate window)
|
||||
- [ ] `WORKFLOW_COMMANDS`
|
||||
- [ ] `WORKFLOW_EVENTS`
|
||||
- [ ] Implement compatibility validation rules:
|
||||
- [ ] required subjects are present (superset allowed)
|
||||
- [ ] retention/limits are within allowed ranges
|
||||
- [ ] dedupe assumptions align with producer `Nats-Msg-Id` usage
|
||||
- [ ] Add unit tests for stream config builders + validators.
|
||||
|
||||
### Required Tests
|
||||
- Workspace verification commands
|
||||
|
||||
## Milestone 3: Consumer Policy + Backpressure + Poison (Reliable and Cheap Under Load)
|
||||
|
||||
### Dependencies
|
||||
- Milestone 2
|
||||
|
||||
### Goal
|
||||
Standardize consumer configs and runtime behavior to guarantee bounded in-flight work, predictable redelivery behavior, and consistent poison handling.
|
||||
|
||||
### Exit Criteria
|
||||
- All long-lived consumers use explicit ack with standardized defaults (`ack_wait`, `max_deliver`, `max_ack_pending`).
|
||||
- Application-level concurrency is bounded and aligned with `max_in_flight`.
|
||||
- Poison policy is consistent across consumers (term + durable quarantine/deadletter record).
|
||||
- Gated NATS integration tests prove:
|
||||
- redelivery idempotency
|
||||
- poison termination
|
||||
- scale-out behavior (deliver group) where applicable
|
||||
|
||||
### Tasks
|
||||
- [ ] Standardize consumer defaults:
|
||||
- [ ] `AckPolicy::Explicit`
|
||||
- [ ] `ack_wait` default + env override
|
||||
- [ ] `max_deliver` default + env override
|
||||
- [ ] `max_ack_pending` tied to worker concurrency
|
||||
- [ ] Projection:
|
||||
- [ ] durable naming collision-free for Single/PerView modes
|
||||
- [ ] checkpoint gate semantics: “skip still acks”
|
||||
- [ ] poison handling persists durable records and terminates reliably
|
||||
- [ ] Runner:
|
||||
- [ ] durable naming collision-free and stable across replicas
|
||||
- [ ] deliver group rules defined and tested
|
||||
- [ ] outbox relay exactly-once behavior verified under redelivery
|
||||
- [ ] Aggregate:
|
||||
- [ ] ad-hoc fetch consumer always unique and bounded
|
||||
- [ ] best-effort deletion never targets unrelated consumers
|
||||
- [ ] Add gated NATS integration tests and document env flags:
|
||||
- [ ] Runner ignored tests
|
||||
- [ ] Projection ignored tests
|
||||
|
||||
### Required Tests
|
||||
- Workspace verification commands
|
||||
- Runner: `RUNNER_TEST_NATS_URL=... cargo test -p runner -- --ignored`
|
||||
- Projection: `PROJECTION_TEST_NATS_URL=... cargo test -p projection -- --ignored`
|
||||
|
||||
## Milestone 4: Gateway → Projection Internal RPC (gRPC QueryService)
|
||||
|
||||
### Dependencies
|
||||
- Milestone 0 (context contract)
|
||||
|
||||
### Goal
|
||||
Replace Gateway → Projection HTTP proxy as the default path with a gRPC Query service, keeping HTTP optional for human/debug use.
|
||||
|
||||
### Exit Criteria
|
||||
- Projection exposes `projection.gateway.v1.QueryService`.
|
||||
- Gateway routes queries via gRPC by default.
|
||||
- Authz remains enforced in Gateway (deny-by-default).
|
||||
- Query responses remain stable for Control UI expectations.
|
||||
- New gRPC query tests pass (unit + integration).
|
||||
|
||||
### Tasks
|
||||
- [ ] Define protobuf API: `projection.gateway.v1.QueryService`
|
||||
- [ ] Implement Projection gRPC server for query execution
|
||||
- [ ] Implement Gateway gRPC client routing to Projection
|
||||
- [ ] deadlines
|
||||
- [ ] bounded retries (idempotent only)
|
||||
- [ ] context propagation
|
||||
- [ ] Preserve HTTP `/v1/query/*` as compatibility/debug:
|
||||
- [ ] route internally to gRPC or keep as legacy endpoint
|
||||
- [ ] Add tests:
|
||||
- [ ] authz + forwarding via gRPC
|
||||
- [ ] tenant isolation enforcement in Projection QueryService
|
||||
|
||||
### Required Tests
|
||||
- Workspace verification commands
|
||||
|
||||
## Milestone 5: Gateway → Runner Admin Internal RPC (gRPC RunnerAdmin)
|
||||
|
||||
### Dependencies
|
||||
- Milestone 0 (context contract)
|
||||
|
||||
### Goal
|
||||
Replace Gateway’s `/admin/runner/*` HTTP proxy usage with a first-class gRPC admin service.
|
||||
|
||||
### Exit Criteria
|
||||
- Runner exposes `runner.admin.v1.RunnerAdmin`.
|
||||
- Gateway calls Runner admin via gRPC (authz enforced in Gateway).
|
||||
- Tenant-spoof and unauthorized calls are rejected deterministically.
|
||||
- Runner drain/readiness semantics validated and tested.
|
||||
|
||||
### Tasks
|
||||
- [ ] Define protobuf API: `runner.admin.v1.RunnerAdmin`
|
||||
- [ ] Implement Runner gRPC admin server
|
||||
- [ ] Implement Gateway gRPC client integration for admin operations
|
||||
- [ ] Keep Runner HTTP admin endpoints optional for direct debugging, not required by Gateway
|
||||
- [ ] Add tests:
|
||||
- [ ] Gateway: rejects without rights
|
||||
- [ ] Gateway: rejects tenant spoof attempts
|
||||
- [ ] Runner: idempotency and drain semantics
|
||||
|
||||
### Required Tests
|
||||
- Workspace verification commands
|
||||
|
||||
## Milestone 6: Gateway Upstream Performance + Operational Guardrails
|
||||
|
||||
### Dependencies
|
||||
- Milestones 4–5 (gRPC internal RPC surfaces available)
|
||||
|
||||
### Goal
|
||||
Make Gateway upstream connection handling, retry behavior, and probe/fanout operations consistent, bounded, and cheap under load.
|
||||
|
||||
### Exit Criteria
|
||||
- Bounded upstream gRPC channel pool exists (LRU + TTL/eviction).
|
||||
- Deadlines everywhere; retries only for idempotent operations.
|
||||
- Probe/fanout calls are bounded (timeouts + concurrency limits) and carry context.
|
||||
- Gated load/soak tests exist and are runnable.
|
||||
|
||||
### Tasks
|
||||
- [ ] Implement upstream channel pool
|
||||
- [ ] bounded LRU
|
||||
- [ ] TTL/eviction
|
||||
- [ ] fast-path reuse under load
|
||||
- [ ] Standardize retry profiles
|
||||
- [ ] read-only: limited retry with jitter
|
||||
- [ ] mutations: no retry unless idempotency key is present and semantics are safe
|
||||
- [ ] Standardize timeouts/deadlines:
|
||||
- [ ] edge timeout limits
|
||||
- [ ] internal per-service deadlines
|
||||
- [ ] Fanout controls:
|
||||
- [ ] concurrency limiters for probes/snapshots
|
||||
- [ ] short TTL caching where safe
|
||||
- [ ] Ensure probes carry context (correlation/trace) for observability.
|
||||
|
||||
### Required Tests
|
||||
- Workspace verification commands
|
||||
- Gated load/soak tests (document env + how to run)
|
||||
|
||||
## Milestone 7: Transport Cleanup (Remove Legacy Internal Paths)
|
||||
|
||||
### Dependencies
|
||||
- Milestone 6
|
||||
|
||||
### Goal
|
||||
Ensure the “happy path” is: HTTP edge → Gateway → gRPC internal → NATS async, with legacy internal HTTP proxy paths removed or clearly debug-only.
|
||||
|
||||
### Exit Criteria
|
||||
- Gateway no longer depends on HTTP for Projection queries or Runner admin.
|
||||
- Legacy paths are removed or explicitly debug-only and not referenced by Gateway/Control.
|
||||
- End-to-end smoke tests pass (gated).
|
||||
|
||||
### Tasks
|
||||
- [ ] Remove Gateway HTTP query proxy usage (or keep only as explicit compatibility shim)
|
||||
- [ ] Remove Gateway runner admin HTTP proxy usage (or keep only as explicit compatibility shim)
|
||||
- [ ] Ensure Control UI + Control API rely only on standardized surfaces
|
||||
- [ ] Harden metrics and readiness probes to match the standard contract everywhere
|
||||
|
||||
### Required Tests
|
||||
- Workspace verification commands
|
||||
- End-to-end smoke tests (gated)
|
||||
|
||||
## Workspace Verification Commands (Run for Every Milestone)
|
||||
- `cargo fmt --check`
|
||||
- `cargo clippy --workspace --all-targets -- -D warnings`
|
||||
- `cargo test --workspace`
|
||||
- `npm ci && npm run lint && npm run typecheck && npm run test && npm run build` (in `control/ui`)
|
||||
1
aggregate/.clippy.toml
Normal file
1
aggregate/.clippy.toml
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
37
aggregate/.gitignore
vendored
Normal file
37
aggregate/.gitignore
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
/target/
|
||||
/target-*/
|
||||
**/target/
|
||||
*.rs.bk
|
||||
*.pdb
|
||||
*.dSYM/
|
||||
*.orig
|
||||
*.rej
|
||||
*.log
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
.DS_Store
|
||||
.idea/
|
||||
.vscode/
|
||||
|
||||
.env
|
||||
.env.*
|
||||
.envrc
|
||||
.direnv/
|
||||
|
||||
docker-compose.override.yml
|
||||
|
||||
*.mdbx
|
||||
*.mdbx-*
|
||||
*.mdbx-lock
|
||||
*.mdbx.dat
|
||||
*.mdbx.lck
|
||||
*.mdb
|
||||
*.db
|
||||
/data/
|
||||
/tmp/
|
||||
|
||||
/coverage/
|
||||
lcov.info
|
||||
*.profraw
|
||||
*.profdata
|
||||
42
aggregate/Cargo.toml
Normal file
42
aggregate/Cargo.toml
Normal file
@@ -0,0 +1,42 @@
|
||||
[package]
|
||||
name = "aggregate"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
runtime-v8 = ["v8"]
|
||||
runtime-wasm = []
|
||||
|
||||
[dependencies]
|
||||
shared = { path = "../shared" }
|
||||
edge_storage = { version = "0.1", registry = "madapes" }
|
||||
runtime-function = { version = "0.2", registry = "madapes" }
|
||||
edge-logger-client = { version = "0.1", registry = "madapes" }
|
||||
query_engine = { version = "0.1", registry = "madapes" }
|
||||
async-nats = "0.39"
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_yaml = "0.9"
|
||||
toml = "0.8"
|
||||
thiserror = "2"
|
||||
anyhow = "1"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
|
||||
uuid = { version = "1", features = ["v7", "serde"] }
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
futures = "0.3"
|
||||
lru = "0.12"
|
||||
v8 = { version = "0.106", optional = true }
|
||||
tonic = { version = "0.12", default-features = false, features = ["codegen", "prost", "transport"] }
|
||||
prost = "0.13"
|
||||
axum = "0.7"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
tokio-stream = { version = "0.1", features = ["net"] }
|
||||
|
||||
[build-dependencies]
|
||||
tonic-build = { version = "0.12", default-features = false, features = ["prost"] }
|
||||
protoc-bin-vendored = "3"
|
||||
1751
aggregate/DEVELOPMENT_PLAN.md
Normal file
1751
aggregate/DEVELOPMENT_PLAN.md
Normal file
File diff suppressed because it is too large
Load Diff
81
aggregate/README.md
Normal file
81
aggregate/README.md
Normal file
@@ -0,0 +1,81 @@
|
||||
# aggregate
|
||||
|
||||
## Running
|
||||
|
||||
### Configuration
|
||||
|
||||
Configuration is loaded in this order:
|
||||
|
||||
1. If `AGGREGATE_CONFIG_PATH` is set and points to a readable config file, load that file and apply env overrides.
|
||||
2. Otherwise load defaults and apply env overrides.
|
||||
|
||||
Supported config formats:
|
||||
- YAML (`.yaml`, `.yml`)
|
||||
- TOML (`.toml`)
|
||||
- JSON (`.json`)
|
||||
|
||||
### Environment Variables
|
||||
|
||||
#### Core
|
||||
- `AGGREGATE_NATS_URL` (default: `nats://localhost:4222`): NATS server URL.
|
||||
- `AGGREGATE_STORAGE_PATH` (default: `./data`): Path used by the snapshot storage.
|
||||
- `AGGREGATE_SNAPSHOT_THRESHOLD` (default: `10`): Save snapshot when events since last snapshot reach this threshold.
|
||||
- `AGGREGATE_MAX_RETRIES` (default: `3`): Max retries for version conflicts in command handling.
|
||||
- `AGGREGATE_HTTP_ADDR` (default: `0.0.0.0:8080`): HTTP bind address.
|
||||
- `AGGREGATE_GRPC_ADDR` (default: `0.0.0.0:50051`): gRPC bind address for command submission.
|
||||
|
||||
#### Multi-tenant
|
||||
- `AGGREGATE_MULTI_TENANT` (default: `true`): Enables multi-tenant behavior when parsing/validating tenant ids.
|
||||
- `AGGREGATE_DEFAULT_TENANT_ID` (default: unset): Default tenant id when the incoming request doesn't specify one.
|
||||
- `AGGREGATE_SHARD_ID` (default: `local`): Shard id used when applying placement maps.
|
||||
|
||||
#### Logging
|
||||
- `AGGREGATE_LOGGER_SOCKET` (default: unset): Socket path for `edge-logger-client` integration (if enabled).
|
||||
|
||||
#### Server
|
||||
- `AGGREGATE_CONFIG_PATH` (default: unset): Path to a YAML/TOML/JSON config file.
|
||||
|
||||
#### Placement
|
||||
- `AGGREGATE_PLACEMENT_BUCKET` (default: `AGGREGATE_PLACEMENT`): NATS KV bucket to watch.
|
||||
- `AGGREGATE_PLACEMENT_KEY` (default: `aggregate_placement`): NATS KV key to watch. Value is a JSON object mapping `tenant_id -> shard_id`.
|
||||
|
||||
#### Runtime Programs
|
||||
- `AGGREGATE_DECIDE_PROGRAM` / `AGGREGATE_APPLY_PROGRAM`: Inline program source strings.
|
||||
- `AGGREGATE_DECIDE_PROGRAM_PATH` / `AGGREGATE_APPLY_PROGRAM_PATH`: File paths to program source strings.
|
||||
|
||||
## HTTP Endpoints
|
||||
|
||||
- `GET /health` → JSON health report
|
||||
- `GET /ready` → JSON boolean readiness
|
||||
- `GET /metrics` → Prometheus text format
|
||||
- `GET /admin/tenants` → JSON list of hosted tenants
|
||||
- `POST /admin/drain` → marks tenant draining and waits for in-flight commands to finish (`{"tenant_id":"..."}`)
|
||||
- `POST /admin/reload` → updates hosted tenant allowlist (`{"hosted_tenants":[...]}`) or applies a placement map (`{"placement":{...}}`)
|
||||
- `GET /admin/tenant/{tenant_id}/status` → JSON tenant status (`hosted`, `accepting`, `draining`, `in_flight`)
|
||||
- `GET /admin/tenant/{tenant_id}/ready` → JSON boolean (node ready AND accepting tenant)
|
||||
- `POST /admin/tenant/{tenant_id}/drain` → drains tenant with optional timeout (`{"timeout_ms":10000}`)
|
||||
|
||||
## gRPC
|
||||
|
||||
Aggregate exposes a command submission API for the Gateway:
|
||||
|
||||
- Service: `aggregate.gateway.v1.CommandService`
|
||||
- Method: `SubmitCommand`
|
||||
- Metadata: `x-tenant-id` (tenant routing hint)
|
||||
|
||||
Proto definition: [aggregate.proto](proto/aggregate.proto)
|
||||
|
||||
## Container
|
||||
|
||||
Build and run locally:
|
||||
|
||||
```bash
|
||||
docker build -t cloudlysis/aggregate:local -f docker/Dockerfile.rust --build-arg PACKAGE=aggregate --build-arg BIN=aggregate .
|
||||
docker compose up -d --build
|
||||
```
|
||||
|
||||
Container smoke test (requires Docker installed):
|
||||
|
||||
```bash
|
||||
sh docker/scripts/verify_aggregate_container.sh
|
||||
```
|
||||
8
aggregate/build.rs
Normal file
8
aggregate/build.rs
Normal file
@@ -0,0 +1,8 @@
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let protoc = protoc_bin_vendored::protoc_bin_path()?;
|
||||
std::env::set_var("PROTOC", protoc);
|
||||
|
||||
tonic_build::configure().compile_protos(&["proto/aggregate.proto"], &["proto"])?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
4
aggregate/cargo-build.sh
Normal file
4
aggregate/cargo-build.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash
# Wrapper around cargo that ensures the private "madapes" registry token is
# available before invoking cargo with the caller's arguments.
#
# SECURITY: the registry token MUST be supplied via the environment. A real
# credential was previously hardcoded here and committed to the repository;
# that token should be considered leaked and rotated.
set -euo pipefail

# Fail fast with a clear message if the token is not provided.
: "${CARGO_REGISTRIES_MADAPES_TOKEN:?set CARGO_REGISTRIES_MADAPES_TOKEN in the environment before running this script}"
export CARGO_REGISTRIES_MADAPES_TOKEN

# Load the rustup-managed toolchain environment when present (no-op in
# containers/CI images where cargo is already on PATH).
if [ -f "$HOME/.cargo/env" ]; then
  # shellcheck disable=SC1091
  source "$HOME/.cargo/env"
fi

cargo "$@"
|
||||
192
aggregate/external_prd.md
Normal file
192
aggregate/external_prd.md
Normal file
@@ -0,0 +1,192 @@
|
||||
### External PRD: Changes Required in Aggregate, Projection, Runner
|
||||
|
||||
This document captures the work needed outside the Gateway to support:
|
||||
- Tenant-aware routing via `x-tenant-id`
|
||||
- Independent horizontal scalability of Aggregate, Projection, Runner
|
||||
- A safe mechanism for tenant rebalancing per service kind
|
||||
|
||||
---
|
||||
|
||||
## **Target State**
|
||||
|
||||
### Independent Placements
|
||||
|
||||
Each service kind has its own placement map:
|
||||
- `aggregate_placement[tenant_id] -> aggregate_shard_id`
|
||||
- `projection_placement[tenant_id] -> projection_shard_id`
|
||||
- `runner_placement[tenant_id] -> runner_shard_id`
|
||||
|
||||
Each shard is a replica set that can scale independently.
|
||||
|
||||
### Rebalancing Contract (Per Service Kind)
|
||||
|
||||
All nodes MUST support:
|
||||
- Dynamic placement updates (watch NATS KV or reload config)
|
||||
- A drain mechanism that can target a specific tenant (stop acquiring new work for that tenant, finish in-flight, report status)
|
||||
- Clear readiness semantics that reflect whether the node will accept work for a tenant
|
||||
|
||||
Additionally, all nodes SHOULD converge on the same operational contract:
|
||||
- A per-tenant “accepting” gate (can this shard accept new work/queries/commands for tenant X?)
|
||||
- A per-tenant “drained” signal (no in-flight work remains for tenant X)
|
||||
- A per-tenant warmup/catchup signal where relevant (projection lag, aggregate snapshot availability)
|
||||
|
||||
---
|
||||
|
||||
## **Aggregate: Required Changes**
|
||||
|
||||
### 1) Expose a Real Command API (Gateway Upstream)
|
||||
|
||||
Today, Aggregate has internal command handling types (e.g., `CommandServer`) but its running HTTP server only exposes health/metrics/admin endpoints ([aggregate/src/http_server.rs](aggregate/src/http_server.rs#L15-L82), [aggregate/src/server/mod.rs](aggregate/src/server/mod.rs#L81-L213)).
|
||||
|
||||
Aggregate MUST expose one of the following upstream APIs for the Gateway to call:
|
||||
- **Option A (Recommended)**: gRPC server implementing `aggregate.gateway.v1.CommandService/SubmitCommand` compatible with [aggregate.proto](aggregate/proto/aggregate.proto#L1-L31).
|
||||
- **Option B**: HTTP endpoint for command submission (REST), with a stable request/response shape that the Gateway can proxy.
|
||||
|
||||
### 2) Tenant Placement Enforcement
|
||||
|
||||
Aggregate MUST enforce “hosted tenants” so independent scaling is safe:
|
||||
- If an Aggregate shard/node is not assigned a tenant, it MUST reject commands for that tenant (e.g., `403` or `503` with retriable hint depending on whether the issue is authorization vs placement).
|
||||
- Aggregate SHOULD maintain an in-memory allowlist of hosted tenants that is driven by:
|
||||
- NATS KV placement watcher (preferred), or
|
||||
- Hot-reloaded config pushed via `/admin/reload`
|
||||
|
||||
Aggregate already has admin hooks for drain/reload, but they are currently generic and/or illustrative ([aggregate/src/http_server.rs](aggregate/src/http_server.rs#L15-L72), [aggregate/src/server/mod.rs](aggregate/src/server/mod.rs#L402-L442)). These need to become placement-aware.
|
||||
|
||||
### 3) Tenant Drain (Per Tenant)
|
||||
|
||||
Aggregate MUST provide a per-tenant drain mechanism to support rebalancing:
|
||||
- Stop accepting new commands for the tenant.
|
||||
- Allow in-flight commands to finish (bounded wait), then report drained.
|
||||
- Expose drain status per tenant (admin endpoint).
|
||||
|
||||
### 4) Rebalancing State Strategy
|
||||
|
||||
Aggregate persists snapshots locally (MDBX) and uses JetStream for events. To move a tenant:
|
||||
- **Approach 1 (Snapshot migration)**: copy tenant snapshot DB/state to the target shard, then switch placement.
|
||||
- **Approach 2 (Cold rehydrate)**: switch placement and let the target shard rebuild state by replaying events from JetStream; expect higher latency during warmup.
|
||||
|
||||
The system should support both, with the rebalancer selecting the strategy based on tenant size/SLO.
|
||||
|
||||
### 5) Metrics for Placement Decisions
|
||||
|
||||
Aggregate SHOULD expose:
|
||||
- Per-tenant command rate, error rate
|
||||
- In-flight commands by tenant
|
||||
- Rehydrate time / snapshot hit ratio
|
||||
- Storage size per tenant (if feasible)
|
||||
|
||||
---
|
||||
|
||||
## **Projection: Required Changes**
|
||||
|
||||
### 1) Expose Query API Upstream for Gateway
|
||||
|
||||
Projection has a working `QueryService` with tenant-scoped prefix scans ([uqf.rs](projection/src/query/uqf.rs#L121-L162)) but it is not exposed via HTTP/gRPC (current HTTP routes are health/ready/metrics/info only: [projection/src/http/mod.rs](projection/src/http/mod.rs#L102-L109)).
|
||||
|
||||
Projection MUST add one upstream API the Gateway can route to:
|
||||
- `POST /query/{view_type}` (HTTP) accepting `x-tenant-id` and a UQF payload, returning `QueryResponse`.
|
||||
- Or a gRPC query service (new proto) if gRPC is preferred end-to-end.
|
||||
|
||||
### 2) Tenant Placement Filtering (Independent Scaling)
|
||||
|
||||
Projection MUST support running in one of these modes:
|
||||
- **Multi-tenant shard**: consumes all tenants (simple, less isolated).
|
||||
- **Tenant-filtered shard (required for rebalancing)**:
|
||||
- only consumes/serves queries for the tenants assigned to that shard
|
||||
- rejects queries for unassigned tenants (consistent error semantics)
|
||||
|
||||
Implementation direction:
|
||||
- Add a placement watcher similar to Runner’s tenant filter ([runner/src/tenant_placement.rs](runner/src/tenant_placement.rs#L8-L100)).
|
||||
- Apply tenant filter to:
|
||||
- event consumption subject filters (preferred), and
|
||||
- query serving validation (always).
|
||||
|
||||
### 3) Drain + Warmup Endpoints
|
||||
|
||||
Projection SHOULD add:
|
||||
- `/admin/drain?tenant_id=...` (stop consuming new events for that tenant, finish in-flight, flush checkpoints)
|
||||
- `/admin/reload` (apply latest placement/config)
|
||||
- Optional warmup status: whether the shard has caught up to JetStream tail for that tenant/view_types
|
||||
|
||||
### 4) Rebalancing Strategy for Projection
|
||||
|
||||
Projection can rebalance safely with “warm then cut over”:
|
||||
- Assign tenant to the new projection shard while old shard still serves.
|
||||
- New shard catches up (replay from JetStream, build view KV).
|
||||
- Switch Gateway placement for query routing to new shard.
|
||||
- Drain old shard for that tenant and optionally delete old tenant KV keys.
|
||||
|
||||
### 5) Metrics for Placement Decisions
|
||||
|
||||
Projection SHOULD expose:
|
||||
- JetStream lag per tenant/view_type (tail minus checkpoint)
|
||||
- Query latency and scan counts
|
||||
- Storage size per tenant (if feasible)
|
||||
|
||||
---
|
||||
|
||||
## **Runner: Required Changes**
|
||||
|
||||
Runner already has:
|
||||
- A tenant placement watcher capable of producing an allowlist ([tenant_placement.rs](file:///Users/vlad/Developer/cloudlysis/runner/src/tenant_placement.rs#L8-L100))
|
||||
- Admin endpoints including drain/reload/config ([runner/http/mod.rs](file:///Users/vlad/Developer/cloudlysis/runner/src/http/mod.rs#L69-L86))
|
||||
- Gateway client integration for aggregate command submission ([runner/gateway/mod.rs](file:///Users/vlad/Developer/cloudlysis/runner/src/gateway/mod.rs#L1-L47))
|
||||
|
||||
To support independent scalability + rebalancing, Runner needs the following.
|
||||
|
||||
### 1) Per-Tenant Drain (Not Only Global)
|
||||
|
||||
Runner’s current drain is global (`/admin/drain` toggles a single draining flag). Runner MUST support draining a specific tenant:
|
||||
- Stop acquiring new saga/effect work for the tenant.
|
||||
- Allow in-flight work for the tenant to finish (bounded).
|
||||
- Flush outbox for the tenant (or guarantee idempotency on handoff).
|
||||
- Persist final checkpoints so another shard can continue without duplication beyond at-least-once bounds.
|
||||
|
||||
### 2) Placement-Enforced Work Acquisition
|
||||
|
||||
Runner MUST validate tenant assignment at the boundary where it:
|
||||
- consumes JetStream messages (saga triggers, effect commands), and
|
||||
- dispatches outbox work.
|
||||
|
||||
If a tenant is not assigned to the shard, Runner must not process its work.
|
||||
|
||||
### 3) Handoff Safety Rules for Rebalancing
|
||||
|
||||
Runner rebalancing should follow:
|
||||
- New shard begins processing only after it is assigned the tenant.
|
||||
- Old shard stops acquiring new work for that tenant, then drains.
|
||||
- Idempotency remains correct across handoff using checkpoints and dedupe markers.
|
||||
|
||||
### 4) Metrics for Placement Decisions
|
||||
|
||||
Runner SHOULD expose:
|
||||
- Outbox depth by tenant
|
||||
- Work processing latency and retries by tenant/effect
|
||||
- Schedule due items by tenant
|
||||
- Consumer lag by tenant (if the consumption model supports per-tenant lag)
|
||||
|
||||
### 5) Auth Delivery Side Effects (Email/SMS/Push)
|
||||
|
||||
If the platform’s AuthN flows require out-of-band delivery (password reset links, email verification, MFA codes), the Runner SHOULD be the standard place to execute those side effects:
|
||||
- Define a stable effect interface for sending transactional emails (reset links, verification links, security alerts).
|
||||
- Optionally add SMS/push providers later under the same effect contract.
|
||||
|
||||
This keeps the Gateway free of long-lived provider credentials and aligns with the existing “effects are executed by workers” pattern.
|
||||
|
||||
---
|
||||
|
||||
## **Gateway Integration Notes**
|
||||
|
||||
Once the above changes exist:
|
||||
- Gateway routes per `(tenant_id, service_kind)` using independent placement maps.
|
||||
- Gateway can implement “warm then cut over” rebalancing for Projection and Runner by switching only query/workflow routing after readiness conditions are met.
|
||||
- Gateway can enforce consistent tenant validation, authn/authz, and error semantics at the edge even as placements move.
|
||||
|
||||
---
|
||||
|
||||
## **Gaps / Opportunities**
|
||||
|
||||
- **KV schema + ownership**: define the exact NATS KV bucket layout, key naming, revisioning rules, and who is allowed to write placement updates.
|
||||
- **Rebalancer API**: define operator workflows (plan/apply/rollback), status reporting, and audit log requirements for placement changes.
|
||||
- **Shard discovery**: define how shard endpoints are registered (static config vs KV directory entries) and how health is represented.
|
||||
- **Consistency boundaries**: define rebalancing guarantees per service kind (projection can be warm-cutover; runner requires checkpoint handoff; aggregate requires single-writer and state availability).
|
||||
4
aggregate/gateway-routing.yaml
Normal file
4
aggregate/gateway-routing.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
# Static tenant -> aggregate-node routing table consumed by the Gateway.
tenants:
  tenant-a: "http://aggregate-node-a:8080"
  tenant-b: "http://aggregate-node-b:8080"
|
||||
|
||||
160
aggregate/prd.md
Normal file
160
aggregate/prd.md
Normal file
@@ -0,0 +1,160 @@
|
||||
### 🧱 Component: Aggregate
|
||||
**Definition:**
|
||||
The Aggregate is a standalone Rust-based container that serves as the primary consistency boundary and decision-making unit of the system. It is a stateful entity that encapsulates business logic, enforces invariants, and ensures that all changes to the system are valid according to defined rules. Commands are received from users through a Gateway, and events are stored on **NATS JetStream**; `edge-storage` `AggregateStore` holds versioned **snapshots** for efficient rehydration.
|
||||
|
||||
**Multi-Tenancy:**
|
||||
The Aggregate supports optional multi-tenancy via `tenant_id`. When enabled:
|
||||
- **Routing:** The Gateway routes commands to Aggregate nodes based on the `x-tenant-id` header
|
||||
- **Sharding:** Aggregate instances are sharded across nodes by `tenant_id`, ensuring tenant data isolation
|
||||
- **Storage:** Snapshots and events are namespaced by `tenant_id` to prevent cross-tenant access
|
||||
- **Subject Naming:** NATS subjects include `tenant_id` (e.g., `tenant.<tenant_id>.aggregate.<aggregate_type>.<aggregate_id>`)
|
||||
- **Backward Compatibility:** Aggregates without multi-tenancy use a default/empty `tenant_id`
|
||||
|
||||
**Dependencies:**
|
||||
* Core crates pulled from the custom Cargo registry:
|
||||
```toml
|
||||
[registries.madapes]
|
||||
index = "sparse+https://git.madapes.com/api/packages/madapes/cargo/"
|
||||
```
|
||||
|
||||
| Crate | Purpose |
|
||||
|-------|---------|
|
||||
| `edge-storage` | libmdbx-backed AggregateStore for versioned snapshots |
|
||||
| `runtime-function` | Deterministic DAG execution for `decide`/`apply` programs |
|
||||
| `edge-logger` | High-performance logging (UDS + Protobuf, Loki sink) |
|
||||
| `query-engine` | UQF query support for filtering/querying aggregate state |
|
||||
| `async-nats` | NATS JetStream client for event streaming |
|
||||
|
||||
* Source code available at `../../madapes/`
|
||||
* **Note:** This is a standalone container — it does not use `framework-bus` or `framework-aggregate` (those serve a different system)
|
||||
|
||||
**Observability:**
|
||||
* Production stack: **Grafana** + **Victoria Metrics** + **Loki**
|
||||
* `edge-logger` provides structured logging via Unix Domain Sockets with lock-free batching
|
||||
* Metrics exposed via `metrics-exporter-prometheus` for Victoria Metrics scraping
|
||||
* Traces/logs flow to Loki with cardinality protection and multi-tenant isolation
|
||||
|
||||
#### 1. Core Responsibilities
|
||||
* **Command Validation:** Receives intent (Commands) from the Gateway and uses `runtime-function` DAG programs to determine if the intent is valid based on the current state.
|
||||
* **State Rehydration:** Reconstructs its internal state by loading the latest **snapshot** from `edge-storage` `AggregateStore` (`get_latest_snapshot`) and replaying any subsequent events from NATS JetStream.
|
||||
* **Event Production:** Transforms valid commands into one or more Events that represent a "fact" that has occurred.
|
||||
* **Atomic Persistence:** Publishes new events to NATS JetStream and stores an updated snapshot in `edge-storage` `AggregateStore` (`put_snapshot_sync`).
|
||||
* **Concurrency Control:** Protects against "lost updates" using version-based optimistic locking. `edge-storage` `AggregateStore` returns `VersionConflict` for duplicate versions.
|
||||
|
||||
#### 2. The Lifecycle of a Command
|
||||
1. **Reception:** The Gateway routes a Command from a user to the Aggregate container based on the `aggregate_id` and `x-tenant-id` header. The `tenant_id` is extracted and included in the Command envelope for tenant-aware processing.
|
||||
2. **Loading (Rehydration):**
|
||||
* The Aggregate fetches the latest **Snapshot** from `edge-storage` `AggregateStore` using the composite key `(tenant_id, aggregate_id)`.
|
||||
* It reads any **Events** from NATS JetStream (tenant-namespaced subject) that occurred after the snapshot version.
|
||||
* It applies these events sequentially to the snapshot state using the deterministic `apply` runtime-function program to reach the "Current State."
|
||||
3. **Execution:**
|
||||
* The Aggregate passes the Current State and the Command to the `decide` runtime-function program.
|
||||
* If invalid: Returns an Error (Command Rejected).
|
||||
* If valid: Returns a list of New Events.
|
||||
4. **Persistence (The Commit):**
|
||||
* The Aggregate publishes New Events to NATS JetStream on tenant-namespaced subjects, with `command_id` mapped to `idempotency_key`.
|
||||
* It stores an updated snapshot in `edge-storage` `AggregateStore` using `(tenant_id, aggregate_id, new_version)` as the composite key.
|
||||
* **Constraint:** `AggregateStore` enforces strict monotonicity — if `new_version` already exists, it returns `VersionConflict`, and the Aggregate must reload and retry.
|
||||
5. **Publication:**
|
||||
* Events published to NATS JetStream are immediately available for downstream consumption by Sagas and Projections (filtered by tenant if needed).
|
||||
|
||||
#### 3. Technical Constraints & Guarantees
|
||||
* **Determinism:** The logic within an Aggregate must be 100% deterministic. `runtime-function` DAG programs are sandboxed and gas-metered, with no access to the system clock, random number generators, or external APIs. All data required for a decision must be present in the Command or the Aggregate State.
|
||||
* **Side-Effect Free:** An Aggregate does not send emails, update databases, or call other services. It only produces events. Side effects are the responsibility of Sagas.
|
||||
* **Single Writer:** While multiple nodes may attempt to process commands for the same `aggregate_id`, only one "Commit" can succeed for a specific version, enforced by `edge-storage` `AggregateStore` (`VersionConflict`).
|
||||
* **Tenant Isolation:** An Aggregate can only access data within its `tenant_id` scope. Cross-tenant access is blocked at the storage and stream layers. The `tenant_id` is validated on every command to prevent tenant spoofing.
|
||||
* **Isolation:** An Aggregate cannot see the state of other Aggregates. If a business rule spans multiple Aggregates, it must be handled by a **Saga**.
|
||||
|
||||
#### 4. Data Structure (The Envelope)
|
||||
Each Aggregate maintains a metadata header:
|
||||
* `tenant_id`: Optional identifier for multi-tenant isolation (routed via `x-tenant-id` header)
|
||||
* `aggregate_id`: Unique UUID or URN for the instance.
|
||||
* `aggregate_type`: The name of the business entity (e.g., `Account`, `Order`).
|
||||
* `version`: A monotonically increasing integer representing the number of events processed.
|
||||
* `snapshot_threshold`: A configuration defining how many events should trigger a new snapshot in `edge-storage`.
|
||||
|
||||
#### 5. Error Handling
|
||||
* **Validation Errors:** Business rule violations (e.g., "Insufficient Funds") result in an immediate synchronous rejection of the command.
|
||||
* **Tenant Access Errors:** Cross-tenant access attempts (e.g., wrong `tenant_id` in command) are rejected with `TenantAccessDenied`.
|
||||
* **Concurrency Conflicts:** If `edge-storage` returns `VersionConflict`, the framework implements an automatic "Retry-on-Conflict" policy (Reload → Re-validate → Re-commit) up to a defined limit.
|
||||
* **System Failures:** If `edge-storage` or NATS JetStream is unavailable, the Aggregate remains in a read-only or "unavailable" state to prevent inconsistent branching of the event stream.
|
||||
|
||||
#### 6. Horizontal Scaling Strategy
|
||||
The Aggregate container is designed for horizontal scaling on **Docker Swarm**, leveraging tenant-based sharding for predictable data locality and simple operations.
|
||||
|
||||
**Sharding Model:**
|
||||
- **Tenant-Aware Placement:** Aggregate instances are placed on Swarm nodes based on `tenant_id` using Docker Swarm placement constraints
|
||||
- **Consistent Hashing:** A hash ring maps `tenant_id` values to specific nodes, ensuring all commands for a tenant route to the same node (or replica set)
|
||||
- **Subject-Based Routing:** NATS JetStream consumer groups are tenant-namespaced, enabling parallel processing across tenants without coordination
|
||||
|
||||
**Scaling Architecture:**
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Admin UI (Control Node) │
|
||||
│ ┌─────────────────────────────────────────────────────────┐ │
|
||||
│ │ Scale Manager: CRUD for tenant → node assignments │ │
|
||||
│ │ - List tenants, node assignments, load metrics │ │
|
||||
│ │ - Add/remove nodes, migrate tenants │ │
|
||||
│ │ - Emit scaling commands to Docker Swarm API │ │
|
||||
│ └─────────────────────────────────────────────────────────┘ │
|
||||
└──────────────────────────┬──────────────────────────────────────┘
|
||||
│ Docker Swarm API / SSH
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Docker Swarm Cluster │
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ Node A │ │ Node B │ │ Node C │ │
|
||||
│ │ tenant: a-c │ │ tenant: d-m │ │ tenant: n-z │ │
|
||||
│ │ ┌────────┐ │ │ ┌────────┐ │ │ ┌────────┐ │ │
|
||||
│ │ │Agg Ctr │ │ │ │Agg Ctr │ │ │ │Agg Ctr │ │ │
|
||||
│ │ └───┬────┘ │ │ └───┬────┘ │ │ └───┬────┘ │ │
|
||||
│ │ │ │ │ │ │ │ │ │ │
|
||||
│ │ ┌───▼────┐ │ │ ┌───▼────┐ │ │ ┌───▼────┐ │ │
|
||||
│ │ │libmdbx │ │ │ │libmdbx │ │ │ │libmdbx │ │ │
|
||||
│ │ │(local) │ │ │ │(local) │ │ │ │(local) │ │ │
|
||||
│ │ └────────┘ │ │ └────────┘ │ │ └────────┘ │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||
│ │ │ │ │
|
||||
│ └──────────────────┴──────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌────────────────────────▼────────────────────────────────────┐ │
|
||||
│ │ Shared NATS JetStream Cluster │ │
|
||||
│ │ (tenant-namespaced subjects for isolation) │ │
|
||||
│ └─────────────────────────────────────────────────────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
**Note:** Each node has its own embedded `edge-storage` (libmdbx) containing snapshots for its assigned tenants. NATS JetStream provides shared event storage. Tenant migration requires snapshot data transfer between nodes.
|
||||
|
||||
**Operational Model:**
|
||||
- **Scale Up:** Admin UI calls Swarm API to add new node, updates tenant → node mapping, Gateway updates routing table
|
||||
- **Scale Down:** Migrate tenants to other nodes (drain), remove node from Swarm
|
||||
- **Tenant Migration:** Pause the tenant's consumer, transfer its snapshot data (local libmdbx) to the target node, update the tenant → node routing table, then resume consumption on the new node
|
||||
- **Zero-Downtime:** New tenant assignments are picked up by Gateway via config reload without restart
|
||||
|
||||
**Placement Constraints:**
|
||||
- Each Aggregate service runs with `--constraint node.labels.tenant_range==<range>`
|
||||
- Gateway uses tenant → node mapping to route commands to correct Swarm service endpoint
|
||||
- Multiple replicas per tenant range supported for HA (active-passive via NATS consumer groups)
|
||||
|
||||
**Admin Endpoints (per Aggregate container):**
|
||||
- `/health` - Container health (NATS, storage, active aggregates)
|
||||
- `/ready` - Readiness for receiving commands
|
||||
- `/metrics` - Prometheus metrics with tenant_id labels
|
||||
- `/admin/tenants` - List tenants hosted on this node (read-only)
|
||||
- `/admin/drain` - Graceful drain for tenant migration
|
||||
- `/admin/reload` - Hot-reload tenant placement config
|
||||
|
||||
**External Control Node:**
|
||||
- Separate service that calls Aggregate admin endpoints
|
||||
- Manages Docker Swarm API for scaling operations
|
||||
- Publishes tenant → node mapping to NATS KV
|
||||
- See Admin UI repository for full implementation
|
||||
|
||||
---
|
||||
|
||||
### 💡 Implementation Note:
|
||||
The **Aggregate Logic** is a pair of `runtime-function` DAG programs:
|
||||
1. **`decide` program**: `(state, command) → events[]` — The business logic (validates command, produces events).
|
||||
2. **`apply` program**: `(state, event) → new_state` — The state transition logic (used during rehydration from snapshots + events).
|
||||
|
||||
These are referenced in the manifest as `decide:` and `apply:` fields under each aggregate definition.
|
||||
32
aggregate/proto/aggregate.proto
Normal file
32
aggregate/proto/aggregate.proto
Normal file
@@ -0,0 +1,32 @@
|
||||
syntax = "proto3";

package aggregate.gateway.v1;

// Gateway-facing command API for the Aggregate container.
// The Gateway routes a tenant-scoped command here; the Aggregate validates it
// (decide program) and returns the resulting events synchronously.
service CommandService {
  rpc SubmitCommand(SubmitCommandRequest) returns (SubmitCommandResponse);
}

message SubmitCommandRequest {
  // Tenant scope for isolation/routing; may be empty in single-tenant mode.
  string tenant_id = 1;
  // Client-supplied id, mapped to the event idempotency_key on publish.
  string command_id = 2;
  string aggregate_id = 3;
  string aggregate_type = 4;
  // Command payload as a JSON document (schema is aggregate-type specific).
  string payload_json = 5;
  map<string, string> metadata = 6;
}

message Event {
  string event_id = 1;
  // The command this event was produced from (idempotency/causation link).
  string command_id = 2;
  string aggregate_id = 3;
  string aggregate_type = 4;
  // Monotonically increasing per-aggregate version (optimistic locking).
  uint64 version = 5;
  string event_type = 6;
  string payload_json = 7;
  string timestamp_rfc3339 = 8;
}

message SubmitCommandResponse {
  // Events produced by a valid command; empty when the command was a no-op.
  repeated Event events = 1;
}
|
||||
|
||||
2
aggregate/rustfmt.toml
Normal file
2
aggregate/rustfmt.toml
Normal file
@@ -0,0 +1,2 @@
|
||||
# rustfmt configuration for the aggregate crate.
edition = "2021"
newline_style = "Unix"
|
||||
487
aggregate/src/aggregate/handler.rs
Normal file
487
aggregate/src/aggregate/handler.rs
Normal file
@@ -0,0 +1,487 @@
|
||||
use super::AggregateInstance;
|
||||
use crate::query::{QueryClient, StateProjection};
|
||||
use crate::runtime::RuntimeExecutor;
|
||||
use crate::storage::StorageClient;
|
||||
use crate::stream::StreamClient;
|
||||
use crate::types::{
|
||||
AggregateError, AggregateId, AggregateType, Command, Event, Snapshot, TenantId, Version,
|
||||
};
|
||||
|
||||
/// Orchestrates the full command lifecycle for aggregates:
/// rehydrate state -> run `decide` -> persist events -> snapshot -> project.
#[derive(Debug, Clone)]
pub struct AggregateHandler {
    // Snapshot store (edge-storage backed; enforces version monotonicity).
    storage: StorageClient,
    // Event stream client (NATS JetStream in production, in-memory in tests).
    stream: StreamClient,
    // Deterministic runtime used to execute the decide/apply programs.
    executor: RuntimeExecutor,
    // Read-model sink; receives a StateProjection after each successful commit.
    query: QueryClient,
    // Source of the `decide` program: (state, command) -> events[].
    decide_program: String,
    // Source of the `apply` program: (state, event) -> new_state.
    apply_program: String,
    // Number of events past the last snapshot that triggers a new snapshot.
    snapshot_threshold: u64,
    // Upper bound on attempts in handle_command's retry-on-conflict loop.
    max_retries: u32,
}
|
||||
|
||||
impl AggregateHandler {
    /// Builds a handler with default settings: an embedded query client,
    /// snapshot_threshold = 10, max_retries = 3. Use the `with_*` builders
    /// to override.
    pub fn new(
        storage: StorageClient,
        stream: StreamClient,
        executor: RuntimeExecutor,
        decide_program: String,
        apply_program: String,
    ) -> Self {
        Self {
            storage,
            stream,
            executor,
            query: QueryClient::embedded(),
            decide_program,
            apply_program,
            snapshot_threshold: 10,
            max_retries: 3,
        }
    }

    /// Replaces the embedded query client (builder style).
    pub fn with_query_client(mut self, query: QueryClient) -> Self {
        self.query = query;
        self
    }

    /// Overrides how many events past the last snapshot trigger a new one.
    pub fn with_snapshot_threshold(mut self, threshold: u64) -> Self {
        self.snapshot_threshold = threshold;
        self
    }

    /// Overrides the retry budget; clamped to at least 1 so the command is
    /// always attempted once.
    pub fn with_max_retries(mut self, max_retries: u32) -> Self {
        self.max_retries = max_retries.max(1);
        self
    }

    /// Runs the full command lifecycle with a retry-on-conflict policy:
    /// reload state, re-run `decide`, and re-commit on `VersionConflict`,
    /// up to `max_retries` total attempts. A command that produces no events
    /// returns Ok immediately without touching storage.
    ///
    /// NOTE(review): snapshot save and projection happen after the events are
    /// published; an error in either propagates even though events are already
    /// committed — callers see an Err for a partially-completed commit.
    pub async fn handle_command(&self, command: Command) -> Result<Vec<Event>, AggregateError> {
        let mut attempt = 0;

        loop {
            attempt += 1;
            let tenant_id = &command.tenant_id;
            let aggregate_id = &command.aggregate_id;
            let aggregate_type = &command.aggregate_type;

            // Rehydrate fresh on every attempt so a retry sees the state
            // written by the conflicting winner.
            let instance = self
                .load_or_create_instance(tenant_id, aggregate_id, aggregate_type)
                .await?;

            let (instance, events) = self.execute_command(instance, command.clone()).await?;

            if events.is_empty() {
                return Ok(events);
            }

            match self.persist_events(&events).await {
                Ok(()) => {
                    self.maybe_save_snapshot(&instance).await?;
                    self.project_state(&instance).await?;
                    return Ok(events);
                }
                // Lost the optimistic-lock race: retry while budget remains.
                Err(AggregateError::VersionConflict { .. }) if attempt < self.max_retries => {
                    continue;
                }
                Err(e) => return Err(e),
            }
        }
    }

    /// Rehydrates an aggregate: latest snapshot (if any) plus events after it.
    /// With no snapshot and no events, a brand-new instance is created; with
    /// events but no snapshot, replay starts from a Null-state initial
    /// snapshot at the initial version.
    async fn load_or_create_instance(
        &self,
        tenant_id: &TenantId,
        aggregate_id: &AggregateId,
        aggregate_type: &AggregateType,
    ) -> Result<AggregateInstance, AggregateError> {
        let snapshot = self.storage.get_snapshot(tenant_id, aggregate_id).await?;

        match snapshot {
            Some(snapshot) => {
                // Only replay events newer than the snapshot's version.
                let events = self
                    .stream
                    .fetch_events(tenant_id, aggregate_id, snapshot.version)
                    .await?;

                AggregateInstance::rehydrate_with_executor(
                    tenant_id.clone(),
                    snapshot,
                    events,
                    self.decide_program.clone(),
                    self.apply_program.clone(),
                    &self.executor,
                )
                .await
            }
            None => {
                let events = self
                    .stream
                    .fetch_events(tenant_id, aggregate_id, Version::initial())
                    .await?;

                if events.is_empty() {
                    // Truly new aggregate: no persisted history at all.
                    Ok(AggregateInstance::new(
                        aggregate_id.clone(),
                        aggregate_type.clone(),
                        tenant_id.clone(),
                        self.decide_program.clone(),
                        self.apply_program.clone(),
                    ))
                } else {
                    // Events exist but no snapshot was ever written: replay
                    // everything from an empty (Null) initial state.
                    let initial_snapshot = Snapshot::new(
                        tenant_id.clone(),
                        aggregate_id.clone(),
                        aggregate_type.clone(),
                        Version::initial(),
                        serde_json::Value::Null,
                    );

                    AggregateInstance::rehydrate_with_executor(
                        tenant_id.clone(),
                        initial_snapshot,
                        events,
                        self.decide_program.clone(),
                        self.apply_program.clone(),
                        &self.executor,
                    )
                    .await
                }
            }
        }
    }

    /// Runs `decide` (and applies resulting events) on the instance, returning
    /// the mutated instance together with the produced events.
    async fn execute_command(
        &self,
        mut instance: AggregateInstance,
        command: Command,
    ) -> Result<(AggregateInstance, Vec<Event>), AggregateError> {
        let events = instance.handle_command(command, &self.executor).await?;

        Ok((instance, events))
    }

    /// Indexes the instance's current state into the query client so it is
    /// immediately readable; query-side failures surface as StorageError.
    async fn project_state(&self, instance: &AggregateInstance) -> Result<(), AggregateError> {
        let projection = StateProjection::default_projection_from_state(
            instance.tenant_id(),
            instance.aggregate_id(),
            instance.aggregate_type(),
            &instance.version(),
            instance.state(),
        );

        self.query
            .index(projection)
            .await
            .map_err(|e| AggregateError::StorageError(e.to_string()))
    }

    /// Writes a snapshot when enough events have accumulated since the last
    /// one. A VersionConflict here is deliberately ignored: a concurrent
    /// writer already stored an equivalent (or newer) snapshot.
    async fn maybe_save_snapshot(
        &self,
        instance: &AggregateInstance,
    ) -> Result<(), AggregateError> {
        let current_version = instance.version();
        let events_since_snapshot = current_version
            .as_u64()
            .saturating_sub(instance.snapshot_version().as_u64());

        if events_since_snapshot >= self.snapshot_threshold {
            let snapshot = instance.to_snapshot();
            match self.storage.put_snapshot(&snapshot).await {
                Ok(()) => {}
                Err(AggregateError::VersionConflict { .. }) => {}
                Err(e) => return Err(e),
            }
        }

        Ok(())
    }

    /// Publishes the batch of new events to the stream; this is the commit
    /// point where optimistic-concurrency conflicts are detected.
    async fn persist_events(&self, events: &[Event]) -> Result<(), AggregateError> {
        self.stream.publish_events(events.to_vec()).await
    }

    /// Read-only rehydration of an aggregate (no command execution).
    pub async fn load_aggregate(
        &self,
        tenant_id: &TenantId,
        aggregate_id: &AggregateId,
        aggregate_type: &AggregateType,
    ) -> Result<AggregateInstance, AggregateError> {
        self.load_or_create_instance(tenant_id, aggregate_id, aggregate_type)
            .await
    }

    /// Accessor for the snapshot store.
    pub fn storage(&self) -> &StorageClient {
        &self.storage
    }

    /// Accessor for the event stream client.
    pub fn stream(&self) -> &StreamClient {
        &self.stream
    }

    /// Accessor for the runtime executor.
    pub fn executor(&self) -> &RuntimeExecutor {
        &self.executor
    }

    /// Accessor for the query client.
    pub fn query_client(&self) -> &QueryClient {
        &self.query
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    // Builds a handler backed by a temp libmdbx file, an in-memory stream,
    // and the mock runtime. The TempDir is returned so the storage path
    // outlives the handler.
    async fn create_test_handler() -> (tempfile::TempDir, AggregateHandler) {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.mdbx");
        let storage = StorageClient::open(path.to_string_lossy().to_string()).unwrap();

        let stream = StreamClient::in_memory();
        let executor = RuntimeExecutor::with_config(
            crate::runtime::ExecutorConfig::default().with_mock_runtime(),
        );

        let handler = AggregateHandler::new(
            storage,
            stream,
            executor,
            "function decide(s,c) { return []; }".to_string(),
            "function apply(s,e) { return s; }".to_string(),
        );

        (dir, handler)
    }

    // Compile-time check: the handler can be shared across async tasks.
    #[test]
    fn handler_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<AggregateHandler>();
    }

    // new() defaults snapshot_threshold to 10; the builder overrides it.
    #[test]
    fn snapshot_threshold_defaults_to_10() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.mdbx");
        let storage = StorageClient::open(path.to_string_lossy().to_string()).unwrap();
        let stream = StreamClient::in_memory();

        let executor = RuntimeExecutor::new();

        let handler = AggregateHandler::new(
            storage,
            stream,
            executor,
            "decide".to_string(),
            "apply".to_string(),
        );

        let handler_with_threshold = AggregateHandler::new(
            handler.storage.clone(),
            handler.stream.clone(),
            handler.executor.clone(),
            "decide".to_string(),
            "apply".to_string(),
        )
        .with_snapshot_threshold(25);

        assert_eq!(handler.snapshot_threshold, 10);
        assert_eq!(handler_with_threshold.snapshot_threshold, 25);
    }

    // End-to-end happy path: one command produces one event, and with
    // threshold 1 a snapshot at version 1 is written immediately.
    #[tokio::test]
    async fn handler_full_lifecycle_persists_events_and_snapshot() {
        let (_dir, handler) = create_test_handler().await;
        let handler = handler.with_snapshot_threshold(1);

        let tenant_id = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();
        let aggregate_type = AggregateType::from("Account");

        let command = Command::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            aggregate_type.clone(),
            serde_json::json!({"type": "deposit", "amount": 50}),
        );

        let events = handler.handle_command(command).await.unwrap();
        assert_eq!(events.len(), 1);

        let snapshot = handler
            .storage
            .get_snapshot(&tenant_id, &aggregate_id)
            .await
            .unwrap()
            .unwrap();
        assert_eq!(snapshot.version, Version::from(1));
    }

    // Two concurrent commands on the same aggregate: the loser of the
    // optimistic-lock race must retry, and both end up committed with
    // strictly increasing versions.
    #[tokio::test]
    async fn retry_on_version_conflict() {
        let (_dir, handler) = create_test_handler().await;
        let handler = handler.with_max_retries(5);

        let tenant_id = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();
        let aggregate_type = AggregateType::from("Account");

        let cmd1 = Command::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            aggregate_type.clone(),
            serde_json::json!({"type": "deposit", "amount": 10}),
        );
        let cmd2 = Command::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            aggregate_type.clone(),
            serde_json::json!({"type": "deposit", "amount": 20}),
        );

        let (r1, r2) = tokio::join!(handler.handle_command(cmd1), handler.handle_command(cmd2));
        assert!(r1.is_ok());
        assert!(r2.is_ok());

        let events = handler
            .stream
            .fetch_events(&tenant_id, &aggregate_id, Version::initial())
            .await
            .unwrap();
        assert_eq!(events.len(), 2);
        assert_eq!(events[0].version, Version::from(1));
        assert_eq!(events[1].version, Version::from(2));
    }

    // With threshold 3 and 5 commands, only the version-3 snapshot is
    // written (versions 4 and 5 have not yet accumulated 3 new events).
    #[tokio::test]
    async fn snapshot_threshold_respected() {
        let (_dir, handler) = create_test_handler().await;
        let handler = handler.with_snapshot_threshold(3);

        let tenant_id = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();
        let aggregate_type = AggregateType::from("Account");

        for _ in 0..5 {
            let cmd = Command::new(
                tenant_id.clone(),
                aggregate_id.clone(),
                aggregate_type.clone(),
                serde_json::json!({"type": "deposit", "amount": 1}),
            );
            handler.handle_command(cmd).await.unwrap();
        }

        let snapshot = handler
            .storage
            .get_snapshot(&tenant_id, &aggregate_id)
            .await
            .unwrap()
            .unwrap();
        assert_eq!(snapshot.version, Version::from(3));
    }

    // Backward compatibility: a default (empty) tenant_id flows through the
    // whole pipeline, including the query projection.
    #[tokio::test]
    async fn empty_tenant_id_allowed_in_single_tenant_mode() {
        let (_dir, handler) = create_test_handler().await;
        let handler = handler.with_snapshot_threshold(1);

        let tenant_id = TenantId::default();
        let aggregate_id = AggregateId::new_v7();
        let aggregate_type = AggregateType::from("Account");

        let command = Command::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            aggregate_type.clone(),
            serde_json::json!({"type": "deposit", "amount": 5}),
        );

        let events = handler.handle_command(command).await.unwrap();
        assert_eq!(events.len(), 1);

        let proj = handler
            .query
            .get(&tenant_id, &aggregate_id.to_string())
            .await
            .unwrap()
            .unwrap();
        assert_eq!(proj.state["balance"], 5);
    }

    // The projection indexed after commit reflects the post-command state.
    #[tokio::test]
    async fn query_returns_correct_aggregate_state() {
        let (_dir, handler) = create_test_handler().await;
        let handler = handler.with_snapshot_threshold(1);

        let tenant_id = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();
        let aggregate_type = AggregateType::from("Account");

        let command = Command::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            aggregate_type,
            serde_json::json!({"type": "deposit", "amount": 100}),
        );
        handler.handle_command(command).await.unwrap();

        let proj = handler
            .query
            .get(&tenant_id, &aggregate_id.to_string())
            .await
            .unwrap()
            .unwrap();
        assert_eq!(proj.state["balance"], 100);
    }

    // Simulated restart: a second handler over the same storage path and
    // stream rehydrates the aggregate to its committed state.
    #[tokio::test]
    async fn system_failure_recovery_rehydrates_state() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.mdbx");
        let storage = StorageClient::open(path.to_string_lossy().to_string()).unwrap();
        let stream = StreamClient::in_memory();
        let executor = RuntimeExecutor::with_config(
            crate::runtime::ExecutorConfig::default().with_mock_runtime(),
        );

        let handler1 = AggregateHandler::new(
            storage,
            stream.clone(),
            executor.clone(),
            "decide".to_string(),
            "apply".to_string(),
        )
        .with_snapshot_threshold(2);

        let tenant_id = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();
        let aggregate_type = AggregateType::from("Account");

        for _ in 0..2 {
            let cmd = Command::new(
                tenant_id.clone(),
                aggregate_id.clone(),
                aggregate_type.clone(),
                serde_json::json!({"type": "deposit", "amount": 10}),
            );
            handler1.handle_command(cmd).await.unwrap();
        }

        // Drop the first handler to release the storage before reopening.
        drop(handler1);

        let storage2 = StorageClient::open(path.to_string_lossy().to_string()).unwrap();
        let handler2 = AggregateHandler::new(
            storage2,
            stream,
            executor,
            "decide".to_string(),
            "apply".to_string(),
        );

        let loaded = handler2
            .load_aggregate(&tenant_id, &aggregate_id, &aggregate_type)
            .await
            .unwrap();
        assert_eq!(loaded.state()["balance"], 20);
    }
}
|
||||
5
aggregate/src/aggregate/mod.rs
Normal file
5
aggregate/src/aggregate/mod.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
mod handler;
|
||||
mod state;
|
||||
|
||||
pub use handler::*;
|
||||
pub use state::*;
|
||||
448
aggregate/src/aggregate/state.rs
Normal file
448
aggregate/src/aggregate/state.rs
Normal file
@@ -0,0 +1,448 @@
|
||||
use crate::runtime::RuntimeExecutor;
|
||||
use crate::types::{
|
||||
AggregateError, AggregateId, AggregateType, Command, Event, Snapshot, TenantId, Version,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use std::collections::HashSet;
|
||||
use uuid::Uuid;
|
||||
|
||||
/// In-memory representation of a single event-sourced aggregate.
///
/// Holds the materialized `state` produced by replaying events (optionally
/// starting from a snapshot), plus the source text of the `decide`/`apply`
/// programs executed through the `RuntimeExecutor`.
#[derive(Debug, Clone)]
pub struct AggregateInstance {
    aggregate_id: AggregateId,
    aggregate_type: AggregateType,
    tenant_id: TenantId,
    // Version captured by the snapshot this instance was last built from.
    snapshot_version: Version,
    // Current version; advanced once per applied event.
    version: Version,
    // Materialized aggregate state; `Value::Null` until an event is applied.
    state: Value,
    decide_program: String,
    apply_program: String,
    // Command ids already applied; used to make command handling idempotent.
    // NOTE(review): grows without bound for long-lived instances — confirm
    // instances are evicted/recreated elsewhere.
    processed_command_ids: HashSet<Uuid>,
}
|
||||
|
||||
impl AggregateInstance {
|
||||
pub fn new(
|
||||
aggregate_id: AggregateId,
|
||||
aggregate_type: AggregateType,
|
||||
tenant_id: TenantId,
|
||||
decide_program: String,
|
||||
apply_program: String,
|
||||
) -> Self {
|
||||
Self {
|
||||
aggregate_id,
|
||||
aggregate_type,
|
||||
tenant_id,
|
||||
snapshot_version: Version::initial(),
|
||||
version: Version::initial(),
|
||||
state: Value::Null,
|
||||
decide_program,
|
||||
apply_program,
|
||||
processed_command_ids: HashSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn aggregate_id(&self) -> &AggregateId {
|
||||
&self.aggregate_id
|
||||
}
|
||||
|
||||
pub fn aggregate_type(&self) -> &AggregateType {
|
||||
&self.aggregate_type
|
||||
}
|
||||
|
||||
pub fn tenant_id(&self) -> &TenantId {
|
||||
&self.tenant_id
|
||||
}
|
||||
|
||||
pub fn version(&self) -> Version {
|
||||
self.version
|
||||
}
|
||||
|
||||
pub fn snapshot_version(&self) -> Version {
|
||||
self.snapshot_version
|
||||
}
|
||||
|
||||
pub fn state(&self) -> &Value {
|
||||
&self.state
|
||||
}
|
||||
|
||||
pub fn rehydrate(
|
||||
tenant_id: TenantId,
|
||||
snapshot: Snapshot,
|
||||
events: Vec<Event>,
|
||||
decide_program: String,
|
||||
apply_program: String,
|
||||
) -> Result<Self, AggregateError> {
|
||||
if snapshot.tenant_id != tenant_id {
|
||||
return Err(AggregateError::TenantAccessDenied {
|
||||
tenant_id: snapshot.tenant_id,
|
||||
});
|
||||
}
|
||||
|
||||
let mut instance = Self {
|
||||
aggregate_id: snapshot.aggregate_id,
|
||||
aggregate_type: snapshot.aggregate_type,
|
||||
tenant_id,
|
||||
snapshot_version: snapshot.version,
|
||||
version: snapshot.version,
|
||||
state: snapshot.state,
|
||||
decide_program,
|
||||
apply_program,
|
||||
processed_command_ids: HashSet::new(),
|
||||
};
|
||||
|
||||
for event in events {
|
||||
instance.apply_event_internal(&event)?;
|
||||
}
|
||||
|
||||
Ok(instance)
|
||||
}
|
||||
|
||||
pub async fn rehydrate_with_executor(
|
||||
tenant_id: TenantId,
|
||||
snapshot: Snapshot,
|
||||
events: Vec<Event>,
|
||||
decide_program: String,
|
||||
apply_program: String,
|
||||
executor: &RuntimeExecutor,
|
||||
) -> Result<Self, AggregateError> {
|
||||
if snapshot.tenant_id != tenant_id {
|
||||
return Err(AggregateError::TenantAccessDenied {
|
||||
tenant_id: snapshot.tenant_id,
|
||||
});
|
||||
}
|
||||
|
||||
let mut instance = Self {
|
||||
aggregate_id: snapshot.aggregate_id,
|
||||
aggregate_type: snapshot.aggregate_type,
|
||||
tenant_id,
|
||||
snapshot_version: snapshot.version,
|
||||
version: snapshot.version,
|
||||
state: snapshot.state,
|
||||
decide_program,
|
||||
apply_program,
|
||||
processed_command_ids: HashSet::new(),
|
||||
};
|
||||
|
||||
for event in events {
|
||||
let apply_result = executor
|
||||
.execute_apply(&instance.state, &event, &instance.apply_program)
|
||||
.await?;
|
||||
instance.state = apply_result.new_state;
|
||||
instance.apply_event_internal(&event)?;
|
||||
}
|
||||
|
||||
Ok(instance)
|
||||
}
|
||||
|
||||
fn apply_event_internal(&mut self, event: &Event) -> Result<(), AggregateError> {
|
||||
if event.tenant_id != self.tenant_id {
|
||||
return Err(AggregateError::TenantAccessDenied {
|
||||
tenant_id: event.tenant_id.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
self.processed_command_ids.insert(event.command_id);
|
||||
self.version = event.version;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn apply_event(&mut self, event: &Event) -> Result<(), AggregateError> {
|
||||
self.apply_event_internal(event)
|
||||
}
|
||||
|
||||
pub async fn handle_command(
|
||||
&mut self,
|
||||
command: Command,
|
||||
executor: &RuntimeExecutor,
|
||||
) -> Result<Vec<Event>, AggregateError> {
|
||||
if command.tenant_id != self.tenant_id {
|
||||
return Err(AggregateError::TenantAccessDenied {
|
||||
tenant_id: command.tenant_id,
|
||||
});
|
||||
}
|
||||
|
||||
if command.aggregate_id != self.aggregate_id {
|
||||
return Err(AggregateError::NotFound(command.aggregate_id));
|
||||
}
|
||||
|
||||
if self.processed_command_ids.contains(&command.command_id) {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let decide_result = executor
|
||||
.execute_decide(&self.state, &command, &self.decide_program)
|
||||
.await?;
|
||||
|
||||
let command_id = command.command_id;
|
||||
let correlation_id = command
|
||||
.metadata
|
||||
.get("correlation_id")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| s.to_string());
|
||||
let traceparent = command
|
||||
.metadata
|
||||
.get("traceparent")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| s.to_string());
|
||||
let mut events = Vec::with_capacity(decide_result.events.len());
|
||||
|
||||
for event_payload in decide_result.events {
|
||||
let event_type = event_payload
|
||||
.get("type")
|
||||
.and_then(|t| t.as_str())
|
||||
.unwrap_or("Unknown")
|
||||
.to_string();
|
||||
|
||||
let new_version = self.version.increment();
|
||||
let mut event = Event::new(
|
||||
self.tenant_id.clone(),
|
||||
self.aggregate_id.clone(),
|
||||
self.aggregate_type.clone(),
|
||||
new_version,
|
||||
event_type,
|
||||
event_payload,
|
||||
command_id,
|
||||
);
|
||||
event.correlation_id = correlation_id.clone();
|
||||
event.traceparent = traceparent.clone();
|
||||
|
||||
let apply_result = executor
|
||||
.execute_apply(&self.state, &event, &self.apply_program)
|
||||
.await?;
|
||||
self.state = apply_result.new_state;
|
||||
self.version = new_version;
|
||||
|
||||
events.push(event);
|
||||
}
|
||||
|
||||
self.processed_command_ids.insert(command_id);
|
||||
Ok(events)
|
||||
}
|
||||
|
||||
pub fn to_snapshot(&self) -> Snapshot {
|
||||
Snapshot::new(
|
||||
self.tenant_id.clone(),
|
||||
self.aggregate_id.clone(),
|
||||
self.aggregate_type.clone(),
|
||||
self.version,
|
||||
self.state.clone(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // Fresh instance with trivial decide/apply program sources. The programs
    // are only executed when an executor is involved.
    fn test_instance() -> AggregateInstance {
        AggregateInstance::new(
            AggregateId::new_v7(),
            AggregateType::new("Account"),
            TenantId::new("tenant-a"),
            "function decide(s,c) { return []; }".to_string(),
            "function apply(s,e) { return s; }".to_string(),
        )
    }

    #[test]
    fn aggregate_instance_has_id_and_tenant() {
        let agg = test_instance();
        assert_eq!(agg.tenant_id().as_str(), "tenant-a");
        assert_eq!(agg.aggregate_type().as_str(), "Account");
        assert!(!agg.aggregate_id().to_string().is_empty());
    }

    #[test]
    fn new_instance_starts_at_version_zero() {
        let agg = test_instance();
        assert_eq!(agg.version(), Version::initial());
    }

    #[test]
    fn rehydrate_validates_tenant() {
        // Snapshot belongs to tenant-a but rehydration is attempted as
        // tenant-b; the error carries the snapshot's owning tenant.
        let snapshot = Snapshot::new(
            TenantId::new("tenant-a"),
            AggregateId::new_v7(),
            AggregateType::new("Account"),
            Version::from(5),
            json!({ "balance": 100 }),
        );

        let result = AggregateInstance::rehydrate(
            TenantId::new("tenant-b"),
            snapshot,
            vec![],
            "decide".to_string(),
            "apply".to_string(),
        );

        assert!(result.is_err());
        match result.unwrap_err() {
            AggregateError::TenantAccessDenied { tenant_id } => {
                assert_eq!(tenant_id, TenantId::new("tenant-a"));
            }
            _ => panic!("Expected TenantAccessDenied"),
        }
    }

    #[tokio::test]
    async fn rehydrate_applies_events() {
        let tenant_id = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();
        let aggregate_type = AggregateType::new("Account");

        // Snapshot at version 2 with balance 100, followed by a deposit of 50
        // (v3) and a withdrawal of 25 (v4) => expected balance 125.
        let snapshot = Snapshot::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            aggregate_type.clone(),
            Version::from(2),
            json!({ "balance": 100 }),
        );

        let event1 = Event::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            aggregate_type.clone(),
            Version::from(3),
            "deposited",
            json!({ "amount": 50 }),
            Uuid::now_v7(),
        );

        let event2 = Event::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            aggregate_type.clone(),
            Version::from(4),
            "withdrawn",
            json!({ "amount": 25 }),
            Uuid::now_v7(),
        );

        // The mock runtime supplies the apply semantics used here.
        let executor = RuntimeExecutor::with_config(
            crate::runtime::ExecutorConfig::default().with_mock_runtime(),
        );
        let instance = AggregateInstance::rehydrate_with_executor(
            tenant_id,
            snapshot,
            vec![event1, event2],
            "decide".to_string(),
            "apply".to_string(),
            &executor,
        )
        .await
        .unwrap();

        assert_eq!(instance.version(), Version::from(4));
        assert_eq!(instance.state()["balance"], 125);
    }

    #[test]
    fn to_snapshot_captures_state() {
        let mut agg = test_instance();
        agg.state = json!({ "balance": 150 });
        agg.version = Version::from(3);

        let snapshot = agg.to_snapshot();
        assert_eq!(snapshot.state, json!({ "balance": 150 }));
        assert_eq!(snapshot.version, Version::from(3));
    }

    #[tokio::test]
    async fn idempotency_via_command_id_returns_empty() {
        let tenant_id = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();
        let aggregate_type = AggregateType::new("Account");
        let command_id = Uuid::now_v7();

        let mut agg = AggregateInstance::new(
            aggregate_id.clone(),
            aggregate_type.clone(),
            tenant_id.clone(),
            "decide".to_string(),
            "apply".to_string(),
        );

        // Record an event carrying `command_id` so the command below is
        // considered already processed.
        let e = Event::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            aggregate_type.clone(),
            Version::from(1),
            "deposited",
            json!({ "amount": 10 }),
            command_id,
        );
        agg.apply_event(&e).unwrap();
        let before_version = agg.version();

        let mut cmd = Command::new(
            tenant_id,
            aggregate_id,
            aggregate_type,
            json!({ "type": "deposit", "amount": 10 }),
        );
        cmd.command_id = command_id;

        // A default executor suffices: the idempotency check short-circuits
        // before `decide` would run.
        let executor = RuntimeExecutor::new();
        let events = agg.handle_command(cmd, &executor).await.unwrap();
        assert!(events.is_empty());
        assert_eq!(agg.version(), before_version);
    }

    #[tokio::test]
    async fn handle_command_validates_tenant() {
        let agg = AggregateInstance::new(
            AggregateId::new_v7(),
            AggregateType::new("Account"),
            TenantId::new("tenant-a"),
            "decide".to_string(),
            "apply".to_string(),
        );

        // Command from the wrong tenant must be rejected before execution.
        let command = Command::new(
            TenantId::new("tenant-b"),
            agg.aggregate_id.clone(),
            AggregateType::new("Account"),
            json!({ "type": "deposit", "amount": 50 }),
        );

        let executor = RuntimeExecutor::new();
        let mut agg = agg;
        let result = agg.handle_command(command, &executor).await;

        assert!(result.is_err());
        match result.unwrap_err() {
            AggregateError::TenantAccessDenied { .. } => {}
            _ => panic!("Expected TenantAccessDenied"),
        }
    }

    #[tokio::test]
    async fn handle_command_validates_aggregate_id() {
        let agg = test_instance();
        let wrong_agg_id = AggregateId::new_v7();

        // Command addressed to a different aggregate id => NotFound.
        let command = Command::new(
            agg.tenant_id.clone(),
            wrong_agg_id,
            AggregateType::new("Account"),
            json!({ "type": "deposit", "amount": 50 }),
        );

        let executor = RuntimeExecutor::new();
        let mut agg = agg;
        let result = agg.handle_command(command, &executor).await;

        assert!(result.is_err());
        match result.unwrap_err() {
            AggregateError::NotFound(_) => {}
            _ => panic!("Expected NotFound"),
        }
    }

    #[test]
    fn instance_is_send_sync() {
        // Compile-time guarantee that instances can cross thread boundaries.
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<AggregateInstance>();
    }
}
|
||||
3
aggregate/src/config/mod.rs
Normal file
3
aggregate/src/config/mod.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
mod settings;
|
||||
|
||||
pub use settings::*;
|
||||
274
aggregate/src/config/settings.rs
Normal file
274
aggregate/src/config/settings.rs
Normal file
@@ -0,0 +1,274 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::Path;
|
||||
|
||||
/// Runtime configuration for the aggregate service.
///
/// `#[serde(default)]` lets config files specify only a subset of fields;
/// everything else falls back to `Default::default()`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct Settings {
    // NATS server URL for event streaming.
    pub nats_url: String,
    // Directory/file path for local aggregate storage.
    pub storage_path: String,
    // Optional socket address for the external logger; `None` disables it.
    pub logger_socket: Option<String>,
    // Number of events between snapshots.
    pub snapshot_threshold: u64,
    pub max_retries: u32,
    // When true, every request must resolve to a tenant id.
    pub multi_tenant_enabled: bool,
    // Fallback tenant for requests that carry none (multi-tenant mode).
    pub default_tenant_id: Option<String>,
    pub shard_id: String,
    // NATS KV bucket/key holding tenant placement data.
    pub placement_bucket: String,
    pub placement_key: String,
    pub grpc_addr: String,
    // Source text of the decide/apply programs run by the executor.
    pub decide_program: String,
    pub apply_program: String,
}
|
||||
|
||||
impl Default for Settings {
    /// Local-development defaults; also the fallback for any field a config
    /// file omits (via `#[serde(default)]` on `Settings`).
    fn default() -> Self {
        Self {
            nats_url: "nats://localhost:4222".to_string(),
            storage_path: "./data".to_string(),
            logger_socket: None,
            snapshot_threshold: 10,
            max_retries: 3,
            multi_tenant_enabled: true,
            default_tenant_id: None,
            shard_id: "local".to_string(),
            placement_bucket: "AGGREGATE_PLACEMENT".to_string(),
            placement_key: "aggregate_placement".to_string(),
            grpc_addr: "0.0.0.0:50051".to_string(),
            // No-op programs: decide emits no events, apply returns state.
            decide_program: "function decide(state, command) { return []; }".to_string(),
            apply_program: "function apply(state, event) { return state; }".to_string(),
        }
    }
}
|
||||
|
||||
impl Settings {
|
||||
pub fn from_env() -> Result<Self, std::env::VarError> {
|
||||
let mut settings = Self::default();
|
||||
settings.apply_env_overrides();
|
||||
Ok(settings)
|
||||
}
|
||||
|
||||
pub fn from_yaml(yaml: &str) -> Result<Self, serde_yaml::Error> {
|
||||
serde_yaml::from_str(yaml)
|
||||
}
|
||||
|
||||
pub fn from_toml(toml_str: &str) -> Result<Self, toml::de::Error> {
|
||||
toml::from_str(toml_str)
|
||||
}
|
||||
|
||||
pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
|
||||
serde_json::from_str(json)
|
||||
}
|
||||
|
||||
pub fn from_file(path: impl AsRef<Path>) -> Result<Self, SettingsLoadError> {
|
||||
let path = path.as_ref();
|
||||
let raw = std::fs::read_to_string(path)?;
|
||||
let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
|
||||
|
||||
match ext {
|
||||
"yaml" | "yml" => Ok(Self::from_yaml(&raw)?),
|
||||
"toml" => Ok(Self::from_toml(&raw)?),
|
||||
"json" => Ok(Self::from_json(&raw)?),
|
||||
_ => Err(SettingsLoadError::UnsupportedFormat {
|
||||
path: path.display().to_string(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn load_from_file_with_env_overrides(
|
||||
path: impl AsRef<Path>,
|
||||
) -> Result<Self, SettingsLoadError> {
|
||||
let mut settings = Self::from_file(path)?;
|
||||
settings.apply_env_overrides();
|
||||
Ok(settings)
|
||||
}
|
||||
|
||||
fn apply_env_overrides(&mut self) {
|
||||
if let Ok(url) = std::env::var("AGGREGATE_NATS_URL") {
|
||||
self.nats_url = url;
|
||||
}
|
||||
|
||||
if let Ok(path) = std::env::var("AGGREGATE_STORAGE_PATH") {
|
||||
self.storage_path = path;
|
||||
}
|
||||
|
||||
if let Ok(socket) = std::env::var("AGGREGATE_LOGGER_SOCKET") {
|
||||
self.logger_socket = Some(socket);
|
||||
}
|
||||
|
||||
if let Ok(threshold) = std::env::var("AGGREGATE_SNAPSHOT_THRESHOLD") {
|
||||
if let Ok(value) = threshold.parse() {
|
||||
self.snapshot_threshold = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(retries) = std::env::var("AGGREGATE_MAX_RETRIES") {
|
||||
if let Ok(value) = retries.parse() {
|
||||
self.max_retries = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(enabled) = std::env::var("AGGREGATE_MULTI_TENANT") {
|
||||
if let Ok(value) = enabled.parse() {
|
||||
self.multi_tenant_enabled = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(default_tenant_id) = std::env::var("AGGREGATE_DEFAULT_TENANT_ID") {
|
||||
if default_tenant_id.is_empty() {
|
||||
self.default_tenant_id = None;
|
||||
} else {
|
||||
self.default_tenant_id = Some(default_tenant_id);
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(shard_id) = std::env::var("AGGREGATE_SHARD_ID") {
|
||||
if !shard_id.is_empty() {
|
||||
self.shard_id = shard_id;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(bucket) = std::env::var("AGGREGATE_PLACEMENT_BUCKET") {
|
||||
if !bucket.is_empty() {
|
||||
self.placement_bucket = bucket;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(key) = std::env::var("AGGREGATE_PLACEMENT_KEY") {
|
||||
if !key.is_empty() {
|
||||
self.placement_key = key;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(addr) = std::env::var("AGGREGATE_GRPC_ADDR") {
|
||||
if !addr.is_empty() {
|
||||
self.grpc_addr = addr;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(program) = std::env::var("AGGREGATE_DECIDE_PROGRAM") {
|
||||
if !program.is_empty() {
|
||||
self.decide_program = program;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(program) = std::env::var("AGGREGATE_APPLY_PROGRAM") {
|
||||
if !program.is_empty() {
|
||||
self.apply_program = program;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(path) = std::env::var("AGGREGATE_DECIDE_PROGRAM_PATH") {
|
||||
if let Ok(raw) = std::fs::read_to_string(path) {
|
||||
if !raw.is_empty() {
|
||||
self.decide_program = raw;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(path) = std::env::var("AGGREGATE_APPLY_PROGRAM_PATH") {
|
||||
if let Ok(raw) = std::fs::read_to_string(path) {
|
||||
if !raw.is_empty() {
|
||||
self.apply_program = raw;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn validate(&self) -> Result<(), String> {
|
||||
if self.nats_url.is_empty() {
|
||||
return Err("NATS URL is required".to_string());
|
||||
}
|
||||
|
||||
if self.storage_path.is_empty() {
|
||||
return Err("Storage path is required".to_string());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors produced while loading `Settings` from a file.
#[derive(Debug, thiserror::Error)]
pub enum SettingsLoadError {
    #[error("Failed to read config file: {0}")]
    Io(#[from] std::io::Error),
    #[error("Failed to parse YAML config: {0}")]
    Yaml(#[from] serde_yaml::Error),
    #[error("Failed to parse TOML config: {0}")]
    Toml(#[from] toml::de::Error),
    #[error("Failed to parse JSON config: {0}")]
    Json(#[from] serde_json::Error),
    // Raised when the file extension is not yaml/yml/toml/json.
    #[error("Unsupported config format: {path}")]
    UnsupportedFormat { path: String },
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    #[test]
    fn settings_from_env() {
        // NOTE(review): set_var/remove_var mutate process-global state; these
        // env tests race if the harness runs them in parallel — confirm they
        // are serialized (e.g. --test-threads=1 or a serialization helper).
        std::env::set_var("AGGREGATE_NATS_URL", "nats://localhost:4222");
        let settings = Settings::from_env().unwrap();
        assert_eq!(settings.nats_url, "nats://localhost:4222");
        std::env::remove_var("AGGREGATE_NATS_URL");
    }

    #[test]
    fn settings_validation() {
        // An empty NATS URL must fail validation.
        let settings = Settings {
            nats_url: "".to_string(),
            ..Default::default()
        };
        assert!(settings.validate().is_err());
    }

    #[test]
    fn settings_from_yaml_file_and_env_override() {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("aggregate.yaml");
        std::fs::write(
            &file_path,
            r#"
nats_url: "nats://from-file:4222"
storage_path: "/tmp/agg"
snapshot_threshold: 25
multi_tenant_enabled: false
"#,
        )
        .unwrap();

        // Environment overrides must win over file values.
        std::env::set_var("AGGREGATE_NATS_URL", "nats://from-env:4222");
        let settings = Settings::load_from_file_with_env_overrides(&file_path).unwrap();
        assert_eq!(settings.nats_url, "nats://from-env:4222");
        assert_eq!(settings.storage_path, "/tmp/agg");
        assert_eq!(settings.snapshot_threshold, 25);
        assert!(!settings.multi_tenant_enabled);
        std::env::remove_var("AGGREGATE_NATS_URL");
    }

    #[test]
    fn settings_from_toml_file() {
        // Parser is selected by the file extension (.toml here).
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("aggregate.toml");
        std::fs::write(
            &file_path,
            r#"
nats_url = "nats://from-file:4222"
storage_path = "/tmp/agg"
max_retries = 7
"#,
        )
        .unwrap();

        let settings = Settings::from_file(&file_path).unwrap();
        assert_eq!(settings.nats_url, "nats://from-file:4222");
        assert_eq!(settings.storage_path, "/tmp/agg");
        assert_eq!(settings.max_retries, 7);
    }

    #[test]
    fn settings_is_clone() {
        // Compile/behavior check that Settings derives Clone.
        let s = Settings::default();
        let _s2 = s.clone();
    }
}
|
||||
24
aggregate/src/container.rs
Normal file
24
aggregate/src/container.rs
Normal file
@@ -0,0 +1,24 @@
|
||||
#[cfg(test)]
mod tests {
    // These tests read docker assets relative to the crate directory, so they
    // assume `cargo test` runs with this crate as the working directory.
    #[test]
    fn dockerfile_is_multi_stage_and_builds_selected_binary() {
        let raw = std::fs::read_to_string("../docker/Dockerfile.rust").unwrap();
        // Multi-stage build: a builder stage plus a slim debian runtime stage.
        assert!(raw.contains("AS builder"));
        assert!(raw.contains("FROM debian:"));
        // The image is parameterized over which package/binary to build.
        assert!(raw.contains("ARG PACKAGE"));
        assert!(raw.contains("ARG BIN"));
        assert!(raw.contains("cargo build -p ${PACKAGE} --bin ${BIN} --release"));
        assert!(raw.contains("COPY --from=builder"));
        assert!(raw.contains("ENTRYPOINT"));
        assert!(raw.contains("FROM"));
    }

    #[test]
    fn docker_compose_is_valid_yaml_and_has_services() {
        // Parse the compose file and require the core services to exist.
        let raw = std::fs::read_to_string("../docker-compose.yml").unwrap();
        let doc: serde_yaml::Value = serde_yaml::from_str(&raw).unwrap();
        let services = doc.get("services").and_then(|v| v.as_mapping()).unwrap();
        assert!(services.contains_key(serde_yaml::Value::from("nats")));
        assert!(services.contains_key(serde_yaml::Value::from("aggregate")));
    }
}
|
||||
7
aggregate/src/gateway/mod.rs
Normal file
7
aggregate/src/gateway/mod.rs
Normal file
@@ -0,0 +1,7 @@
|
||||
/// gRPC metadata key clients use to convey the tenant id out-of-band.
pub const TENANT_ID_METADATA_KEY: &str = "x-tenant-id";

/// Generated protobuf/tonic types for the `aggregate.gateway.v1` package.
pub mod proto {
    tonic::include_proto!("aggregate.gateway.v1");
}

pub mod server;
|
||||
306
aggregate/src/gateway/server.rs
Normal file
306
aggregate/src/gateway/server.rs
Normal file
@@ -0,0 +1,306 @@
|
||||
use super::proto::command_service_server::{CommandService, CommandServiceServer};
|
||||
use super::proto::{Event as ProtoEvent, SubmitCommandRequest, SubmitCommandResponse};
|
||||
use crate::aggregate::AggregateHandler;
|
||||
use crate::observability::Observability;
|
||||
use crate::placement::TenantPlacementManager;
|
||||
use crate::types::{AggregateError, AggregateId, AggregateType, Command, TenantId};
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use tonic::{Request, Response, Status};
|
||||
|
||||
/// gRPC front door for submitting commands to the aggregate handler.
#[derive(Clone)]
pub struct GrpcCommandServer {
    handler: AggregateHandler,
    // Tracks which tenants this shard hosts / is draining.
    placement: Arc<TenantPlacementManager>,
    observability: Arc<Observability>,
    // When true, every request must resolve to a tenant id.
    multi_tenant_enabled: bool,
    // Fallback tenant used when a request carries none (multi-tenant mode).
    default_tenant_id: Option<TenantId>,
}
|
||||
|
||||
impl GrpcCommandServer {
    /// Bundles the command handler with its placement/observability
    /// collaborators and tenant-resolution policy.
    pub fn new(
        handler: AggregateHandler,
        placement: Arc<TenantPlacementManager>,
        observability: Arc<Observability>,
        multi_tenant_enabled: bool,
        default_tenant_id: Option<TenantId>,
    ) -> Self {
        Self {
            handler,
            placement,
            observability,
            multi_tenant_enabled,
            default_tenant_id,
        }
    }

    /// Wraps this server in the generated tonic service adapter.
    pub fn service(self) -> CommandServiceServer<Self> {
        CommandServiceServer::new(self)
    }
}
|
||||
|
||||
#[tonic::async_trait]
|
||||
impl CommandService for GrpcCommandServer {
|
||||
async fn submit_command(
|
||||
&self,
|
||||
request: Request<SubmitCommandRequest>,
|
||||
) -> Result<Response<SubmitCommandResponse>, Status> {
|
||||
let correlation_id = request
|
||||
.metadata()
|
||||
.get("x-correlation-id")
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(|s| s.to_string());
|
||||
let traceparent = request
|
||||
.metadata()
|
||||
.get("traceparent")
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(|s| s.to_string());
|
||||
let trace_id = traceparent.as_deref().and_then(trace_id_from_traceparent);
|
||||
|
||||
let metadata_tenant = request
|
||||
.metadata()
|
||||
.get(super::TENANT_ID_METADATA_KEY)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
|
||||
let req = request.into_inner();
|
||||
|
||||
let tenant_id = resolve_tenant_id(
|
||||
&req.tenant_id,
|
||||
&metadata_tenant,
|
||||
self.multi_tenant_enabled,
|
||||
self.default_tenant_id.as_ref(),
|
||||
)
|
||||
.map_err(Status::invalid_argument)?;
|
||||
|
||||
if !tenant_id.as_str().is_empty() && !is_valid_tenant_id(tenant_id.as_str()) {
|
||||
return Err(Status::invalid_argument("invalid tenant_id"));
|
||||
}
|
||||
|
||||
let aggregate_id = AggregateId::from_str(&req.aggregate_id)
|
||||
.map_err(|e| Status::invalid_argument(e.to_string()))?;
|
||||
let aggregate_type = AggregateType::from(req.aggregate_type);
|
||||
|
||||
let payload: serde_json::Value = serde_json::from_str(&req.payload_json)
|
||||
.map_err(|e| Status::invalid_argument(e.to_string()))?;
|
||||
|
||||
let command_id = if req.command_id.is_empty() {
|
||||
uuid::Uuid::now_v7()
|
||||
} else {
|
||||
uuid::Uuid::parse_str(&req.command_id)
|
||||
.map_err(|e| Status::invalid_argument(e.to_string()))?
|
||||
};
|
||||
|
||||
let metadata: HashMap<String, serde_json::Value> = req
|
||||
.metadata
|
||||
.into_iter()
|
||||
.map(|(k, v)| (k, serde_json::Value::String(v)))
|
||||
.collect();
|
||||
|
||||
let mut metadata = metadata;
|
||||
if let Some(correlation_id) = correlation_id.as_deref() {
|
||||
metadata.insert(
|
||||
"correlation_id".to_string(),
|
||||
serde_json::Value::String(correlation_id.to_string()),
|
||||
);
|
||||
}
|
||||
if let Some(traceparent) = traceparent.as_deref() {
|
||||
metadata.insert(
|
||||
"traceparent".to_string(),
|
||||
serde_json::Value::String(traceparent.to_string()),
|
||||
);
|
||||
}
|
||||
|
||||
let command = Command {
|
||||
tenant_id: tenant_id.clone(),
|
||||
command_id,
|
||||
aggregate_id: aggregate_id.clone(),
|
||||
aggregate_type: aggregate_type.clone(),
|
||||
payload,
|
||||
metadata,
|
||||
};
|
||||
|
||||
let span = self.observability.start_command_span(
|
||||
&aggregate_id.to_string(),
|
||||
aggregate_type.as_str(),
|
||||
tenant_id.as_str(),
|
||||
&command_id.to_string(),
|
||||
correlation_id.as_deref(),
|
||||
trace_id.as_deref(),
|
||||
);
|
||||
|
||||
let _guard = self
|
||||
.placement
|
||||
.begin_command(&tenant_id)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
self.observability.record_command_error(&span, &e);
|
||||
map_aggregate_error(e)
|
||||
})?;
|
||||
|
||||
let events = self.handler.handle_command(command).await.map_err(|e| {
|
||||
self.observability.record_command_error(&span, &e);
|
||||
map_aggregate_error(e)
|
||||
})?;
|
||||
|
||||
self.observability
|
||||
.record_command_success(&span, events.len());
|
||||
|
||||
let proto_events = events
|
||||
.into_iter()
|
||||
.map(|e| ProtoEvent {
|
||||
event_id: e.event_id.to_string(),
|
||||
command_id: e.command_id.to_string(),
|
||||
aggregate_id: e.aggregate_id.to_string(),
|
||||
aggregate_type: e.aggregate_type.to_string(),
|
||||
version: e.version.as_u64(),
|
||||
event_type: e.event_type,
|
||||
payload_json: serde_json::to_string(&e.payload)
|
||||
.unwrap_or_else(|_| "{}".to_string()),
|
||||
timestamp_rfc3339: e.timestamp.to_rfc3339(),
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut response = Response::new(SubmitCommandResponse {
|
||||
events: proto_events,
|
||||
});
|
||||
if let Some(correlation_id) = correlation_id.as_deref() {
|
||||
if let Ok(v) = tonic::metadata::MetadataValue::try_from(correlation_id) {
|
||||
response.metadata_mut().insert("x-correlation-id", v);
|
||||
}
|
||||
}
|
||||
if let Some(traceparent) = traceparent.as_deref() {
|
||||
if let Ok(v) = tonic::metadata::MetadataValue::try_from(traceparent) {
|
||||
response.metadata_mut().insert("traceparent", v);
|
||||
}
|
||||
}
|
||||
Ok(response)
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the trace-id portion of a W3C `traceparent` header value.
/// Delegates to the shared helper; presumably returns `None` for malformed
/// input — exact behavior is defined by `shared::trace_id_from_traceparent`.
fn trace_id_from_traceparent(traceparent: &str) -> Option<String> {
    shared::trace_id_from_traceparent(traceparent).map(|s| s.to_string())
}
|
||||
|
||||
fn map_aggregate_error(error: AggregateError) -> Status {
|
||||
match error {
|
||||
AggregateError::TenantNotHosted { .. } => Status::unavailable(error.to_string()),
|
||||
AggregateError::TenantDraining { .. } => Status::unavailable(error.to_string()),
|
||||
AggregateError::TenantAccessDenied { .. } => Status::permission_denied(error.to_string()),
|
||||
AggregateError::ValidationError(_) => Status::invalid_argument(error.to_string()),
|
||||
AggregateError::VersionConflict { .. } => Status::aborted(error.to_string()),
|
||||
AggregateError::NotFound(_) => Status::not_found(error.to_string()),
|
||||
AggregateError::StorageError(_) => Status::internal(error.to_string()),
|
||||
AggregateError::StreamError(_) => Status::unavailable(error.to_string()),
|
||||
AggregateError::RehydrationError(_) => Status::internal(error.to_string()),
|
||||
AggregateError::DecideError(_) => Status::failed_precondition(error.to_string()),
|
||||
AggregateError::ApplyError(_) => Status::failed_precondition(error.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_tenant_id(
|
||||
explicit: &str,
|
||||
metadata: &str,
|
||||
multi_tenant_enabled: bool,
|
||||
default_tenant_id: Option<&TenantId>,
|
||||
) -> Result<TenantId, &'static str> {
|
||||
if !explicit.is_empty() {
|
||||
return Ok(TenantId::new(explicit));
|
||||
}
|
||||
if !metadata.is_empty() {
|
||||
return Ok(TenantId::new(metadata));
|
||||
}
|
||||
|
||||
if multi_tenant_enabled {
|
||||
if let Some(default_tenant_id) = default_tenant_id {
|
||||
return Ok(default_tenant_id.clone());
|
||||
}
|
||||
return Err("tenant_id is required");
|
||||
}
|
||||
|
||||
Ok(TenantId::default())
|
||||
}
|
||||
|
||||
/// A tenant id may contain only alphanumeric characters (Unicode), '-' and
/// '_'. Note the empty string vacuously passes; emptiness is handled
/// separately by resolve_tenant_id.
fn is_valid_tenant_id(id: &str) -> bool {
    !id.chars().any(|c| !c.is_alphanumeric() && c != '-' && c != '_')
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::runtime::{ExecutorConfig, RuntimeExecutor};
    use crate::storage::StorageClient;
    use crate::stream::StreamClient;
    use tempfile::tempdir;
    use tonic::transport::{Channel, Server};

    /// End-to-end gRPC test: a server hosting only "tenant-a" must reject a
    /// command addressed to "tenant-b" with gRPC `Unavailable`.
    #[tokio::test]
    async fn grpc_submit_command_rejects_unhosted_tenant() {
        let obs = Arc::new(Observability::default());
        let placement = Arc::new(TenantPlacementManager::new(obs.clone()));
        placement
            .set_hosted_tenants(vec!["tenant-a".to_string()])
            .await;

        // Real storage on a temp dir, in-memory stream, mock runtime.
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.mdbx");
        let storage = StorageClient::open(path.to_string_lossy().to_string()).unwrap();
        let stream = StreamClient::in_memory();
        let executor = RuntimeExecutor::with_config(ExecutorConfig::default().with_mock_runtime());

        let handler = AggregateHandler::new(
            storage,
            stream,
            executor,
            "decide".to_string(),
            "apply".to_string(),
        );

        let service = GrpcCommandServer::new(handler, placement, obs, true, None).service();
        // Ephemeral port so parallel test runs don't collide.
        let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();

        // The server shuts itself down after 200ms, bounding the test.
        tokio::spawn(async move {
            Server::builder()
                .add_service(service)
                .serve_with_incoming_shutdown(
                    tokio_stream::wrappers::TcpListenerStream::new(listener),
                    async move {
                        tokio::time::sleep(std::time::Duration::from_millis(200)).await;
                    },
                )
                .await
                .unwrap();
        });

        let channel = Channel::from_shared(format!("http://{}", addr))
            .unwrap()
            .connect()
            .await
            .unwrap();
        let mut client =
            super::super::proto::command_service_client::CommandServiceClient::new(channel);

        let resp = client
            .submit_command(SubmitCommandRequest {
                tenant_id: "tenant-b".to_string(),
                command_id: uuid::Uuid::now_v7().to_string(),
                aggregate_id: AggregateId::new_v7().to_string(),
                aggregate_type: "Account".to_string(),
                payload_json: "{}".to_string(),
                metadata: HashMap::new(),
            })
            .await;

        // An unhosted tenant maps to Unavailable (see map_aggregate_error).
        assert!(resp.is_err());
        let status = resp.unwrap_err();
        assert_eq!(status.code(), tonic::Code::Unavailable);
    }
}
|
||||
230
aggregate/src/http_server.rs
Normal file
230
aggregate/src/http_server.rs
Normal file
@@ -0,0 +1,230 @@
|
||||
use crate::server::{AdminResponse, AdminServer};
|
||||
use axum::extract::{Path, State};
|
||||
use axum::http::{HeaderValue, StatusCode};
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::routing::{get, post};
|
||||
use axum::{Json, Router};
|
||||
use serde::Deserialize;
|
||||
use std::future::Future;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Shared axum router state: a handle to the in-process admin server that
/// actually produces health/ready/metrics/tenant data.
#[derive(Clone)]
pub struct HttpState {
    pub admin: Arc<AdminServer>,
}
|
||||
|
||||
/// Builds the public HTTP router. The probe and admin endpoints either proxy
/// to, or read directly from, the shared `AdminServer`.
pub fn router(admin: Arc<AdminServer>) -> Router {
    let state = HttpState { admin };
    Router::new()
        .route("/health", get(health_route))
        .route("/ready", get(ready))
        .route("/metrics", get(metrics))
        .route("/admin/tenants", get(admin_tenants))
        .route("/admin/tenant/:tenant_id/status", get(admin_tenant_status))
        .route("/admin/tenant/:tenant_id/ready", get(admin_tenant_ready))
        .route("/admin/tenant/:tenant_id/drain", post(admin_tenant_drain))
        .route("/admin/drain", post(admin_drain))
        .route("/admin/reload", post(admin_reload))
        .with_state(state)
}
|
||||
|
||||
pub async fn serve(
|
||||
listener: tokio::net::TcpListener,
|
||||
admin: Arc<AdminServer>,
|
||||
shutdown: impl Future<Output = ()> + Send + 'static,
|
||||
) {
|
||||
axum::serve(listener, router(admin))
|
||||
.with_graceful_shutdown(shutdown)
|
||||
.await
|
||||
.expect("http server failed");
|
||||
}
|
||||
|
||||
async fn health_route(State(state): State<HttpState>) -> Response {
|
||||
proxy_json(state.admin.get("/health").await).await
|
||||
}
|
||||
|
||||
async fn ready(State(state): State<HttpState>) -> Response {
|
||||
proxy_json(state.admin.get("/ready").await).await
|
||||
}
|
||||
|
||||
async fn admin_tenants(State(state): State<HttpState>) -> Response {
|
||||
proxy_json(state.admin.get("/admin/tenants").await).await
|
||||
}
|
||||
|
||||
async fn metrics(State(state): State<HttpState>) -> Response {
|
||||
let resp = state.admin.get("/metrics").await;
|
||||
let mut response = (StatusCode::OK, resp.text().await).into_response();
|
||||
response.headers_mut().insert(
|
||||
axum::http::header::CONTENT_TYPE,
|
||||
HeaderValue::from_static("text/plain; version=0.0.4"),
|
||||
);
|
||||
response
|
||||
}
|
||||
|
||||
async fn admin_drain(
|
||||
State(state): State<HttpState>,
|
||||
Json(body): Json<serde_json::Value>,
|
||||
) -> Response {
|
||||
proxy_json(state.admin.post("/admin/drain", body).await).await
|
||||
}
|
||||
|
||||
async fn admin_reload(
|
||||
State(state): State<HttpState>,
|
||||
Json(body): Json<serde_json::Value>,
|
||||
) -> Response {
|
||||
proxy_json(state.admin.post("/admin/reload", body).await).await
|
||||
}
|
||||
|
||||
/// Optional JSON body for the tenant drain endpoint.
#[derive(Debug, Deserialize)]
struct DrainBody {
    // Maximum time to wait for the drain to complete, in milliseconds
    // (defaults to 10s in admin_tenant_drain when omitted).
    timeout_ms: Option<u64>,
}
|
||||
|
||||
async fn admin_tenant_status(
|
||||
State(state): State<HttpState>,
|
||||
Path(tenant_id): Path<String>,
|
||||
) -> Response {
|
||||
let status = state
|
||||
.admin
|
||||
.placement_manager()
|
||||
.tenant_status(&crate::types::TenantId::new(tenant_id))
|
||||
.await;
|
||||
let mut response = (StatusCode::OK, serde_json::to_string(&status).unwrap()).into_response();
|
||||
response.headers_mut().insert(
|
||||
axum::http::header::CONTENT_TYPE,
|
||||
HeaderValue::from_static("application/json"),
|
||||
);
|
||||
response
|
||||
}
|
||||
|
||||
async fn admin_tenant_ready(
|
||||
State(state): State<HttpState>,
|
||||
Path(tenant_id): Path<String>,
|
||||
) -> Response {
|
||||
let tenant_id = crate::types::TenantId::new(tenant_id);
|
||||
let status = state
|
||||
.admin
|
||||
.placement_manager()
|
||||
.tenant_status(&tenant_id)
|
||||
.await;
|
||||
let ready = state.admin.health_checker().is_ready() && status.accepting;
|
||||
let mut response = (StatusCode::OK, serde_json::to_string(&ready).unwrap()).into_response();
|
||||
response.headers_mut().insert(
|
||||
axum::http::header::CONTENT_TYPE,
|
||||
HeaderValue::from_static("application/json"),
|
||||
);
|
||||
response
|
||||
}
|
||||
|
||||
/// Drains one tenant: marks it draining, waits for in-flight work (bounded
/// by the optional `timeout_ms` body field, default 10s), then reports
/// whether the drain completed along with the final tenant status as JSON.
async fn admin_tenant_drain(
    State(state): State<HttpState>,
    Path(tenant_id): Path<String>,
    body: Option<Json<DrainBody>>,
) -> Response {
    let tenant_id = crate::types::TenantId::new(tenant_id);
    // Step 1: stop the tenant from accepting new work.
    state
        .admin
        .placement_manager()
        .drain_tenant(&tenant_id)
        .await;

    // A missing body or missing field falls back to a 10-second timeout.
    let timeout = body
        .and_then(|b| b.timeout_ms)
        .map(std::time::Duration::from_millis)
        .unwrap_or(std::time::Duration::from_secs(10));

    // Step 2: wait for in-flight work to finish (false on timeout).
    let drained = state
        .admin
        .placement_manager()
        .wait_drained_with_timeout(&tenant_id, timeout)
        .await;

    // Step 3: report the outcome plus the tenant's final status.
    let status = state
        .admin
        .placement_manager()
        .tenant_status(&tenant_id)
        .await;
    let resp = serde_json::json!({ "drained": drained, "status": status });
    let mut response = (StatusCode::OK, serde_json::to_string(&resp).unwrap()).into_response();
    response.headers_mut().insert(
        axum::http::header::CONTENT_TYPE,
        HeaderValue::from_static("application/json"),
    );
    response
}
|
||||
|
||||
/// Wraps an admin-server response body as an `application/json` response.
/// NOTE(review): the upstream status is discarded — callers of this helper
/// always receive 200 OK; confirm `AdminResponse` can never represent a
/// failure that should be surfaced to HTTP clients.
async fn proxy_json(resp: AdminResponse) -> Response {
    let mut response = (StatusCode::OK, resp.text().await).into_response();
    response.headers_mut().insert(
        axum::http::header::CONTENT_TYPE,
        HeaderValue::from_static("application/json"),
    );
    response
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::observability::Observability;
    use crate::server::HealthChecker;
    use tokio::io::{AsyncReadExt, AsyncWriteExt};

    /// Minimal raw-socket HTTP/1.1 GET helper; avoids pulling an HTTP client
    /// dependency into the test suite.
    async fn http_get(addr: std::net::SocketAddr, path: &str) -> String {
        let mut stream = tokio::net::TcpStream::connect(addr).await.unwrap();
        let req = format!(
            "GET {} HTTP/1.1\r\nHost: localhost\r\nConnection: close\r\n\r\n",
            path
        );
        stream.write_all(req.as_bytes()).await.unwrap();
        let mut buf = Vec::new();
        stream.read_to_end(&mut buf).await.unwrap();
        String::from_utf8_lossy(&buf).to_string()
    }

    /// Spins up the real HTTP server on an ephemeral port and checks that the
    /// health, ready, metrics, and tenant admin endpoints all answer 200.
    #[tokio::test]
    async fn http_server_exposes_health_ready_metrics() {
        let health = HealthChecker::new();
        health.set_storage_healthy(true);
        health.set_stream_healthy(true);

        let admin = Arc::new(AdminServer::new(
            Observability::default(),
            health,
            "test-shard".to_string(),
        ));
        admin
            .placement_manager()
            .set_hosted_tenants(vec!["test-tenant".to_string()])
            .await;
        let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();

        // Graceful shutdown is driven by the oneshot channel below.
        let (tx, rx) = tokio::sync::oneshot::channel::<()>();
        let handle = tokio::spawn(async move {
            serve(listener, admin, async move {
                let _ = rx.await;
            })
            .await;
        });

        let health_resp = http_get(addr, "/health").await;
        assert!(health_resp.starts_with("HTTP/1.1 200"));

        let ready_resp = http_get(addr, "/ready").await;
        assert!(ready_resp.starts_with("HTTP/1.1 200"));

        let metrics_resp = http_get(addr, "/metrics").await;
        assert!(metrics_resp.starts_with("HTTP/1.1 200"));
        assert!(metrics_resp.contains("aggregate_commands_total"));

        let status_resp = http_get(addr, "/admin/tenant/test-tenant/status").await;
        assert!(status_resp.starts_with("HTTP/1.1 200"));
        assert!(status_resp.contains("test-tenant"));

        let ready_resp = http_get(addr, "/admin/tenant/test-tenant/ready").await;
        assert!(ready_resp.starts_with("HTTP/1.1 200"));

        // Signal shutdown and wait for the server task to exit cleanly.
        let _ = tx.send(());
        handle.await.unwrap();
    }
}
|
||||
26
aggregate/src/lib.rs
Normal file
26
aggregate/src/lib.rs
Normal file
@@ -0,0 +1,26 @@
|
||||
// Crate modules.
pub mod aggregate;
pub mod config;
pub mod container;
pub mod gateway;
pub mod http_server;
pub mod observability;
pub mod placement;
pub mod query;
pub mod runtime;
pub mod server;
pub mod storage;
pub mod stream;
pub mod swarm;
pub mod types;

// Flat re-exports of the crate's primary public API.
pub use aggregate::{AggregateHandler, AggregateInstance};
pub use config::Settings;
pub use query::{
    AggregateProjection, QueryClient, QueryConfig, QueryError, QueryRequest, QueryResponse,
    QueryServer, StateProjection,
};
pub use runtime::{ExecutorConfig, RuntimeExecutor};
pub use server::{CommandRequest, CommandResponse, CommandServer, HealthChecker, HealthStatus};
pub use storage::StorageClient;
pub use stream::StreamClient;
pub use types::*;
|
||||
213
aggregate/src/main.rs
Normal file
213
aggregate/src/main.rs
Normal file
@@ -0,0 +1,213 @@
|
||||
use aggregate::config::Settings;
|
||||
use aggregate::gateway::server::GrpcCommandServer;
|
||||
use aggregate::http_server;
|
||||
use aggregate::observability::Observability;
|
||||
use aggregate::runtime::RuntimeExecutor;
|
||||
use aggregate::server::AdminServer;
|
||||
use aggregate::storage::StorageClient;
|
||||
use aggregate::stream::StreamClient;
|
||||
use aggregate::swarm::TenantPlacementKvClient;
|
||||
use aggregate::{aggregate::AggregateHandler, placement::TenantPlacementManager};
|
||||
use futures::StreamExt;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
match std::env::args().nth(1).as_deref() {
|
||||
Some("-h") | Some("--help") => {
|
||||
print_help();
|
||||
return;
|
||||
}
|
||||
Some("serve") | None => serve().await,
|
||||
Some(other) => {
|
||||
eprintln!("Unknown command: {}", other);
|
||||
print_help();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn serve() {
|
||||
let settings = load_settings();
|
||||
|
||||
let observability = Observability::default();
|
||||
let health_checker = aggregate::server::HealthChecker::new();
|
||||
let admin = Arc::new(AdminServer::new(
|
||||
observability,
|
||||
health_checker,
|
||||
settings.shard_id.clone(),
|
||||
));
|
||||
|
||||
spawn_health_probe(admin.clone(), settings.clone());
|
||||
spawn_placement_watcher(admin.placement_manager(), settings.clone());
|
||||
|
||||
let storage = StorageClient::open(settings.storage_path.clone()).unwrap();
|
||||
let stream = StreamClient::new(settings.nats_url.clone()).await.unwrap();
|
||||
let _ = stream.setup_stream().await;
|
||||
let executor = RuntimeExecutor::new();
|
||||
|
||||
let handler = AggregateHandler::new(
|
||||
storage,
|
||||
stream,
|
||||
executor,
|
||||
settings.decide_program.clone(),
|
||||
settings.apply_program.clone(),
|
||||
)
|
||||
.with_snapshot_threshold(settings.snapshot_threshold)
|
||||
.with_max_retries(settings.max_retries);
|
||||
|
||||
let grpc_addr: std::net::SocketAddr = settings.grpc_addr.parse().unwrap();
|
||||
let grpc_service = GrpcCommandServer::new(
|
||||
handler,
|
||||
admin.placement_manager(),
|
||||
admin.observability(),
|
||||
settings.multi_tenant_enabled,
|
||||
settings
|
||||
.default_tenant_id
|
||||
.as_ref()
|
||||
.map(aggregate::types::TenantId::new),
|
||||
)
|
||||
.service();
|
||||
|
||||
let addr = std::env::var("AGGREGATE_HTTP_ADDR").unwrap_or_else(|_| "0.0.0.0:8080".to_string());
|
||||
let listener = tokio::net::TcpListener::bind(&addr).await.unwrap();
|
||||
|
||||
let (shutdown_tx, _) = tokio::sync::broadcast::channel::<()>(1);
|
||||
let mut http_shutdown = shutdown_tx.subscribe();
|
||||
let mut grpc_shutdown = shutdown_tx.subscribe();
|
||||
|
||||
let http_task = tokio::spawn(async move {
|
||||
http_server::serve(listener, admin, async move {
|
||||
let _ = http_shutdown.recv().await;
|
||||
})
|
||||
.await;
|
||||
});
|
||||
|
||||
let grpc_task = tokio::spawn(async move {
|
||||
tonic::transport::Server::builder()
|
||||
.add_service(grpc_service)
|
||||
.serve_with_shutdown(grpc_addr, async move {
|
||||
let _ = grpc_shutdown.recv().await;
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
let _ = tokio::signal::ctrl_c().await;
|
||||
let _ = shutdown_tx.send(());
|
||||
|
||||
let _ = tokio::join!(http_task, grpc_task);
|
||||
}
|
||||
|
||||
/// Prints CLI usage to stdout.
fn print_help() {
    println!(
        "aggregate\n\nUSAGE:\n    aggregate [COMMAND]\n\nCOMMANDS:\n    serve    Start the HTTP server (default)\n\nOPTIONS:\n    -h, --help    Print help\n"
    );
}
|
||||
|
||||
fn load_settings() -> Settings {
|
||||
if let Ok(path) = std::env::var("AGGREGATE_CONFIG_PATH") {
|
||||
if let Ok(settings) = Settings::load_from_file_with_env_overrides(path) {
|
||||
return settings;
|
||||
}
|
||||
}
|
||||
|
||||
Settings::from_env().unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Spawns a background loop that re-probes storage and stream health every
/// 5 seconds and publishes the results to the admin health checker.
/// NOTE(review): each cycle re-opens the storage database and a fresh NATS
/// connection purely as a probe — confirm that is cheap and safe for the
/// storage backend rather than reusing the live handles.
fn spawn_health_probe(admin: Arc<AdminServer>, settings: Settings) {
    tokio::spawn(async move {
        loop {
            // Storage probe: can the database be opened at all?
            let storage_ok = StorageClient::open(settings.storage_path.clone()).is_ok();
            admin.health_checker().set_storage_healthy(storage_ok);

            // Stream probe: connect + stream setup, bounded to one second.
            let stream_ok = tokio::time::timeout(Duration::from_secs(1), async {
                let stream = StreamClient::new(settings.nats_url.clone()).await?;
                let _ = stream.setup_stream().await;
                Ok::<_, aggregate::types::AggregateError>(())
            })
            .await
            .is_ok_and(|r| r.is_ok());

            admin.health_checker().set_stream_healthy(stream_ok);

            tokio::time::sleep(Duration::from_secs(5)).await;
        }
    });
}
|
||||
|
||||
/// Spawns a background loop that mirrors tenant placement from the NATS KV
/// bucket into the local placement manager. On any connect or watch failure
/// the loop backs off one second and reconnects from scratch.
fn spawn_placement_watcher(placement: Arc<TenantPlacementManager>, settings: Settings) {
    tokio::spawn(async move {
        loop {
            let client = TenantPlacementKvClient::connect(
                settings.nats_url.clone(),
                settings.placement_bucket.clone(),
            )
            .await;

            let client = match client {
                Ok(c) => c,
                Err(_) => {
                    tokio::time::sleep(Duration::from_secs(1)).await;
                    continue;
                }
            };

            // Seed from the current value before subscribing to updates.
            if let Ok(Some(value)) = client.get_json(&settings.placement_key).await {
                apply_placement_value(&placement, &settings.shard_id, value).await;
            }

            let watch = client.watch_json(&settings.placement_key).await;
            let mut stream = match watch {
                Ok(s) => s,
                Err(_) => {
                    tokio::time::sleep(Duration::from_secs(1)).await;
                    continue;
                }
            };

            // Apply each subsequent update; malformed updates are skipped.
            while let Some(update) = stream.next().await {
                if let Ok(value) = update {
                    apply_placement_value(&placement, &settings.shard_id, value).await;
                }
            }

            // The watch stream ended; pause briefly before reconnecting.
            tokio::time::sleep(Duration::from_secs(1)).await;
        }
    });
}
|
||||
|
||||
async fn apply_placement_value(
|
||||
placement: &TenantPlacementManager,
|
||||
shard_id: &str,
|
||||
value: serde_json::Value,
|
||||
) {
|
||||
if let Some(map) = value.as_object() {
|
||||
let placement_map = map
|
||||
.iter()
|
||||
.filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_string())))
|
||||
.collect::<std::collections::HashMap<_, _>>();
|
||||
placement
|
||||
.apply_placement_map(shard_id, &placement_map)
|
||||
.await;
|
||||
return;
|
||||
}
|
||||
|
||||
if let Some(map) = value.get("placement").and_then(|v| v.as_object()) {
|
||||
let placement_map = map
|
||||
.iter()
|
||||
.filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_string())))
|
||||
.collect::<std::collections::HashMap<_, _>>();
|
||||
placement
|
||||
.apply_placement_map(shard_id, &placement_map)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    /// Smoke test: the test binary can resolve its own executable path.
    #[test]
    fn binary_exists() {
        assert!(std::env::current_exe().is_ok());
    }
}
|
||||
365
aggregate/src/observability/metrics.rs
Normal file
365
aggregate/src/observability/metrics.rs
Normal file
@@ -0,0 +1,365 @@
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::RwLock;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Abstraction over a metrics backend: labeled counters and histograms plus
/// a Prometheus text-format export.
pub trait MetricsRegistry: Send + Sync {
    fn increment_counter(&self, name: &str, labels: &[(&str, &str)]);
    fn record_histogram(&self, name: &str, value: f64, labels: &[(&str, &str)]);
    fn export_prometheus(&self) -> String;
}
|
||||
|
||||
/// Lock-free histogram with fixed bucket upper bounds (in seconds).
///
/// `sum` accumulates integer microseconds so it fits in an `AtomicU64`;
/// `export` converts back to seconds for the Prometheus output.
#[derive(Debug)]
struct AtomicHistogram {
    count: AtomicU64,
    sum: AtomicU64,
    buckets: Vec<(f64, AtomicU64)>,
}

impl AtomicHistogram {
    fn new() -> Self {
        // Bucket upper bounds in seconds: 1ms .. 10s.
        let buckets: Vec<(f64, AtomicU64)> = vec![
            0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
        ]
        .into_iter()
        .map(|v| (v, AtomicU64::new(0)))
        .collect();

        Self {
            count: AtomicU64::new(0),
            sum: AtomicU64::new(0),
            buckets,
        }
    }

    /// Records one observation.
    ///
    /// BUG FIX: the previous implementation compared a *millisecond* value
    /// against the second-scale bucket bounds above, so e.g. a 50ms
    /// observation (value 50.0) fell past every bucket except +Inf, and the
    /// exported `_sum` was in milliseconds despite the `_seconds` metric
    /// names. Observations are now recorded in seconds throughout.
    fn observe(&self, duration: Duration) {
        let value_s = duration.as_secs_f64();
        self.count.fetch_add(1, Ordering::Relaxed);
        // Accumulate the sum as integer microseconds; export() divides out.
        self.sum
            .fetch_add((value_s * 1_000_000.0) as u64, Ordering::Relaxed);

        // Cumulative buckets: every bucket with bound >= value is bumped.
        for (threshold, count) in &self.buckets {
            if value_s <= *threshold {
                count.fetch_add(1, Ordering::Relaxed);
            }
        }
    }

    /// Renders this histogram in Prometheus text format under `name`,
    /// appending `labels` (a pre-formatted `,k="v"` suffix) to each series.
    fn export(&self, name: &str, labels: &str) -> String {
        let mut output = String::new();
        let count = self.count.load(Ordering::Relaxed);
        // Convert the microsecond accumulator back to seconds.
        let sum = self.sum.load(Ordering::Relaxed) as f64 / 1_000_000.0;

        let label_str = if labels.is_empty() {
            String::new()
        } else {
            format!("{{{}}}", labels.trim_start_matches(','))
        };

        output.push_str(&format!("{}_sum{} {}\n", name, label_str, sum));
        output.push_str(&format!("{}_count{} {}\n", name, label_str, count));

        for (threshold, bucket_count) in &self.buckets {
            let c = bucket_count.load(Ordering::Relaxed);
            let bucket_labels = if labels.is_empty() {
                format!("le=\"{}\"", threshold)
            } else {
                format!("le=\"{}\"{}", threshold, labels)
            };
            output.push_str(&format!("{}_bucket{{{}}} {}\n", name, bucket_labels, c));
        }
        // The +Inf bucket always equals the total count.
        let inf_labels = if labels.is_empty() {
            "le=\"+Inf\"".to_string()
        } else {
            format!("le=\"+Inf\"{}", labels)
        };
        output.push_str(&format!("{}_bucket{{{}}} {}\n", name, inf_labels, count));

        output
    }
}

impl Default for AtomicHistogram {
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
/// In-process metric store. Label sets are encoded as composite
/// colon-separated map keys and decoded again in `export_prometheus`.
#[derive(Debug)]
pub struct Metrics {
    // key: "{aggregate_type}:{tenant_id}"
    commands_total: RwLock<HashMap<String, AtomicU64>>,
    // key: "{aggregate_type}:{tenant_id}:{error_kind}"
    command_errors_total: RwLock<HashMap<String, AtomicU64>>,
    // key: aggregate_type
    command_duration: RwLock<HashMap<String, AtomicHistogram>>,
    version_conflicts: AtomicU64,
    tenant_errors: AtomicU64,
    // key: aggregate_type
    rehydration_duration: RwLock<HashMap<String, AtomicHistogram>>,
    // gauge per tenant_id
    in_flight: RwLock<HashMap<String, AtomicU64>>,
}
|
||||
|
||||
impl Metrics {
    /// Creates an empty metric store.
    pub fn new() -> Self {
        Self {
            commands_total: RwLock::new(HashMap::new()),
            command_errors_total: RwLock::new(HashMap::new()),
            command_duration: RwLock::new(HashMap::new()),
            version_conflicts: AtomicU64::new(0),
            tenant_errors: AtomicU64::new(0),
            rehydration_duration: RwLock::new(HashMap::new()),
            in_flight: RwLock::new(HashMap::new()),
        }
    }

    /// Counts one processed command, labeled by aggregate type and tenant.
    /// Fast path: read lock + atomic increment; a write lock is taken only
    /// the first time a (type, tenant) pair is seen. The read->write upgrade
    /// race is benign because `entry` makes the insert idempotent.
    pub fn increment_commands_total(&self, aggregate_type: &str, tenant_id: &str) {
        let key = format!("{}:{}", aggregate_type, tenant_id);
        let map = self.commands_total.read().unwrap();
        if let Some(counter) = map.get(&key) {
            counter.fetch_add(1, Ordering::Relaxed);
            return;
        }
        drop(map);
        let mut map = self.commands_total.write().unwrap();
        let counter = map.entry(key).or_insert_with(|| AtomicU64::new(0));
        counter.fetch_add(1, Ordering::Relaxed);
    }

    /// Counts one failed command, additionally labeled by error kind.
    /// NOTE(review): keys are "type:tenant:kind"; a ':' inside any label
    /// value would garble the split in export_prometheus — confirm label
    /// values can never contain ':'.
    pub fn increment_command_errors_total(
        &self,
        aggregate_type: &str,
        tenant_id: &str,
        error_kind: &str,
    ) {
        let key = format!("{}:{}:{}", aggregate_type, tenant_id, error_kind);
        let map = self.command_errors_total.read().unwrap();
        if let Some(counter) = map.get(&key) {
            counter.fetch_add(1, Ordering::Relaxed);
            return;
        }
        drop(map);
        let mut map = self.command_errors_total.write().unwrap();
        let counter = map.entry(key).or_insert_with(|| AtomicU64::new(0));
        counter.fetch_add(1, Ordering::Relaxed);
    }

    /// Records a command-processing duration for `aggregate_type`.
    pub fn record_command_duration(&self, duration: Duration, aggregate_type: &str) {
        let mut map = self.command_duration.write().unwrap();
        let histogram = map.entry(aggregate_type.to_string()).or_default();
        histogram.observe(duration);
    }

    /// Counts one optimistic-concurrency version conflict.
    pub fn increment_version_conflicts(&self) {
        self.version_conflicts.fetch_add(1, Ordering::Relaxed);
    }

    /// Counts one tenant access error.
    pub fn increment_tenant_errors(&self) {
        self.tenant_errors.fetch_add(1, Ordering::Relaxed);
    }

    /// Records an aggregate rehydration duration for `aggregate_type`.
    pub fn record_rehydration_duration(&self, duration: Duration, aggregate_type: &str) {
        let mut map = self.rehydration_duration.write().unwrap();
        let histogram = map.entry(aggregate_type.to_string()).or_default();
        histogram.observe(duration);
    }

    /// Sets the in-flight command gauge for a tenant to an absolute value.
    pub fn set_in_flight(&self, tenant_id: &str, value: u64) {
        let map = self.in_flight.read().unwrap();
        if let Some(gauge) = map.get(tenant_id) {
            gauge.store(value, Ordering::Relaxed);
            return;
        }
        drop(map);
        let mut map = self.in_flight.write().unwrap();
        let gauge = map
            .entry(tenant_id.to_string())
            .or_insert_with(|| AtomicU64::new(0));
        gauge.store(value, Ordering::Relaxed);
    }

    /// Renders all metrics in Prometheus text exposition format. Composite
    /// keys that do not split into the expected number of ':' parts are
    /// silently skipped.
    pub fn export_prometheus(&self) -> String {
        let mut output = String::new();

        output.push_str("# HELP aggregate_commands_total Total number of commands processed\n");
        output.push_str("# TYPE aggregate_commands_total counter\n");
        {
            let map = self.commands_total.read().unwrap();
            for (key, counter) in map.iter() {
                // key format: "{aggregate_type}:{tenant_id}"
                let parts: Vec<&str> = key.split(':').collect();
                if parts.len() == 2 {
                    let value = counter.load(Ordering::Relaxed);
                    output.push_str(&format!(
                        "aggregate_commands_total{{aggregate_type=\"{}\",tenant_id=\"{}\"}} {}\n",
                        parts[0], parts[1], value
                    ));
                }
            }
        }

        output.push_str("\n# HELP aggregate_command_errors_total Total number of command errors\n");
        output.push_str("# TYPE aggregate_command_errors_total counter\n");
        {
            let map = self.command_errors_total.read().unwrap();
            for (key, counter) in map.iter() {
                // key format: "{aggregate_type}:{tenant_id}:{error_kind}"
                let parts: Vec<&str> = key.split(':').collect();
                if parts.len() == 3 {
                    let value = counter.load(Ordering::Relaxed);
                    output.push_str(&format!(
                        "aggregate_command_errors_total{{aggregate_type=\"{}\",tenant_id=\"{}\",error_kind=\"{}\"}} {}\n",
                        parts[0], parts[1], parts[2], value
                    ));
                }
            }
        }

        output
            .push_str("\n# HELP aggregate_command_duration_seconds Command processing duration\n");
        output.push_str("# TYPE aggregate_command_duration_seconds histogram\n");
        {
            let map = self.command_duration.read().unwrap();
            for (aggregate_type, histogram) in map.iter() {
                let labels = format!(",aggregate_type=\"{}\"", aggregate_type);
                output.push_str(&histogram.export("aggregate_command_duration_seconds", &labels));
            }
        }

        output
            .push_str("\n# HELP aggregate_version_conflicts_total Total version conflict errors\n");
        output.push_str("# TYPE aggregate_version_conflicts_total counter\n");
        output.push_str(&format!(
            "aggregate_version_conflicts_total {}\n",
            self.version_conflicts.load(Ordering::Relaxed)
        ));

        output
            .push_str("\n# HELP aggregate_tenant_errors_total Total tenant access denied errors\n");
        output.push_str("# TYPE aggregate_tenant_errors_total counter\n");
        output.push_str(&format!(
            "aggregate_tenant_errors_total {}\n",
            self.tenant_errors.load(Ordering::Relaxed)
        ));

        output.push_str(
            "\n# HELP aggregate_rehydration_duration_seconds Aggregate rehydration duration\n",
        );
        output.push_str("# TYPE aggregate_rehydration_duration_seconds histogram\n");
        {
            let map = self.rehydration_duration.read().unwrap();
            for (aggregate_type, histogram) in map.iter() {
                let labels = format!(",aggregate_type=\"{}\"", aggregate_type);
                output
                    .push_str(&histogram.export("aggregate_rehydration_duration_seconds", &labels));
            }
        }

        output.push_str(
            "\n# HELP aggregate_in_flight_commands Number of in-flight commands by tenant\n",
        );
        output.push_str("# TYPE aggregate_in_flight_commands gauge\n");
        {
            let map = self.in_flight.read().unwrap();
            for (tenant_id, gauge) in map.iter() {
                let value = gauge.load(Ordering::Relaxed);
                output.push_str(&format!(
                    "aggregate_in_flight_commands{{tenant_id=\"{}\"}} {}\n",
                    tenant_id, value
                ));
            }
        }

        output
    }
}
|
||||
|
||||
// Default is just an empty store, same as Metrics::new().
impl Default for Metrics {
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Counters aggregate per (type, tenant) label pair.
    #[test]
    fn metrics_increment_commands() {
        let metrics = Metrics::new();
        metrics.increment_commands_total("Account", "tenant-a");
        metrics.increment_commands_total("Account", "tenant-a");
        metrics.increment_commands_total("Account", "tenant-b");

        let output = metrics.export_prometheus();
        assert!(output.contains(
            "aggregate_commands_total{aggregate_type=\"Account\",tenant_id=\"tenant-a\"} 2"
        ));
        assert!(output.contains(
            "aggregate_commands_total{aggregate_type=\"Account\",tenant_id=\"tenant-b\"} 1"
        ));
    }

    #[test]
    fn metrics_records_version_conflicts() {
        let metrics = Metrics::new();
        metrics.increment_version_conflicts();
        metrics.increment_version_conflicts();

        let output = metrics.export_prometheus();
        assert!(output.contains("aggregate_version_conflicts_total 2"));
    }

    #[test]
    fn metrics_records_tenant_errors() {
        let metrics = Metrics::new();
        metrics.increment_tenant_errors();

        let output = metrics.export_prometheus();
        assert!(output.contains("aggregate_tenant_errors_total 1"));
    }

    /// Error counters carry the extra error_kind label.
    #[test]
    fn metrics_records_command_errors_with_labels() {
        let metrics = Metrics::new();
        metrics.increment_command_errors_total("Account", "tenant-a", "tenant_not_hosted");
        metrics.increment_command_errors_total("Account", "tenant-a", "tenant_not_hosted");

        let output = metrics.export_prometheus();
        assert!(output.contains("aggregate_command_errors_total{aggregate_type=\"Account\",tenant_id=\"tenant-a\",error_kind=\"tenant_not_hosted\"} 2"));
    }

    /// Only the histogram count is asserted here; bucket placement is
    /// covered by the AtomicHistogram unit behavior.
    #[test]
    fn metrics_records_command_duration() {
        let metrics = Metrics::new();
        metrics.record_command_duration(Duration::from_millis(50), "Account");

        let output = metrics.export_prometheus();
        assert!(output.contains("aggregate_command_duration_seconds"));
        assert!(output
            .contains("aggregate_command_duration_seconds_count{aggregate_type=\"Account\"} 1"));
    }

    #[test]
    fn metrics_records_rehydration_duration() {
        let metrics = Metrics::new();
        metrics.record_rehydration_duration(Duration::from_millis(100), "Account");

        let output = metrics.export_prometheus();
        assert!(output.contains("aggregate_rehydration_duration_seconds"));
        assert!(output.contains(
            "aggregate_rehydration_duration_seconds_count{aggregate_type=\"Account\"} 1"
        ));
    }

    /// The export carries the standard Prometheus HELP/TYPE headers.
    #[test]
    fn metrics_export_prometheus_format() {
        let metrics = Metrics::new();
        metrics.increment_commands_total("Account", "tenant-a");

        let output = metrics.export_prometheus();
        assert!(output.contains("# HELP aggregate_commands_total"));
        assert!(output.contains("# TYPE aggregate_commands_total counter"));
    }

    /// Compile-time guarantee that Metrics can be shared across threads.
    #[test]
    fn metrics_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<Metrics>();
    }
}
|
||||
323
aggregate/src/observability/mod.rs
Normal file
323
aggregate/src/observability/mod.rs
Normal file
@@ -0,0 +1,323 @@
|
||||
mod metrics;
|
||||
|
||||
pub use metrics::{Metrics, MetricsRegistry};
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
/// Static configuration for the observability layer.
#[derive(Debug, Clone)]
pub struct ObservabilityConfig {
    /// Service name attached to telemetry.
    pub service_name: String,
    /// Deployment environment label (e.g. "development", "production").
    pub environment: String,
    /// Whether metric collection is enabled.
    /// NOTE(review): not read anywhere in this module — confirm consumers.
    pub enable_metrics: bool,
}

impl Default for ObservabilityConfig {
    /// Defaults: service "aggregate", environment "development", metrics on.
    fn default() -> Self {
        Self {
            service_name: "aggregate".to_string(),
            environment: "development".to_string(),
            enable_metrics: true,
        }
    }
}

/// Consuming builder-style setters.
impl ObservabilityConfig {
    /// Overrides the reported service name.
    pub fn with_service_name(mut self, name: impl Into<String>) -> Self {
        self.service_name = name.into();
        self
    }

    /// Overrides the environment label.
    pub fn with_environment(mut self, env: impl Into<String>) -> Self {
        self.environment = env.into();
        self
    }

    /// Turns metric collection off.
    pub fn without_metrics(mut self) -> Self {
        self.enable_metrics = false;
        self
    }
}
|
||||
|
||||
/// Per-command context captured at dispatch time: identity fields for
/// structured logging plus a start timestamp for duration metrics.
#[derive(Debug, Clone)]
pub struct CommandSpan {
    aggregate_id: String,
    aggregate_type: String,
    tenant_id: String,
    command_id: String,
    correlation_id: Option<String>,
    trace_id: Option<String>,
    // Stamped in `new`; basis for `elapsed`.
    start_time: Instant,
}

impl CommandSpan {
    /// Captures the identity fields and stamps `start_time` with "now".
    pub fn new(
        aggregate_id: impl Into<String>,
        aggregate_type: impl Into<String>,
        tenant_id: impl Into<String>,
        command_id: impl Into<String>,
        correlation_id: Option<String>,
        trace_id: Option<String>,
    ) -> Self {
        Self {
            aggregate_id: aggregate_id.into(),
            aggregate_type: aggregate_type.into(),
            tenant_id: tenant_id.into(),
            command_id: command_id.into(),
            correlation_id,
            trace_id,
            start_time: Instant::now(),
        }
    }

    /// Time elapsed since the span was created.
    pub fn elapsed(&self) -> std::time::Duration {
        self.start_time.elapsed()
    }

    pub fn aggregate_id(&self) -> &str {
        &self.aggregate_id
    }

    pub fn aggregate_type(&self) -> &str {
        &self.aggregate_type
    }

    pub fn tenant_id(&self) -> &str {
        &self.tenant_id
    }

    pub fn command_id(&self) -> &str {
        &self.command_id
    }

    /// Caller-supplied correlation id, if any.
    pub fn correlation_id(&self) -> Option<&str> {
        self.correlation_id.as_deref()
    }

    /// Caller-supplied trace id, if any.
    pub fn trace_id(&self) -> Option<&str> {
        self.trace_id.as_deref()
    }
}
|
||||
|
||||
pub struct Observability {
|
||||
config: ObservabilityConfig,
|
||||
metrics: Arc<Metrics>,
|
||||
}
|
||||
|
||||
impl Observability {
|
||||
pub fn new(config: ObservabilityConfig) -> Self {
|
||||
let metrics = Arc::new(Metrics::new());
|
||||
Self { config, metrics }
|
||||
}
|
||||
|
||||
pub fn config(&self) -> &ObservabilityConfig {
|
||||
&self.config
|
||||
}
|
||||
|
||||
pub fn metrics(&self) -> &Arc<Metrics> {
|
||||
&self.metrics
|
||||
}
|
||||
|
||||
pub fn start_command_span(
|
||||
&self,
|
||||
aggregate_id: &str,
|
||||
aggregate_type: &str,
|
||||
tenant_id: &str,
|
||||
command_id: &str,
|
||||
correlation_id: Option<&str>,
|
||||
trace_id: Option<&str>,
|
||||
) -> CommandSpan {
|
||||
tracing::info_span!(
|
||||
"command",
|
||||
aggregate_id = %aggregate_id,
|
||||
aggregate_type = %aggregate_type,
|
||||
tenant_id = %tenant_id,
|
||||
command_id = %command_id,
|
||||
correlation_id = correlation_id.unwrap_or(""),
|
||||
trace_id = trace_id.unwrap_or(""),
|
||||
);
|
||||
|
||||
CommandSpan::new(
|
||||
aggregate_id,
|
||||
aggregate_type,
|
||||
tenant_id,
|
||||
command_id,
|
||||
correlation_id.map(|s| s.to_string()),
|
||||
trace_id.map(|s| s.to_string()),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn record_command_success(&self, span: &CommandSpan, events_count: usize) {
|
||||
self.metrics
|
||||
.increment_commands_total(&span.aggregate_type, &span.tenant_id);
|
||||
self.metrics
|
||||
.record_command_duration(span.elapsed(), &span.aggregate_type);
|
||||
|
||||
tracing::info!(
|
||||
aggregate_id = %span.aggregate_id(),
|
||||
aggregate_type = %span.aggregate_type(),
|
||||
tenant_id = %span.tenant_id(),
|
||||
command_id = %span.command_id(),
|
||||
correlation_id = span.correlation_id().unwrap_or(""),
|
||||
trace_id = span.trace_id().unwrap_or(""),
|
||||
events_count = events_count,
|
||||
duration_ms = span.elapsed().as_millis() as u64,
|
||||
"Command handled successfully"
|
||||
);
|
||||
}
|
||||
|
||||
pub fn record_command_error(&self, span: &CommandSpan, error: &crate::types::AggregateError) {
|
||||
self.metrics
|
||||
.increment_commands_total(&span.aggregate_type, &span.tenant_id);
|
||||
self.metrics
|
||||
.record_command_duration(span.elapsed(), &span.aggregate_type);
|
||||
|
||||
self.metrics.increment_command_errors_total(
|
||||
&span.aggregate_type,
|
||||
&span.tenant_id,
|
||||
error_kind(error),
|
||||
);
|
||||
|
||||
if matches!(
|
||||
error,
|
||||
crate::types::AggregateError::TenantAccessDenied { .. }
|
||||
| crate::types::AggregateError::TenantNotHosted { .. }
|
||||
| crate::types::AggregateError::TenantDraining { .. }
|
||||
) {
|
||||
self.metrics.increment_tenant_errors();
|
||||
}
|
||||
|
||||
if matches!(error, crate::types::AggregateError::VersionConflict { .. }) {
|
||||
self.metrics.increment_version_conflicts();
|
||||
}
|
||||
|
||||
tracing::error!(
|
||||
aggregate_id = %span.aggregate_id(),
|
||||
aggregate_type = %span.aggregate_type(),
|
||||
tenant_id = %span.tenant_id(),
|
||||
command_id = %span.command_id(),
|
||||
correlation_id = span.correlation_id().unwrap_or(""),
|
||||
trace_id = span.trace_id().unwrap_or(""),
|
||||
error = %error,
|
||||
duration_ms = span.elapsed().as_millis() as u64,
|
||||
"Command handling failed"
|
||||
);
|
||||
}
|
||||
|
||||
pub fn record_rehydration(&self, duration: std::time::Duration, aggregate_type: &str) {
|
||||
self.metrics
|
||||
.record_rehydration_duration(duration, aggregate_type);
|
||||
|
||||
tracing::debug!(
|
||||
aggregate_type = %aggregate_type,
|
||||
duration_ms = duration.as_millis() as u64,
|
||||
"Aggregate rehydrated"
|
||||
);
|
||||
}
|
||||
|
||||
pub fn export_metrics(&self) -> String {
|
||||
self.metrics.export_prometheus()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Observability {
|
||||
fn default() -> Self {
|
||||
Self::new(ObservabilityConfig::default())
|
||||
}
|
||||
}
|
||||
|
||||
fn error_kind(error: &crate::types::AggregateError) -> &'static str {
|
||||
match error {
|
||||
crate::types::AggregateError::TenantAccessDenied { .. } => "tenant_access_denied",
|
||||
crate::types::AggregateError::TenantNotHosted { .. } => "tenant_not_hosted",
|
||||
crate::types::AggregateError::TenantDraining { .. } => "tenant_draining",
|
||||
crate::types::AggregateError::ValidationError(_) => "validation",
|
||||
crate::types::AggregateError::VersionConflict { .. } => "version_conflict",
|
||||
crate::types::AggregateError::StorageError(_) => "storage",
|
||||
crate::types::AggregateError::StreamError(_) => "stream",
|
||||
crate::types::AggregateError::RehydrationError(_) => "rehydration",
|
||||
crate::types::AggregateError::DecideError(_) => "decide",
|
||||
crate::types::AggregateError::ApplyError(_) => "apply",
|
||||
crate::types::AggregateError::NotFound(_) => "not_found",
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{AggregateError, TenantId};

    /// Shared fixture: span with the canonical test identifiers.
    fn span_for(obs: &Observability) -> CommandSpan {
        obs.start_command_span("agg-123", "Account", "tenant-a", "cmd-456", None, None)
    }

    #[test]
    fn observability_config_defaults() {
        let cfg = ObservabilityConfig::default();
        assert_eq!(cfg.service_name, "aggregate");
        assert_eq!(cfg.environment, "development");
        assert!(cfg.enable_metrics);
    }

    #[test]
    fn observability_config_builder() {
        let cfg = ObservabilityConfig::default()
            .with_service_name("my-service")
            .with_environment("production")
            .without_metrics();

        assert_eq!(cfg.service_name, "my-service");
        assert_eq!(cfg.environment, "production");
        assert!(!cfg.enable_metrics);
    }

    #[test]
    fn command_span_tracks_elapsed_time() {
        let span = CommandSpan::new("agg-123", "Account", "tenant-a", "cmd-456", None, None);
        std::thread::sleep(std::time::Duration::from_millis(10));
        assert!(span.elapsed() >= std::time::Duration::from_millis(10));
    }

    #[test]
    fn observability_records_success() {
        let obs = Observability::default();
        let span = span_for(&obs);

        obs.record_command_success(&span, 3);

        assert!(obs.export_metrics().contains("aggregate_commands_total"));
    }

    #[test]
    fn observability_records_tenant_error() {
        let obs = Observability::default();
        let span = span_for(&obs);

        obs.record_command_error(
            &span,
            &AggregateError::TenantAccessDenied {
                tenant_id: TenantId::new("other-tenant"),
            },
        );

        assert!(obs.export_metrics().contains("aggregate_tenant_errors_total"));
    }

    #[test]
    fn observability_records_version_conflict() {
        let obs = Observability::default();
        let span = span_for(&obs);

        obs.record_command_error(
            &span,
            &AggregateError::VersionConflict {
                expected: crate::types::Version::from(5),
                actual: crate::types::Version::from(4),
            },
        );

        assert!(obs
            .export_metrics()
            .contains("aggregate_version_conflicts_total"));
    }

    #[test]
    fn observability_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<Observability>();
        assert_send_sync::<CommandSpan>();
    }
}
|
||||
267
aggregate/src/placement.rs
Normal file
267
aggregate/src/placement.rs
Normal file
@@ -0,0 +1,267 @@
|
||||
use crate::observability::Observability;
|
||||
use crate::types::{AggregateError, TenantId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct TenantStatus {
|
||||
pub tenant_id: TenantId,
|
||||
pub hosted: bool,
|
||||
pub accepting: bool,
|
||||
pub draining: bool,
|
||||
pub in_flight: u64,
|
||||
}
|
||||
|
||||
pub struct TenantPlacementManager {
|
||||
hosted: RwLock<HashSet<String>>,
|
||||
draining: RwLock<HashSet<String>>,
|
||||
in_flight: RwLock<HashMap<String, u64>>,
|
||||
observability: Arc<Observability>,
|
||||
}
|
||||
|
||||
impl TenantPlacementManager {
|
||||
pub fn new(observability: Arc<Observability>) -> Self {
|
||||
Self {
|
||||
hosted: RwLock::new(HashSet::new()),
|
||||
draining: RwLock::new(HashSet::new()),
|
||||
in_flight: RwLock::new(HashMap::new()),
|
||||
observability,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn set_hosted_tenants(&self, tenant_ids: impl IntoIterator<Item = String>) {
|
||||
let mut hosted = self.hosted.write().await;
|
||||
hosted.clear();
|
||||
hosted.extend(tenant_ids);
|
||||
}
|
||||
|
||||
pub async fn apply_placement_map(&self, shard_id: &str, placement: &HashMap<String, String>) {
|
||||
let tenants = placement
|
||||
.iter()
|
||||
.filter_map(|(tenant_id, assigned)| {
|
||||
if assigned == shard_id {
|
||||
Some(tenant_id.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
self.set_hosted_tenants(tenants).await;
|
||||
}
|
||||
|
||||
pub async fn is_hosted(&self, tenant_id: &TenantId) -> bool {
|
||||
if tenant_id.as_str().is_empty() {
|
||||
return true;
|
||||
}
|
||||
self.hosted.read().await.contains(tenant_id.as_str())
|
||||
}
|
||||
|
||||
pub async fn is_draining(&self, tenant_id: &TenantId) -> bool {
|
||||
self.draining.read().await.contains(tenant_id.as_str())
|
||||
}
|
||||
|
||||
pub async fn begin_command(
|
||||
self: &Arc<Self>,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<TenantCommandGuard, AggregateError> {
|
||||
if !self.is_hosted(tenant_id).await {
|
||||
return Err(AggregateError::TenantNotHosted {
|
||||
tenant_id: tenant_id.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
if self.is_draining(tenant_id).await {
|
||||
return Err(AggregateError::TenantDraining {
|
||||
tenant_id: tenant_id.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
let mut map = self.in_flight.write().await;
|
||||
let counter = map.entry(tenant_id.as_str().to_string()).or_insert(0);
|
||||
*counter += 1;
|
||||
let value = *counter;
|
||||
drop(map);
|
||||
|
||||
self.observability
|
||||
.metrics()
|
||||
.set_in_flight(tenant_id.as_str(), value);
|
||||
|
||||
Ok(TenantCommandGuard {
|
||||
tenant_id: tenant_id.clone(),
|
||||
manager: self.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn drain_tenant(&self, tenant_id: &TenantId) {
|
||||
if tenant_id.as_str().is_empty() {
|
||||
return;
|
||||
}
|
||||
let mut draining = self.draining.write().await;
|
||||
draining.insert(tenant_id.as_str().to_string());
|
||||
}
|
||||
|
||||
pub async fn undrain_tenant(&self, tenant_id: &TenantId) {
|
||||
let mut draining = self.draining.write().await;
|
||||
draining.remove(tenant_id.as_str());
|
||||
}
|
||||
|
||||
pub async fn wait_drained(&self, tenant_id: &TenantId) {
|
||||
loop {
|
||||
let in_flight = self
|
||||
.in_flight
|
||||
.read()
|
||||
.await
|
||||
.get(tenant_id.as_str())
|
||||
.copied()
|
||||
.unwrap_or(0);
|
||||
if in_flight == 0 {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(std::time::Duration::from_millis(10)).await;
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn wait_drained_with_timeout(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
timeout: std::time::Duration,
|
||||
) -> bool {
|
||||
let deadline = tokio::time::Instant::now() + timeout;
|
||||
loop {
|
||||
let in_flight = self
|
||||
.in_flight
|
||||
.read()
|
||||
.await
|
||||
.get(tenant_id.as_str())
|
||||
.copied()
|
||||
.unwrap_or(0);
|
||||
if in_flight == 0 {
|
||||
return true;
|
||||
}
|
||||
if tokio::time::Instant::now() >= deadline {
|
||||
return false;
|
||||
}
|
||||
tokio::time::sleep(std::time::Duration::from_millis(10)).await;
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn tenant_status(&self, tenant_id: &TenantId) -> TenantStatus {
|
||||
let hosted = self.is_hosted(tenant_id).await;
|
||||
let draining = self.is_draining(tenant_id).await;
|
||||
let in_flight = self
|
||||
.in_flight
|
||||
.read()
|
||||
.await
|
||||
.get(tenant_id.as_str())
|
||||
.copied()
|
||||
.unwrap_or(0);
|
||||
TenantStatus {
|
||||
tenant_id: tenant_id.clone(),
|
||||
hosted,
|
||||
accepting: hosted && !draining,
|
||||
draining,
|
||||
in_flight,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn hosted_tenants(&self) -> Vec<TenantId> {
|
||||
let hosted = self.hosted.read().await;
|
||||
hosted.iter().map(TenantId::new).collect()
|
||||
}
|
||||
|
||||
pub async fn all_statuses(&self) -> Vec<TenantStatus> {
|
||||
let hosted = self.hosted.read().await.clone();
|
||||
let draining = self.draining.read().await.clone();
|
||||
let in_flight = self.in_flight.read().await.clone();
|
||||
|
||||
hosted
|
||||
.into_iter()
|
||||
.map(|id| {
|
||||
let tenant_id = TenantId::new(id.clone());
|
||||
let d = draining.contains(&id);
|
||||
let f = in_flight.get(&id).copied().unwrap_or(0);
|
||||
TenantStatus {
|
||||
tenant_id,
|
||||
hosted: true,
|
||||
accepting: !d,
|
||||
draining: d,
|
||||
in_flight: f,
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
async fn finish_command(&self, tenant_id: &TenantId) {
|
||||
let mut map = self.in_flight.write().await;
|
||||
let counter = map.entry(tenant_id.as_str().to_string()).or_insert(0);
|
||||
if *counter > 0 {
|
||||
*counter -= 1;
|
||||
}
|
||||
let value = *counter;
|
||||
drop(map);
|
||||
|
||||
self.observability
|
||||
.metrics()
|
||||
.set_in_flight(tenant_id.as_str(), value);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TenantCommandGuard {
|
||||
tenant_id: TenantId,
|
||||
manager: Arc<TenantPlacementManager>,
|
||||
}
|
||||
|
||||
impl Drop for TenantCommandGuard {
|
||||
fn drop(&mut self) {
|
||||
let tenant_id = self.tenant_id.clone();
|
||||
let manager = self.manager.clone();
|
||||
tokio::spawn(async move {
|
||||
manager.finish_command(&tenant_id).await;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::observability::Observability;

    /// Fresh manager wired to a default observability stack.
    fn manager() -> Arc<TenantPlacementManager> {
        Arc::new(TenantPlacementManager::new(Arc::new(Observability::default())))
    }

    #[tokio::test]
    async fn placement_rejects_unhosted_tenant() {
        let mgr = manager();
        mgr.set_hosted_tenants(vec!["tenant-a".to_string()]).await;

        let err = match mgr.begin_command(&TenantId::new("tenant-b")).await {
            Ok(_) => panic!("expected error"),
            Err(e) => e,
        };
        assert!(matches!(err, AggregateError::TenantNotHosted { .. }));
    }

    #[tokio::test]
    async fn drain_blocks_new_commands_until_in_flight_zero() {
        let mgr = manager();
        mgr.set_hosted_tenants(vec!["tenant-a".to_string()]).await;
        let tenant = TenantId::new("tenant-a");

        let guard = mgr.begin_command(&tenant).await.unwrap();
        mgr.drain_tenant(&tenant).await;

        // While draining, new commands are rejected even with one in flight.
        let err = match mgr.begin_command(&tenant).await {
            Ok(_) => panic!("expected error"),
            Err(e) => e,
        };
        assert!(matches!(err, AggregateError::TenantDraining { .. }));

        // After the in-flight command completes the tenant is still marked
        // draining, so commands stay rejected until `undrain_tenant`.
        drop(guard);
        mgr.wait_drained(&tenant).await;
        let err = match mgr.begin_command(&tenant).await {
            Ok(_) => panic!("expected error"),
            Err(e) => e,
        };
        assert!(matches!(err, AggregateError::TenantDraining { .. }));
    }
}
|
||||
594
aggregate/src/query/client.rs
Normal file
594
aggregate/src/query/client.rs
Normal file
@@ -0,0 +1,594 @@
|
||||
use super::{AggregateProjection, QueryRequest, QueryResponse};
|
||||
use crate::types::TenantId;
|
||||
use futures::stream::Stream;
|
||||
use serde_json::Value as JsonValue;
|
||||
use std::collections::HashMap;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use thiserror::Error;
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum QueryError {
|
||||
#[error("Query syntax error: {0}")]
|
||||
SyntaxError(String),
|
||||
|
||||
#[error("Connection error: {0}")]
|
||||
ConnectionError(String),
|
||||
|
||||
#[error("Tenant not found: {0}")]
|
||||
TenantNotFound(String),
|
||||
|
||||
#[error("Internal error: {0}")]
|
||||
InternalError(String),
|
||||
}
|
||||
|
||||
pub type QueryResult<T> = Result<T, QueryError>;
|
||||
|
||||
/// Connection and caching settings for `QueryClient`.
#[derive(Debug, Clone)]
pub struct QueryConfig {
    /// Remote endpoint URL; `None` in embedded mode.
    pub endpoint: Option<String>,
    /// True when the client runs fully in-process.
    pub embedded: bool,
    /// Maximum number of entries in the point-lookup LRU cache.
    pub cache_size: usize,
    /// Cache entry freshness window, in seconds.
    pub cache_ttl_seconds: u64,
}

impl Default for QueryConfig {
    /// Embedded mode, 1000-entry cache, 60s TTL.
    fn default() -> Self {
        Self {
            endpoint: None,
            embedded: true,
            cache_size: 1000,
            cache_ttl_seconds: 60,
        }
    }
}

impl QueryConfig {
    /// In-process configuration (identical to `Default`).
    pub fn embedded() -> Self {
        Self {
            embedded: true,
            ..Default::default()
        }
    }

    /// Configuration pointing at a remote query service.
    pub fn remote(endpoint: impl Into<String>) -> Self {
        Self {
            endpoint: Some(endpoint.into()),
            embedded: false,
            ..Default::default()
        }
    }
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct CacheEntry {
|
||||
projection: AggregateProjection,
|
||||
inserted_at: std::time::Instant,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct QueryClient {
|
||||
config: QueryConfig,
|
||||
storage: Arc<RwLock<HashMap<String, Vec<AggregateProjection>>>>,
|
||||
cache: Arc<RwLock<lru::LruCache<String, CacheEntry>>>,
|
||||
updates: broadcast::Sender<AggregateProjection>,
|
||||
}
|
||||
|
||||
impl QueryClient {
|
||||
pub fn new(config: QueryConfig) -> Self {
|
||||
let cache = lru::LruCache::new(
|
||||
std::num::NonZeroUsize::new(config.cache_size)
|
||||
.unwrap_or_else(|| std::num::NonZeroUsize::new(1000).unwrap()),
|
||||
);
|
||||
let (updates, _) = broadcast::channel(1024);
|
||||
|
||||
Self {
|
||||
config,
|
||||
storage: Arc::new(RwLock::new(HashMap::new())),
|
||||
cache: Arc::new(RwLock::new(cache)),
|
||||
updates,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn embedded() -> Self {
|
||||
Self::new(QueryConfig::embedded())
|
||||
}
|
||||
|
||||
fn make_key(tenant_id: &str, aggregate_id: &str) -> String {
|
||||
format!("{}:{}", tenant_id, aggregate_id)
|
||||
}
|
||||
|
||||
pub async fn index(&self, projection: AggregateProjection) -> QueryResult<()> {
|
||||
let key = Self::make_key(&projection.tenant_id, &projection.aggregate_id);
|
||||
|
||||
let _ = self.updates.send(projection.clone());
|
||||
|
||||
{
|
||||
let mut cache = self.cache.write().await;
|
||||
cache.put(
|
||||
key.clone(),
|
||||
CacheEntry {
|
||||
projection: projection.clone(),
|
||||
inserted_at: std::time::Instant::now(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
let mut storage = self.storage.write().await;
|
||||
let tenant_projections = storage.entry(projection.tenant_id.clone()).or_default();
|
||||
|
||||
if let Some(existing) = tenant_projections
|
||||
.iter_mut()
|
||||
.find(|p| p.aggregate_id == projection.aggregate_id)
|
||||
{
|
||||
*existing = projection;
|
||||
} else {
|
||||
tenant_projections.push(projection);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn subscribe(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Pin<Box<dyn Stream<Item = AggregateProjection> + Send>> {
|
||||
let tenant_id = tenant_id.as_str().to_string();
|
||||
let receiver = self.updates.subscribe();
|
||||
|
||||
Box::pin(futures::stream::unfold(
|
||||
(receiver, tenant_id),
|
||||
|(mut receiver, tenant_id)| async move {
|
||||
loop {
|
||||
match receiver.recv().await {
|
||||
Ok(proj) => {
|
||||
if proj.tenant_id == tenant_id {
|
||||
return Some((proj, (receiver, tenant_id)));
|
||||
}
|
||||
}
|
||||
Err(broadcast::error::RecvError::Lagged(_)) => continue,
|
||||
Err(broadcast::error::RecvError::Closed) => return None,
|
||||
}
|
||||
}
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
pub async fn query(&self, request: QueryRequest) -> QueryResult<QueryResponse> {
|
||||
let storage = self.storage.read().await;
|
||||
|
||||
let tenant_projections = storage.get(&request.tenant_id);
|
||||
|
||||
let projections: Vec<AggregateProjection> = match tenant_projections {
|
||||
Some(projs) => {
|
||||
let mut filtered: Vec<_> = projs
|
||||
.iter()
|
||||
.filter(|p| {
|
||||
if let Some(ref at) = request.aggregate_type {
|
||||
&p.aggregate_type == at
|
||||
} else {
|
||||
true
|
||||
}
|
||||
})
|
||||
.filter(|p| {
|
||||
if let Some(ref filter) = request.filter {
|
||||
self.evaluate_filter(&p.state, filter).unwrap_or(false)
|
||||
} else {
|
||||
true
|
||||
}
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
filtered.sort_by(|a, b| b.updated_at.cmp(&a.updated_at));
|
||||
filtered
|
||||
}
|
||||
None => Vec::new(),
|
||||
};
|
||||
|
||||
let total = projections.len();
|
||||
let offset = request.offset.unwrap_or(0);
|
||||
let limit = request.limit.unwrap_or(100);
|
||||
|
||||
let results: Vec<AggregateProjection> =
|
||||
projections.into_iter().skip(offset).take(limit).collect();
|
||||
|
||||
Ok(QueryResponse::from_results(results, total, Some(limit)))
|
||||
}
|
||||
|
||||
pub async fn get(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
aggregate_id: &str,
|
||||
) -> QueryResult<Option<AggregateProjection>> {
|
||||
let key = Self::make_key(tenant_id.as_str(), aggregate_id);
|
||||
|
||||
{
|
||||
let mut cache = self.cache.write().await;
|
||||
if let Some(entry) = cache.get(&key) {
|
||||
let elapsed = entry.inserted_at.elapsed().as_secs();
|
||||
if elapsed < self.config.cache_ttl_seconds {
|
||||
return Ok(Some(entry.projection.clone()));
|
||||
}
|
||||
cache.pop(&key);
|
||||
}
|
||||
}
|
||||
|
||||
let storage = self.storage.read().await;
|
||||
let tenant_projections = storage.get(tenant_id.as_str());
|
||||
|
||||
Ok(tenant_projections.and_then(|projs| {
|
||||
projs
|
||||
.iter()
|
||||
.find(|p| p.aggregate_id == aggregate_id)
|
||||
.cloned()
|
||||
}))
|
||||
}
|
||||
|
||||
pub async fn delete(&self, tenant_id: &TenantId, aggregate_id: &str) -> QueryResult<bool> {
|
||||
let key = Self::make_key(tenant_id.as_str(), aggregate_id);
|
||||
|
||||
{
|
||||
let mut cache = self.cache.write().await;
|
||||
cache.pop(&key);
|
||||
}
|
||||
|
||||
let mut storage = self.storage.write().await;
|
||||
if let Some(tenant_projections) = storage.get_mut(tenant_id.as_str()) {
|
||||
let len_before = tenant_projections.len();
|
||||
tenant_projections.retain(|p| p.aggregate_id != aggregate_id);
|
||||
return Ok(tenant_projections.len() < len_before);
|
||||
}
|
||||
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
pub async fn clear_tenant(&self, tenant_id: &TenantId) -> QueryResult<usize> {
|
||||
let mut storage = self.storage.write().await;
|
||||
let count = storage
|
||||
.remove(tenant_id.as_str())
|
||||
.map(|v| v.len())
|
||||
.unwrap_or(0);
|
||||
|
||||
let mut cache = self.cache.write().await;
|
||||
let prefix = format!("{}:", tenant_id.as_str());
|
||||
let keys_to_remove: Vec<_> = cache
|
||||
.iter()
|
||||
.filter(|(k, _)| k.starts_with(&prefix))
|
||||
.map(|(k, _)| k.clone())
|
||||
.collect();
|
||||
|
||||
for key in keys_to_remove {
|
||||
cache.pop(&key);
|
||||
}
|
||||
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
fn evaluate_filter(&self, state: &JsonValue, filter: &str) -> QueryResult<bool> {
|
||||
let filter = filter.trim();
|
||||
|
||||
if filter.is_empty() || filter == "*" {
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
if let Some((field, op_value)) = filter.split_once('>') {
|
||||
let field = field.trim();
|
||||
let value = op_value.trim();
|
||||
return self.compare_field(state, field, value, |a, b| a > b);
|
||||
}
|
||||
|
||||
if let Some((field, op_value)) = filter.split_once('<') {
|
||||
let field = field.trim();
|
||||
let value = op_value.trim();
|
||||
return self.compare_field(state, field, value, |a, b| a < b);
|
||||
}
|
||||
|
||||
if let Some((field, op_value)) = filter.split_once("==") {
|
||||
let field = field.trim();
|
||||
let value = op_value.trim();
|
||||
return self.compare_field(state, field, value, |a, b| a == b);
|
||||
}
|
||||
|
||||
if let Some((field, op_value)) = filter.split_once("!=") {
|
||||
let field = field.trim();
|
||||
let value = op_value.trim();
|
||||
return self.compare_field(state, field, value, |a, b| a != b);
|
||||
}
|
||||
|
||||
if let Some((field, op_value)) = filter.split_once(">=") {
|
||||
let field = field.trim();
|
||||
let value = op_value.trim();
|
||||
return self.compare_field(state, field, value, |a, b| a >= b);
|
||||
}
|
||||
|
||||
if let Some((field, op_value)) = filter.split_once("<=") {
|
||||
let field = field.trim();
|
||||
let value = op_value.trim();
|
||||
return self.compare_field(state, field, value, |a, b| a <= b);
|
||||
}
|
||||
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
fn compare_field<F>(
|
||||
&self,
|
||||
state: &JsonValue,
|
||||
field: &str,
|
||||
value_str: &str,
|
||||
compare: F,
|
||||
) -> QueryResult<bool>
|
||||
where
|
||||
F: Fn(f64, f64) -> bool,
|
||||
{
|
||||
let field_value = state.get(field);
|
||||
|
||||
let field_num = match field_value {
|
||||
Some(JsonValue::Number(n)) => n.as_f64().unwrap_or(f64::NAN),
|
||||
Some(JsonValue::String(s)) => s.parse::<f64>().unwrap_or(f64::NAN),
|
||||
_ => return Ok(false),
|
||||
};
|
||||
|
||||
let compare_num = value_str.parse::<f64>().unwrap_or(f64::NAN);
|
||||
|
||||
if field_num.is_nan() || compare_num.is_nan() {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
Ok(compare(field_num, compare_num))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use futures::StreamExt;
|
||||
use serde_json::json;
|
||||
|
||||
fn create_test_client() -> QueryClient {
|
||||
QueryClient::embedded()
|
||||
}
|
||||
|
||||
fn create_test_projection(tenant: &str, id: &str, balance: i64) -> AggregateProjection {
|
||||
AggregateProjection::new(tenant, id, "Account", 1, json!({"balance": balance}))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn query_client_index_and_query() {
|
||||
let client = create_test_client();
|
||||
|
||||
let proj = create_test_projection("tenant-a", "acc-1", 100);
|
||||
client.index(proj).await.unwrap();
|
||||
|
||||
let request = QueryRequest::new("tenant-a").with_filter("balance > 50");
|
||||
|
||||
let response = client.query(request).await.unwrap();
|
||||
|
||||
assert_eq!(response.results.len(), 1);
|
||||
assert_eq!(response.results[0].aggregate_id, "acc-1");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn query_client_tenant_isolation() {
|
||||
let client = create_test_client();
|
||||
|
||||
client
|
||||
.index(create_test_projection("tenant-a", "acc-1", 100))
|
||||
.await
|
||||
.unwrap();
|
||||
client
|
||||
.index(create_test_projection("tenant-b", "acc-2", 200))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let response_a = client.query(QueryRequest::new("tenant-a")).await.unwrap();
|
||||
let response_b = client.query(QueryRequest::new("tenant-b")).await.unwrap();
|
||||
|
||||
assert_eq!(response_a.results.len(), 1);
|
||||
assert_eq!(response_b.results.len(), 1);
|
||||
assert_eq!(response_a.results[0].state["balance"], 100);
|
||||
assert_eq!(response_b.results[0].state["balance"], 200);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn query_client_filter_operations() {
|
||||
let client = create_test_client();
|
||||
|
||||
client
|
||||
.index(create_test_projection("tenant-a", "acc-1", 100))
|
||||
.await
|
||||
.unwrap();
|
||||
client
|
||||
.index(create_test_projection("tenant-a", "acc-2", 50))
|
||||
.await
|
||||
.unwrap();
|
||||
client
|
||||
.index(create_test_projection("tenant-a", "acc-3", 150))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let gt_response = client
|
||||
.query(QueryRequest::new("tenant-a").with_filter("balance > 75"))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(gt_response.results.len(), 2);
|
||||
|
||||
let lt_response = client
|
||||
.query(QueryRequest::new("tenant-a").with_filter("balance < 75"))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(lt_response.results.len(), 1);
|
||||
|
||||
let eq_response = client
|
||||
.query(QueryRequest::new("tenant-a").with_filter("balance == 100"))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(eq_response.results.len(), 1);
|
||||
}
|
||||
|
||||
    // Paging through 25 rows with limit 10 yields pages of 10/10/5 and
    // has_more is set only while rows remain.
    #[tokio::test]
    async fn query_client_pagination() {
        let client = create_test_client();

        for i in 0..25 {
            client
                .index(create_test_projection(
                    "tenant-a",
                    &format!("acc-{}", i),
                    i * 10,
                ))
                .await
                .unwrap();
        }

        let page1 = client
            .query(QueryRequest::new("tenant-a").with_limit(10))
            .await
            .unwrap();
        assert_eq!(page1.results.len(), 10);
        assert!(page1.has_more);

        let page2 = client
            .query(QueryRequest::new("tenant-a").with_limit(10).with_offset(10))
            .await
            .unwrap();
        assert_eq!(page2.results.len(), 10);

        // Final partial page: 5 rows left, no further pages.
        let page3 = client
            .query(QueryRequest::new("tenant-a").with_limit(10).with_offset(20))
            .await
            .unwrap();
        assert_eq!(page3.results.len(), 5);
        assert!(!page3.has_more);
    }
|
||||
|
||||
    // Point lookup by (tenant, aggregate id) returns the indexed projection.
    #[tokio::test]
    async fn query_client_get_by_id() {
        let client = create_test_client();

        client
            .index(create_test_projection("tenant-a", "acc-1", 100))
            .await
            .unwrap();

        let tenant = TenantId::new("tenant-a");
        let result = client.get(&tenant, "acc-1").await.unwrap();

        assert!(result.is_some());
        let proj = result.unwrap();
        assert_eq!(proj.aggregate_id, "acc-1");
        assert_eq!(proj.state["balance"], 100);
    }
|
||||
|
||||
    // delete() reports success and the projection is no longer retrievable.
    #[tokio::test]
    async fn query_client_delete() {
        let client = create_test_client();

        client
            .index(create_test_projection("tenant-a", "acc-1", 100))
            .await
            .unwrap();

        let tenant = TenantId::new("tenant-a");
        let deleted = client.delete(&tenant, "acc-1").await.unwrap();
        assert!(deleted);

        let result = client.get(&tenant, "acc-1").await.unwrap();
        assert!(result.is_none());
    }
|
||||
|
||||
    // clear_tenant removes every projection for the target tenant only;
    // other tenants' data is untouched.
    #[tokio::test]
    async fn query_client_clear_tenant() {
        let client = create_test_client();

        client
            .index(create_test_projection("tenant-a", "acc-1", 100))
            .await
            .unwrap();
        client
            .index(create_test_projection("tenant-a", "acc-2", 200))
            .await
            .unwrap();
        client
            .index(create_test_projection("tenant-b", "acc-3", 300))
            .await
            .unwrap();

        let tenant = TenantId::new("tenant-a");
        // Returns the number of projections removed.
        let count = client.clear_tenant(&tenant).await.unwrap();
        assert_eq!(count, 2);

        let response_a = client.query(QueryRequest::new("tenant-a")).await.unwrap();
        assert_eq!(response_a.results.len(), 0);

        let response_b = client.query(QueryRequest::new("tenant-b")).await.unwrap();
        assert_eq!(response_b.results.len(), 1);
    }
|
||||
|
||||
    // Re-indexing the same (tenant, aggregate id) upserts in place: one row
    // remains, carrying the newer version and state.
    #[tokio::test]
    async fn query_client_update_existing() {
        let client = create_test_client();

        client
            .index(create_test_projection("tenant-a", "acc-1", 100))
            .await
            .unwrap();
        client
            .index(AggregateProjection::new(
                "tenant-a",
                "acc-1",
                "Account",
                2,
                json!({"balance": 250}),
            ))
            .await
            .unwrap();

        let response = client.query(QueryRequest::new("tenant-a")).await.unwrap();
        assert_eq!(response.results.len(), 1);
        assert_eq!(response.results[0].version, 2);
        assert_eq!(response.results[0].state["balance"], 250);
    }
|
||||
|
||||
    // A subscriber registered before indexing receives the indexed
    // projection as its first update.
    #[tokio::test]
    async fn query_client_subscribe_receives_updates() {
        let client = create_test_client();
        let mut updates = client.subscribe(TenantId::new("tenant-a"));

        client
            .index(create_test_projection("tenant-a", "acc-1", 100))
            .await
            .unwrap();

        let next = updates.next().await.unwrap();
        assert_eq!(next.tenant_id, "tenant-a");
        assert_eq!(next.aggregate_id, "acc-1");
        assert_eq!(next.state["balance"], 100);
    }
|
||||
|
||||
    // Pins QueryConfig defaults: embedded mode, no remote endpoint,
    // 1000-entry cache with a 60-second TTL.
    #[test]
    fn query_config_defaults() {
        let config = QueryConfig::default();
        assert!(config.embedded);
        assert!(config.endpoint.is_none());
        assert_eq!(config.cache_size, 1000);
        assert_eq!(config.cache_ttl_seconds, 60);
    }
|
||||
|
||||
    // Builder methods each set their corresponding optional field.
    #[test]
    fn query_request_builder() {
        let request = QueryRequest::new("tenant-a")
            .with_aggregate_type("Account")
            .with_filter("balance > 100")
            .with_limit(50)
            .with_offset(10);

        assert_eq!(request.tenant_id, "tenant-a");
        assert_eq!(request.aggregate_type, Some("Account".to_string()));
        assert_eq!(request.filter, Some("balance > 100".to_string()));
        assert_eq!(request.limit, Some(50));
        assert_eq!(request.offset, Some(10));
    }
|
||||
}
|
||||
193
aggregate/src/query/mod.rs
Normal file
193
aggregate/src/query/mod.rs
Normal file
@@ -0,0 +1,193 @@
|
||||
mod client;
|
||||
mod projection;
|
||||
|
||||
pub use client::{QueryClient, QueryConfig, QueryError, QueryResult};
|
||||
pub use projection::{ProjectionConfig, StateProjection};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value as JsonValue;
|
||||
|
||||
/// Denormalized read-model row: the latest known state of one aggregate
/// within one tenant, as produced by event projection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AggregateProjection {
    pub tenant_id: String,
    pub aggregate_id: String,
    pub aggregate_type: String,
    /// Aggregate version this state snapshot reflects.
    pub version: u64,
    /// Arbitrary JSON snapshot of the aggregate state.
    pub state: JsonValue,
    /// Set to `Utc::now()` at construction time.
    pub updated_at: chrono::DateTime<chrono::Utc>,
}

impl AggregateProjection {
    /// Builds a projection stamped with the current UTC time.
    pub fn new(
        tenant_id: impl Into<String>,
        aggregate_id: impl Into<String>,
        aggregate_type: impl Into<String>,
        version: u64,
        state: JsonValue,
    ) -> Self {
        Self {
            tenant_id: tenant_id.into(),
            aggregate_id: aggregate_id.into(),
            aggregate_type: aggregate_type.into(),
            version,
            state,
            updated_at: chrono::Utc::now(),
        }
    }
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct QueryRequest {
|
||||
pub tenant_id: String,
|
||||
pub aggregate_type: Option<String>,
|
||||
pub filter: Option<String>,
|
||||
pub limit: Option<usize>,
|
||||
pub offset: Option<usize>,
|
||||
}
|
||||
|
||||
impl QueryRequest {
|
||||
pub fn new(tenant_id: impl Into<String>) -> Self {
|
||||
Self {
|
||||
tenant_id: tenant_id.into(),
|
||||
aggregate_type: None,
|
||||
filter: None,
|
||||
limit: None,
|
||||
offset: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_aggregate_type(mut self, aggregate_type: impl Into<String>) -> Self {
|
||||
self.aggregate_type = Some(aggregate_type.into());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_filter(mut self, filter: impl Into<String>) -> Self {
|
||||
self.filter = Some(filter.into());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_limit(mut self, limit: usize) -> Self {
|
||||
self.limit = Some(limit);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_offset(mut self, offset: usize) -> Self {
|
||||
self.offset = Some(offset);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct QueryResponse {
|
||||
pub results: Vec<AggregateProjection>,
|
||||
pub total: usize,
|
||||
pub has_more: bool,
|
||||
}
|
||||
|
||||
impl QueryResponse {
|
||||
pub fn empty() -> Self {
|
||||
Self {
|
||||
results: Vec::new(),
|
||||
total: 0,
|
||||
has_more: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_results(
|
||||
results: Vec<AggregateProjection>,
|
||||
total: usize,
|
||||
limit: Option<usize>,
|
||||
) -> Self {
|
||||
let has_more = limit.is_some_and(|l| results.len() == l && total > results.len());
|
||||
Self {
|
||||
results,
|
||||
total,
|
||||
has_more,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct QueryServer {
|
||||
query: QueryClient,
|
||||
}
|
||||
|
||||
impl QueryServer {
|
||||
pub fn new(query: QueryClient) -> Self {
|
||||
Self { query }
|
||||
}
|
||||
|
||||
pub fn query_client(&self) -> &QueryClient {
|
||||
&self.query
|
||||
}
|
||||
|
||||
pub async fn handle(&self, request: QueryRequest) -> QueryResult<QueryResponse> {
|
||||
self.query.query(request).await
|
||||
}
|
||||
|
||||
pub async fn handle_raw(
|
||||
&self,
|
||||
tenant_id: impl Into<String>,
|
||||
aggregate_type: Option<String>,
|
||||
filter: Option<String>,
|
||||
limit: Option<usize>,
|
||||
offset: Option<usize>,
|
||||
) -> QueryResult<QueryResponse> {
|
||||
let mut request = QueryRequest::new(tenant_id);
|
||||
request.aggregate_type = aggregate_type;
|
||||
request.filter = filter;
|
||||
request.limit = limit;
|
||||
request.offset = offset;
|
||||
|
||||
self.handle(request).await
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // handle_raw must scope results to the requested tenant and apply the
    // type/filter/pagination fields it was handed.
    #[tokio::test]
    async fn query_server_filters_by_tenant() {
        let query = QueryClient::embedded();
        let server = QueryServer::new(query.clone());

        // One matching row in tenant-a, one decoy in tenant-b.
        query
            .index(AggregateProjection::new(
                "tenant-a",
                "agg-1",
                "Account",
                1,
                json!({ "balance": 100 }),
            ))
            .await
            .unwrap();

        query
            .index(AggregateProjection::new(
                "tenant-b",
                "agg-2",
                "Account",
                1,
                json!({ "balance": 200 }),
            ))
            .await
            .unwrap();

        let resp = server
            .handle_raw(
                "tenant-a",
                Some("Account".to_string()),
                Some("balance > 50".to_string()),
                Some(100),
                Some(0),
            )
            .await
            .unwrap();

        // Only the tenant-a row is visible, even though tenant-b's row also
        // satisfies the filter.
        assert_eq!(resp.total, 1);
        assert_eq!(resp.results[0].tenant_id, "tenant-a");
        assert_eq!(resp.results[0].state["balance"], 100);
    }
}
|
||||
217
aggregate/src/query/projection.rs
Normal file
217
aggregate/src/query/projection.rs
Normal file
@@ -0,0 +1,217 @@
|
||||
use super::AggregateProjection;
|
||||
use crate::types::{AggregateId, AggregateType, Event, TenantId, Version};
|
||||
use serde_json::Value as JsonValue;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
/// Tuning knobs for [`StateProjection`] batch processing.
#[derive(Debug, Clone)]
pub struct ProjectionConfig {
    /// Upper bound on events processed by one `project_events` call.
    pub batch_size: usize,
    /// Projection time budget in milliseconds.
    pub projection_timeout_ms: u64,
}

impl Default for ProjectionConfig {
    /// 100-event batches with a 5-second budget.
    fn default() -> Self {
        ProjectionConfig {
            batch_size: 100,
            projection_timeout_ms: 5000,
        }
    }
}
|
||||
|
||||
/// Registry of per-aggregate-type projection handlers that turn events into
/// [`AggregateProjection`] rows.
/// NOTE(review): `config.projection_timeout_ms` is not enforced anywhere in
/// this type's visible code — confirm whether enforcement lives elsewhere.
pub struct StateProjection {
    config: ProjectionConfig,
    // Handler per aggregate-type name; RwLock allows concurrent projection
    // reads while registration takes the write lock.
    handlers: Arc<RwLock<HashMap<String, ProjectionHandler>>>,
}

// Callback mapping an event to an updated projection (or `None` to skip it).
type ProjectionHandler = Box<dyn Fn(&Event) -> Option<AggregateProjection> + Send + Sync>;

impl StateProjection {
    /// Creates an empty registry with the given config.
    pub fn new(config: ProjectionConfig) -> Self {
        Self {
            config,
            handlers: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Creates an empty registry with [`ProjectionConfig::default`].
    pub fn new_default() -> Self {
        Self::new(ProjectionConfig::default())
    }

    /// Registers (or silently replaces) the handler for `aggregate_type`.
    pub async fn register_handler<F>(&self, aggregate_type: &str, handler: F)
    where
        F: Fn(&Event) -> Option<AggregateProjection> + Send + Sync + 'static,
    {
        let mut handlers = self.handlers.write().await;
        handlers.insert(aggregate_type.to_string(), Box::new(handler));
    }

    /// Runs the handler registered for the event's aggregate type, if any.
    /// Returns `None` when no handler is registered or the handler skips.
    pub async fn project_event(&self, event: &Event) -> Option<AggregateProjection> {
        let handlers = self.handlers.read().await;
        let aggregate_type = event.aggregate_type.as_str();

        handlers.get(aggregate_type).and_then(|h| h(event))
    }

    /// Projects events in order, stopping after `config.batch_size` events.
    /// Events past the batch limit are silently dropped; events without a
    /// handler contribute nothing to the output.
    pub async fn project_events(&self, events: &[Event]) -> Vec<AggregateProjection> {
        let mut projections = Vec::with_capacity(events.len().min(self.config.batch_size));

        for event in events.iter().take(self.config.batch_size) {
            if let Some(proj) = self.project_event(event).await {
                projections.push(proj);
            }
        }

        projections
    }

    /// Fallback projection: mirrors the event payload as the aggregate state.
    pub fn default_projection_from_event(event: &Event) -> AggregateProjection {
        AggregateProjection::new(
            event.tenant_id.as_str(),
            event.aggregate_id.to_string(),
            event.aggregate_type.as_str(),
            event.version.as_u64(),
            event.payload.clone(),
        )
    }

    /// Fallback projection built from an already-computed aggregate state.
    pub fn default_projection_from_state(
        tenant_id: &TenantId,
        aggregate_id: &AggregateId,
        aggregate_type: &AggregateType,
        version: &Version,
        state: &JsonValue,
    ) -> AggregateProjection {
        AggregateProjection::new(
            tenant_id.as_str(),
            aggregate_id.to_string(),
            aggregate_type.as_str(),
            version.as_u64(),
            state.clone(),
        )
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;
    use serde_json::json;

    // Minimal Account event carrying a fixed {"amount": 100} payload.
    fn create_test_event(tenant: &str, version: u64, event_type: &str) -> Event {
        Event {
            event_id: uuid::Uuid::now_v7(),
            tenant_id: TenantId::new(tenant),
            aggregate_id: AggregateId::new_v7(),
            aggregate_type: AggregateType::from("Account"),
            version: Version::from(version),
            event_type: event_type.to_string(),
            payload: json!({"amount": 100}),
            timestamp: Utc::now(),
            command_id: uuid::Uuid::nil(),
            correlation_id: None,
            traceparent: None,
        }
    }

    // A registered handler is invoked for events of its aggregate type.
    #[tokio::test]
    async fn state_projection_registers_handler() {
        let projection = StateProjection::new_default();

        projection
            .register_handler("Account", |event| {
                Some(AggregateProjection::new(
                    event.tenant_id.as_str(),
                    event.aggregate_id.to_string(),
                    "Account",
                    event.version.as_u64(),
                    event.payload.clone(),
                ))
            })
            .await;

        let event = create_test_event("tenant-a", 1, "deposited");
        let result = projection.project_event(&event).await;

        assert!(result.is_some());
        let proj = result.unwrap();
        assert_eq!(proj.aggregate_type, "Account");
    }

    // project_events maps every event in the batch through the handler.
    #[tokio::test]
    async fn state_projection_project_events_batch() {
        let projection = StateProjection::new_default();

        projection
            .register_handler("Account", |event| {
                Some(AggregateProjection::new(
                    event.tenant_id.as_str(),
                    event.aggregate_id.to_string(),
                    "Account",
                    event.version.as_u64(),
                    event.payload.clone(),
                ))
            })
            .await;

        let events = vec![
            create_test_event("tenant-a", 1, "deposited"),
            create_test_event("tenant-a", 1, "deposited"),
            create_test_event("tenant-a", 1, "deposited"),
        ];

        let projections = projection.project_events(&events).await;

        assert_eq!(projections.len(), 3);
    }

    // With no handler registered, projection yields None rather than erroring.
    #[tokio::test]
    async fn state_projection_no_handler_returns_none() {
        let projection = StateProjection::new_default();

        let event = create_test_event("tenant-a", 1, "deposited");
        let result = projection.project_event(&event).await;

        assert!(result.is_none());
    }

    // The default event projection copies tenant, version, and payload.
    #[test]
    fn default_projection_from_event() {
        let event = create_test_event("tenant-a", 5, "deposited");
        let proj = StateProjection::default_projection_from_event(&event);

        assert_eq!(proj.tenant_id, "tenant-a");
        assert_eq!(proj.version, 5);
        assert_eq!(proj.state["amount"], 100);
    }

    // The default state projection copies all identity fields plus the state.
    #[test]
    fn default_projection_from_state() {
        let tenant_id = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();
        let aggregate_type = AggregateType::from("Account");
        let version = Version::from(10);
        let state = json!({"balance": 1000});

        let proj = StateProjection::default_projection_from_state(
            &tenant_id,
            &aggregate_id,
            &aggregate_type,
            &version,
            &state,
        );

        assert_eq!(proj.tenant_id, "tenant-a");
        assert_eq!(proj.aggregate_type, "Account");
        assert_eq!(proj.version, 10);
        assert_eq!(proj.state["balance"], 1000);
    }

    // Pins ProjectionConfig defaults.
    #[test]
    fn projection_config_defaults() {
        let config = ProjectionConfig::default();
        assert_eq!(config.batch_size, 100);
        assert_eq!(config.projection_timeout_ms, 5000);
    }
}
|
||||
270
aggregate/src/runtime/executor.rs
Normal file
270
aggregate/src/runtime/executor.rs
Normal file
@@ -0,0 +1,270 @@
|
||||
use serde_json::Value as JsonValue;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Runs a guest `decide(state, command)` program and returns the events it
/// emits. Dispatches to whichever runtime feature is compiled in; errors
/// when neither `runtime-v8` nor `runtime-wasm` is enabled.
/// NOTE(review): with BOTH features enabled, the v8 path returns first and
/// the wasm block is unreachable — confirm the features are meant to be
/// mutually exclusive.
pub async fn execute_decide_program(
    state: &JsonValue,
    command: &JsonValue,
    program: &str,
    gas_limit: u64,
    timeout: Duration,
) -> Result<Vec<JsonValue>, crate::types::AggregateError> {
    // Silence unused-parameter warnings for feature sets that skip the
    // cfg blocks below.
    let _ = (state, command, program, gas_limit, timeout);

    #[cfg(feature = "runtime-v8")]
    {
        return execute_decide_v8(state, command, program, gas_limit, timeout).await;
    }

    #[cfg(feature = "runtime-wasm")]
    {
        return execute_decide_wasm(state, command, program, gas_limit, timeout).await;
    }

    #[cfg(not(any(feature = "runtime-v8", feature = "runtime-wasm")))]
    {
        Err(crate::types::AggregateError::DecideError(
            "No runtime enabled. Enable 'runtime-v8' or 'runtime-wasm' feature.".to_string(),
        ))
    }
}
|
||||
|
||||
/// Runs a guest `apply(state, event)` program and returns the new aggregate
/// state. Mirrors [`execute_decide_program`]'s feature dispatch: v8 first,
/// then wasm, else an error when no runtime feature is enabled.
pub async fn execute_apply_program(
    state: &JsonValue,
    event: &JsonValue,
    program: &str,
    gas_limit: u64,
    timeout: Duration,
) -> Result<JsonValue, crate::types::AggregateError> {
    // Silence unused-parameter warnings for feature sets that skip the
    // cfg blocks below.
    let _ = (state, event, program, gas_limit, timeout);

    #[cfg(feature = "runtime-v8")]
    {
        return execute_apply_v8(state, event, program, gas_limit, timeout).await;
    }

    #[cfg(feature = "runtime-wasm")]
    {
        return execute_apply_wasm(state, event, program, gas_limit, timeout).await;
    }

    #[cfg(not(any(feature = "runtime-v8", feature = "runtime-wasm")))]
    {
        Err(crate::types::AggregateError::ApplyError(
            "No runtime enabled. Enable 'runtime-v8' or 'runtime-wasm' feature.".to_string(),
        ))
    }
}
|
||||
|
||||
/// Executes a guest `decide(state, command)` program inside a fresh V8
/// isolate on a blocking thread, returning the events it emits as JSON.
///
/// The program is compiled and evaluated so its top-level `decide` function
/// is defined, then called with the JSON-parsed state and command; the
/// return value is round-tripped back to Rust via `JSON.stringify`.
/// Gas metering is not implemented for the V8 path; only the wall-clock
/// `timeout` bounds execution (the blocking thread is abandoned, not
/// cancelled, on timeout).
#[cfg(feature = "runtime-v8")]
async fn execute_decide_v8(
    state: &JsonValue,
    command: &JsonValue,
    program: &str,
    gas_limit: u64,
    timeout: Duration,
) -> Result<Vec<JsonValue>, crate::types::AggregateError> {
    // Import ContextScope (used below); the previous import list pulled in
    // Scope/Array/Object/Arc, which were unused, and omitted ContextScope.
    use v8::{Context, ContextScope, Function, HandleScope, Isolate, Script};

    // Serialize inputs up front so owned Strings can move into the task.
    let state_str = serde_json::to_string(state).map_err(|e| {
        crate::types::AggregateError::DecideError(format!("State serialization: {}", e))
    })?;
    let command_str = serde_json::to_string(command).map_err(|e| {
        crate::types::AggregateError::DecideError(format!("Command serialization: {}", e))
    })?;

    // No gas accounting yet — explicitly discard to keep `-D warnings` clean
    // (matches execute_apply_v8).
    let _ = gas_limit;

    // V8 isolates are not Send; run the whole script on a blocking thread.
    let result = tokio::task::spawn_blocking(move || {
        let isolate = &mut Isolate::new(v8::CreateParams::default());

        let scope = &mut HandleScope::new(isolate);
        let context = Context::new(scope);
        let scope = &mut ContextScope::new(scope, context);

        // Compile and evaluate the program so `decide` is defined globally.
        let source =
            v8::String::new(scope, program).ok_or_else(|| "Failed to create program string")?;

        let script =
            Script::compile(scope, source, None).ok_or_else(|| "Failed to compile program")?;

        script.run(scope).ok_or_else(|| "Failed to run program")?;

        let global = context.global(scope);
        let decide_name =
            v8::String::new(scope, "decide").ok_or_else(|| "Failed to create decide string")?;

        let decide_fn = global
            .get(scope, decide_name.into())
            .and_then(|v| v8::Local::<Function>::try_from(v).ok())
            .ok_or_else(|| "decide function not found")?;

        // Marshal Rust JSON -> V8 values via JSON.parse inside the isolate.
        let state_json = v8::String::new(scope, &state_str)
            .ok_or_else(|| "Failed to create state JSON string")?;
        let state_obj =
            v8::json::parse(scope, state_json).ok_or_else(|| "Failed to parse state JSON")?;

        let command_json = v8::String::new(scope, &command_str)
            .ok_or_else(|| "Failed to create command JSON string")?;
        let command_obj =
            v8::json::parse(scope, command_json).ok_or_else(|| "Failed to parse command JSON")?;

        let args: [v8::Local<v8::Value>; 2] = [state_obj.into(), command_obj.into()];
        let result = decide_fn
            .call(scope, global.into(), &args)
            .ok_or_else(|| "decide function call failed")?;

        // Marshal the return value back out via JSON.stringify.
        let result_json =
            v8::json::stringify(scope, result).ok_or_else(|| "Failed to stringify result")?;
        let result_str = result_json.to_rust_string_lossy(scope);

        let events: Vec<JsonValue> = serde_json::from_str(&result_str)
            .map_err(|e| format!("Failed to parse result: {}", e))?;

        Ok::<_, String>(events)
    });

    let timeout_result = tokio::time::timeout(timeout, result).await;

    match timeout_result {
        Ok(Ok(Ok(events))) => Ok(events),
        Ok(Ok(Err(e))) => Err(crate::types::AggregateError::DecideError(e)),
        Ok(Err(_)) => Err(crate::types::AggregateError::DecideError(
            "Task join error".to_string(),
        )),
        Err(_) => Err(crate::types::AggregateError::DecideError(
            "Execution timeout".to_string(),
        )),
    }
}
|
||||
|
||||
/// Executes a guest `apply(state, event)` program inside a fresh V8 isolate
/// on a blocking thread, returning the new aggregate state as JSON.
/// Gas metering is not implemented; only the wall-clock `timeout` bounds
/// execution (the blocking thread is abandoned, not cancelled, on timeout).
#[cfg(feature = "runtime-v8")]
async fn execute_apply_v8(
    state: &JsonValue,
    event: &JsonValue,
    program: &str,
    gas_limit: u64,
    timeout: Duration,
) -> Result<JsonValue, crate::types::AggregateError> {
    use v8::{Context, ContextScope, Function, HandleScope, Isolate, Script};

    // Serialize inputs up front so owned Strings can move into the task.
    let state_str = serde_json::to_string(state).map_err(|e| {
        crate::types::AggregateError::ApplyError(format!("State serialization: {}", e))
    })?;
    let event_str = serde_json::to_string(event).map_err(|e| {
        crate::types::AggregateError::ApplyError(format!("Event serialization: {}", e))
    })?;

    // No gas accounting yet — explicitly discarded.
    let _ = gas_limit;

    // V8 isolates are not Send; run the whole script on a blocking thread.
    let result = tokio::task::spawn_blocking(move || {
        let isolate = &mut Isolate::new(v8::CreateParams::default());

        let scope = &mut HandleScope::new(isolate);
        let context = Context::new(scope);
        let scope = &mut ContextScope::new(scope, context);

        // Compile and evaluate the program so `apply` is defined globally.
        let source =
            v8::String::new(scope, program).ok_or_else(|| "Failed to create program string")?;

        let script =
            Script::compile(scope, source, None).ok_or_else(|| "Failed to compile program")?;

        script.run(scope).ok_or_else(|| "Failed to run program")?;

        let global = context.global(scope);
        let apply_name =
            v8::String::new(scope, "apply").ok_or_else(|| "Failed to create apply string")?;

        let apply_fn = global
            .get(scope, apply_name.into())
            .and_then(|v| v8::Local::<Function>::try_from(v).ok())
            .ok_or_else(|| "apply function not found")?;

        // Marshal Rust JSON -> V8 values via JSON.parse inside the isolate.
        let state_json = v8::String::new(scope, &state_str)
            .ok_or_else(|| "Failed to create state JSON string")?;
        let state_obj =
            v8::json::parse(scope, state_json).ok_or_else(|| "Failed to parse state JSON")?;

        let event_json = v8::String::new(scope, &event_str)
            .ok_or_else(|| "Failed to create event JSON string")?;
        let event_obj =
            v8::json::parse(scope, event_json).ok_or_else(|| "Failed to parse event JSON")?;

        let args: [v8::Local<v8::Value>; 2] = [state_obj.into(), event_obj.into()];
        let result = apply_fn
            .call(scope, global.into(), &args)
            .ok_or_else(|| "apply function call failed")?;

        // Marshal the return value back out via JSON.stringify.
        let result_json =
            v8::json::stringify(scope, result).ok_or_else(|| "Failed to stringify result")?;
        let result_str = result_json.to_rust_string_lossy(scope);

        let new_state: JsonValue = serde_json::from_str(&result_str)
            .map_err(|e| format!("Failed to parse result: {}", e))?;

        Ok::<_, String>(new_state)
    });

    let timeout_result = tokio::time::timeout(timeout, result).await;

    match timeout_result {
        Ok(Ok(Ok(new_state))) => Ok(new_state),
        Ok(Ok(Err(e))) => Err(crate::types::AggregateError::ApplyError(e)),
        Ok(Err(_)) => Err(crate::types::AggregateError::ApplyError(
            "Task join error".to_string(),
        )),
        Err(_) => Err(crate::types::AggregateError::ApplyError(
            "Execution timeout".to_string(),
        )),
    }
}
|
||||
|
||||
/// Placeholder for the WASM-backed decide path; always errors until a WASM
/// runtime implementation lands.
#[cfg(feature = "runtime-wasm")]
async fn execute_decide_wasm(
    state: &JsonValue,
    command: &JsonValue,
    _program: &str,
    _gas_limit: u64,
    _timeout: Duration,
) -> Result<Vec<JsonValue>, crate::types::AggregateError> {
    let _ = (state, command);
    Err(crate::types::AggregateError::DecideError(
        "WASM runtime not yet implemented".to_string(),
    ))
}
|
||||
|
||||
/// Placeholder for the WASM-backed apply path; always errors until a WASM
/// runtime implementation lands.
#[cfg(feature = "runtime-wasm")]
async fn execute_apply_wasm(
    state: &JsonValue,
    event: &JsonValue,
    _program: &str,
    _gas_limit: u64,
    _timeout: Duration,
) -> Result<JsonValue, crate::types::AggregateError> {
    let _ = (state, event);
    Err(crate::types::AggregateError::ApplyError(
        "WASM runtime not yet implemented".to_string(),
    ))
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // Without any runtime feature compiled in, decide must fail fast with a
    // DecideError (the inner block is cfg'd out otherwise, so the test is a
    // no-op under runtime-v8/runtime-wasm builds).
    #[tokio::test]
    async fn no_runtime_returns_error() {
        #[cfg(not(any(feature = "runtime-v8", feature = "runtime-wasm")))]
        {
            let state = json!({});
            let command = json!({});
            let result =
                execute_decide_program(&state, &command, "program", 1000, Duration::from_secs(1))
                    .await;
            assert!(result.is_err());
            assert!(matches!(
                result.unwrap_err(),
                crate::types::AggregateError::DecideError(_)
            ));
        }
    }
}
|
||||
484
aggregate/src/runtime/mod.rs
Normal file
484
aggregate/src/runtime/mod.rs
Normal file
@@ -0,0 +1,484 @@
|
||||
mod executor;
|
||||
|
||||
use lru::LruCache;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use crate::types::{AggregateError, Command, Event};
|
||||
use serde_json::Value as JsonValue;
|
||||
|
||||
pub use executor::{execute_apply_program, execute_decide_program};
|
||||
|
||||
/// Default gas budget handed to a guest program execution.
const DEFAULT_GAS_LIMIT: u64 = 1_000_000;
/// Default wall-clock execution cap, in milliseconds.
const DEFAULT_TIMEOUT_MS: u64 = 5_000;
/// Capacity of the LRU program-source cache.
const CACHE_SIZE: usize = 100;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ExecutorConfig {
|
||||
pub gas_limit: u64,
|
||||
pub timeout: Duration,
|
||||
pub cache_programs: bool,
|
||||
pub mock_runtime: bool,
|
||||
}
|
||||
|
||||
impl Default for ExecutorConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
gas_limit: DEFAULT_GAS_LIMIT,
|
||||
timeout: Duration::from_millis(DEFAULT_TIMEOUT_MS),
|
||||
cache_programs: true,
|
||||
mock_runtime: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ExecutorConfig {
|
||||
pub fn with_gas_limit(mut self, limit: u64) -> Self {
|
||||
self.gas_limit = limit;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_timeout(mut self, timeout: Duration) -> Self {
|
||||
self.timeout = timeout;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn without_cache(mut self) -> Self {
|
||||
self.cache_programs = false;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_mock_runtime(mut self) -> Self {
|
||||
self.mock_runtime = true;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Cheap content-addressed cache key derived from a program's source text
/// via the std `DefaultHasher` (fast, NOT cryptographic, and not stable
/// across Rust releases — suitable only for in-process caching).
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
pub struct ProgramHash(String);

impl ProgramHash {
    /// Hashes `program` and renders the 64-bit digest as lowercase hex.
    pub fn new(program: &str) -> Self {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut state = DefaultHasher::new();
        Hash::hash(program, &mut state);
        ProgramHash(format!("{:x}", state.finish()))
    }
}
|
||||
|
||||
/// Events emitted by a successful `decide` execution.
#[derive(Debug, Clone)]
pub struct DecideResult {
    pub events: Vec<JsonValue>,
}
|
||||
|
||||
/// New aggregate state produced by a successful `apply` execution.
#[derive(Debug, Clone)]
pub struct ApplyResult {
    pub new_state: JsonValue,
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RuntimeExecutor {
|
||||
config: ExecutorConfig,
|
||||
program_cache: Arc<RwLock<LruCache<ProgramHash, String>>>,
|
||||
}
|
||||
|
||||
impl RuntimeExecutor {
|
||||
pub fn new() -> Self {
|
||||
Self::with_config(ExecutorConfig::default())
|
||||
}
|
||||
|
||||
pub fn with_config(config: ExecutorConfig) -> Self {
|
||||
let cache_size = NonZeroUsize::new(CACHE_SIZE).unwrap();
|
||||
Self {
|
||||
config,
|
||||
program_cache: Arc::new(RwLock::new(LruCache::new(cache_size))),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn execute_decide(
|
||||
&self,
|
||||
state: &JsonValue,
|
||||
command: &Command,
|
||||
decide_program: &str,
|
||||
) -> Result<DecideResult, AggregateError> {
|
||||
if self.config.mock_runtime {
|
||||
let events = mock_decide(state, command)?;
|
||||
return Ok(DecideResult { events });
|
||||
}
|
||||
|
||||
if self.config.cache_programs {
|
||||
let hash = ProgramHash::new(decide_program);
|
||||
let mut cache = self.program_cache.write().await;
|
||||
cache.put(hash.clone(), decide_program.to_string());
|
||||
}
|
||||
|
||||
let command_json = serde_json::to_value(command).map_err(|e| {
|
||||
AggregateError::DecideError(format!("Command serialization failed: {}", e))
|
||||
})?;
|
||||
|
||||
let result = executor::execute_decide_program(
|
||||
state,
|
||||
&command_json,
|
||||
decide_program,
|
||||
self.config.gas_limit,
|
||||
self.config.timeout,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(DecideResult { events: result })
|
||||
}
|
||||
|
||||
pub async fn execute_apply(
|
||||
&self,
|
||||
state: &JsonValue,
|
||||
event: &Event,
|
||||
apply_program: &str,
|
||||
) -> Result<ApplyResult, AggregateError> {
|
||||
if self.config.mock_runtime {
|
||||
let new_state = mock_apply(state, event)?;
|
||||
return Ok(ApplyResult { new_state });
|
||||
}
|
||||
|
||||
if self.config.cache_programs {
|
||||
let hash = ProgramHash::new(apply_program);
|
||||
let mut cache = self.program_cache.write().await;
|
||||
cache.put(hash.clone(), apply_program.to_string());
|
||||
}
|
||||
|
||||
let event_json = serde_json::to_value(event).map_err(|e| {
|
||||
AggregateError::ApplyError(format!("Event serialization failed: {}", e))
|
||||
})?;
|
||||
|
||||
let result = executor::execute_apply_program(
|
||||
state,
|
||||
&event_json,
|
||||
apply_program,
|
||||
self.config.gas_limit,
|
||||
self.config.timeout,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(ApplyResult { new_state: result })
|
||||
}
|
||||
|
||||
pub async fn execute_apply_raw(
|
||||
&self,
|
||||
state: &JsonValue,
|
||||
event: &JsonValue,
|
||||
apply_program: &str,
|
||||
) -> Result<ApplyResult, AggregateError> {
|
||||
if self.config.mock_runtime {
|
||||
let _ = apply_program;
|
||||
return Err(AggregateError::ApplyError(
|
||||
"mock_runtime does not support execute_apply_raw".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
if self.config.cache_programs {
|
||||
let hash = ProgramHash::new(apply_program);
|
||||
let mut cache = self.program_cache.write().await;
|
||||
cache.put(hash.clone(), apply_program.to_string());
|
||||
}
|
||||
|
||||
let result = executor::execute_apply_program(
|
||||
state,
|
||||
event,
|
||||
apply_program,
|
||||
self.config.gas_limit,
|
||||
self.config.timeout,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(ApplyResult { new_state: result })
|
||||
}
|
||||
|
||||
pub fn config(&self) -> &ExecutorConfig {
|
||||
&self.config
|
||||
}
|
||||
|
||||
pub async fn cache_size(&self) -> usize {
|
||||
self.program_cache.read().await.len()
|
||||
}
|
||||
|
||||
pub async fn clear_cache(&self) {
|
||||
self.program_cache.write().await.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// `Default` delegates to [`RuntimeExecutor::new`] (default gas limit,
/// timeout, and caching enabled).
impl Default for RuntimeExecutor {
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
fn mock_decide(state: &JsonValue, command: &Command) -> Result<Vec<JsonValue>, AggregateError> {
|
||||
let cmd_type = command
|
||||
.payload
|
||||
.get("type")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("");
|
||||
let amount = command
|
||||
.payload
|
||||
.get("amount")
|
||||
.and_then(|v| v.as_i64())
|
||||
.unwrap_or(0);
|
||||
|
||||
match cmd_type {
|
||||
"deposit" => Ok(vec![
|
||||
serde_json::json!({ "type": "deposited", "amount": amount }),
|
||||
]),
|
||||
"withdraw" => {
|
||||
let balance = state.get("balance").and_then(|v| v.as_i64()).unwrap_or(0);
|
||||
if balance < amount {
|
||||
Err(AggregateError::DecideError(
|
||||
"Insufficient funds".to_string(),
|
||||
))
|
||||
} else {
|
||||
Ok(vec![
|
||||
serde_json::json!({ "type": "withdrawn", "amount": amount }),
|
||||
])
|
||||
}
|
||||
}
|
||||
_ => Ok(Vec::new()),
|
||||
}
|
||||
}
|
||||
|
||||
fn mock_apply(state: &JsonValue, event: &Event) -> Result<JsonValue, AggregateError> {
|
||||
let mut new_state = match state {
|
||||
JsonValue::Object(map) => JsonValue::Object(map.clone()),
|
||||
_ => serde_json::json!({}),
|
||||
};
|
||||
|
||||
let balance = new_state
|
||||
.get("balance")
|
||||
.and_then(|v| v.as_i64())
|
||||
.unwrap_or(0);
|
||||
|
||||
let amount = event
|
||||
.payload
|
||||
.get("amount")
|
||||
.and_then(|v| v.as_i64())
|
||||
.unwrap_or(0);
|
||||
|
||||
let next_balance = match event.event_type.as_str() {
|
||||
"deposited" => balance + amount,
|
||||
"withdrawn" => balance - amount,
|
||||
_ => balance,
|
||||
};
|
||||
|
||||
if let JsonValue::Object(map) = &mut new_state {
|
||||
map.insert("balance".to_string(), JsonValue::from(next_balance));
|
||||
}
|
||||
|
||||
Ok(new_state)
|
||||
}
|
||||
|
||||
// Executor tests. Several tests tolerate two outcomes because the JS
// runtime may be unavailable in some build configurations: either the
// program executes, or the executor reports the runtime as missing.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{AggregateId, AggregateType, TenantId, Version};
    use serde_json::json;
    use std::collections::HashMap;

    // Minimal decide() program for a bank-account aggregate.
    const DECIDE_PROGRAM: &str = r#"
        function decide(state, command) {
            if (command.type === "deposit") {
                return [{ type: "deposited", amount: command.amount }];
            }
            if (command.type === "withdraw") {
                if (state.balance < command.amount) {
                    throw new Error("Insufficient funds");
                }
                return [{ type: "withdrawn", amount: command.amount }];
            }
            return [];
        }
    "#;

    // Matching apply() program folding events into the balance.
    const APPLY_PROGRAM: &str = r#"
        function apply(state, event) {
            if (event.type === "deposited") {
                state.balance = (state.balance || 0) + event.amount;
            }
            if (event.type === "withdrawn") {
                state.balance = (state.balance || 0) - event.amount;
            }
            return state;
        }
    "#;

    // Builds a command with fresh v7 ids and the given payload fields.
    fn make_command(cmd_type: &str, amount: i64, tenant_id: &TenantId) -> Command {
        Command {
            command_id: uuid::Uuid::now_v7(),
            tenant_id: tenant_id.clone(),
            aggregate_id: AggregateId::new_v7(),
            aggregate_type: AggregateType::from("Account"),
            payload: json!({ "type": cmd_type, "amount": amount }),
            metadata: HashMap::new(),
        }
    }

    // Fresh executor exposes default gas limit and caching enabled.
    #[tokio::test]
    async fn executor_has_defaults() {
        let executor = RuntimeExecutor::new();
        assert_eq!(executor.config().gas_limit, DEFAULT_GAS_LIMIT);
        assert!(executor.config().cache_programs);
    }

    // Builder methods set each field independently.
    #[tokio::test]
    async fn config_builder_works() {
        let config = ExecutorConfig::default()
            .with_gas_limit(500_000)
            .with_timeout(Duration::from_millis(1000))
            .without_cache();

        assert_eq!(config.gas_limit, 500_000);
        assert_eq!(config.timeout, Duration::from_millis(1000));
        assert!(!config.cache_programs);
    }

    // Same source hashes equal; different source hashes differ.
    #[tokio::test]
    async fn program_hash_is_consistent() {
        let h1 = ProgramHash::new("test program");
        let h2 = ProgramHash::new("test program");
        assert_eq!(h1, h2);

        let h3 = ProgramHash::new("different program");
        assert_ne!(h1, h3);
    }

    // A valid deposit yields at least one event (or a runtime-missing error).
    #[tokio::test]
    async fn decide_returns_events_for_deposit() {
        let executor = RuntimeExecutor::new();
        let tenant_id = TenantId::new("test-tenant");
        let state = json!({ "balance": 100 });
        let command = make_command("deposit", 50, &tenant_id);

        let result = executor
            .execute_decide(&state, &command, DECIDE_PROGRAM)
            .await;

        match result {
            Ok(decide_result) => {
                assert!(!decide_result.events.is_empty());
            }
            Err(AggregateError::DecideError(msg)) => {
                // Acceptable only when the JS runtime is unavailable.
                assert!(
                    msg.contains("runtime")
                        || msg.contains("not available")
                        || msg.contains("not implemented")
                );
            }
            Err(e) => panic!("Unexpected error: {:?}", e),
        }
    }

    // Overdrawing withdrawal is always a DecideError (program throw or
    // missing runtime both surface as DecideError).
    #[tokio::test]
    async fn decide_rejects_invalid_withdraw() {
        let executor = RuntimeExecutor::new();
        let tenant_id = TenantId::new("test-tenant");
        let state = json!({ "balance": 10 });
        let command = make_command("withdraw", 100, &tenant_id);

        let result = executor
            .execute_decide(&state, &command, DECIDE_PROGRAM)
            .await;

        assert!(matches!(result, Err(AggregateError::DecideError(_))));
    }

    // Two identical calls must agree on success/failure.
    #[tokio::test]
    async fn decide_is_deterministic() {
        let executor = RuntimeExecutor::new();
        let tenant_id = TenantId::new("test-tenant");
        let state = json!({ "balance": 100 });
        let command = make_command("deposit", 50, &tenant_id);

        let r1 = executor
            .execute_decide(&state, &command, DECIDE_PROGRAM)
            .await;
        let r2 = executor
            .execute_decide(&state, &command, DECIDE_PROGRAM)
            .await;

        assert_eq!(r1.is_ok(), r2.is_ok());
    }

    // Applying a "deposited" event produces an object state (or a
    // runtime-missing ApplyError).
    #[tokio::test]
    async fn apply_transitions_state() {
        let executor = RuntimeExecutor::new();
        let tenant_id = TenantId::new("test-tenant");
        let state = json!({ "balance": 100 });
        let event = Event {
            event_id: uuid::Uuid::now_v7(),
            tenant_id,
            aggregate_id: AggregateId::new_v7(),
            aggregate_type: AggregateType::from("Account"),
            event_type: "deposited".to_string(),
            version: Version::from(1),
            payload: json!({ "amount": 50 }),
            command_id: uuid::Uuid::now_v7(),
            timestamp: chrono::Utc::now(),
            correlation_id: None,
            traceparent: None,
        };

        let result = executor.execute_apply(&state, &event, APPLY_PROGRAM).await;

        match result {
            Ok(apply_result) => {
                assert!(apply_result.new_state.is_object());
            }
            Err(AggregateError::ApplyError(msg)) => {
                assert!(
                    msg.contains("runtime")
                        || msg.contains("not available")
                        || msg.contains("not implemented")
                );
            }
            Err(e) => panic!("Unexpected error: {:?}", e),
        }
    }

    // The program is cached even when execution itself fails.
    #[tokio::test]
    async fn cache_stores_programs() {
        let executor = RuntimeExecutor::new();
        let tenant_id = TenantId::new("test-tenant");
        let state = json!({ "balance": 100 });
        let command = make_command("deposit", 50, &tenant_id);

        assert_eq!(executor.cache_size().await, 0);

        let _ = executor
            .execute_decide(&state, &command, DECIDE_PROGRAM)
            .await;

        assert_eq!(executor.cache_size().await, 1);
    }

    // clear_cache() empties the cache after a program was stored.
    #[tokio::test]
    async fn clear_cache_works() {
        let executor = RuntimeExecutor::new();
        let tenant_id = TenantId::new("test-tenant");
        let state = json!({ "balance": 100 });
        let command = make_command("deposit", 50, &tenant_id);

        let _ = executor
            .execute_decide(&state, &command, DECIDE_PROGRAM)
            .await;
        assert!(executor.cache_size().await > 0);

        executor.clear_cache().await;
        assert_eq!(executor.cache_size().await, 0);
    }

    // Compile-time check: the executor can be shared across tasks.
    #[test]
    fn executor_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<RuntimeExecutor>();
    }
}
|
||||
259
aggregate/src/server/health.rs
Normal file
259
aggregate/src/server/health.rs
Normal file
@@ -0,0 +1,259 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::RwLock;
|
||||
use std::time::Instant;
|
||||
|
||||
/// Tri-state service health, serialized on health endpoints.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum HealthStatus {
    /// Every tracked dependency is operational.
    Healthy,
    /// Some dependencies are impaired but the service can still serve.
    Degraded { issues: Vec<String> },
    /// The service cannot operate; `reasons` lists each failure.
    Unhealthy { reasons: Vec<String> },
}
|
||||
|
||||
impl HealthStatus {
    /// True only for the `Healthy` variant.
    pub fn is_healthy(&self) -> bool {
        matches!(self, Self::Healthy)
    }

    /// True only for the `Degraded` variant.
    pub fn is_degraded(&self) -> bool {
        matches!(self, Self::Degraded { .. })
    }

    /// True only for the `Unhealthy` variant.
    pub fn is_unhealthy(&self) -> bool {
        matches!(self, Self::Unhealthy { .. })
    }
}
|
||||
|
||||
/// Snapshot of one dependency's health, as recorded by `HealthChecker`.
#[derive(Debug, Clone)]
pub struct ComponentHealth {
    /// Component identifier (e.g. "storage", "stream").
    pub name: String,
    /// Status at the time of the last check.
    pub status: HealthStatus,
    /// When this record was created (monotonic clock).
    pub last_check: Instant,
    /// Free-form key/value diagnostics.
    pub details: HashMap<String, String>,
}
|
||||
|
||||
impl ComponentHealth {
|
||||
pub fn healthy(name: impl Into<String>) -> Self {
|
||||
Self {
|
||||
name: name.into(),
|
||||
status: HealthStatus::Healthy,
|
||||
last_check: Instant::now(),
|
||||
details: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn degraded(name: impl Into<String>, issues: Vec<String>) -> Self {
|
||||
Self {
|
||||
name: name.into(),
|
||||
status: HealthStatus::Degraded { issues },
|
||||
last_check: Instant::now(),
|
||||
details: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn unhealthy(name: impl Into<String>, reasons: Vec<String>) -> Self {
|
||||
Self {
|
||||
name: name.into(),
|
||||
status: HealthStatus::Unhealthy { reasons },
|
||||
last_check: Instant::now(),
|
||||
details: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_detail(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
|
||||
self.details.insert(key.into(), value.into());
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Lock-light health tracker: two atomic flags for the hot-path checks
/// plus a locked map of per-component detail records.
pub struct HealthChecker {
    // Storage (database) connectivity flag.
    storage_healthy: AtomicBool,
    // Stream (message bus) connectivity flag.
    stream_healthy: AtomicBool,
    // Detailed per-component records, updated on every set_* call.
    components: RwLock<HashMap<String, ComponentHealth>>,
}
|
||||
|
||||
impl HealthChecker {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
storage_healthy: AtomicBool::new(true),
|
||||
stream_healthy: AtomicBool::new(true),
|
||||
components: RwLock::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn storage_healthy(&self) -> bool {
|
||||
self.storage_healthy.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub fn stream_healthy(&self) -> bool {
|
||||
self.stream_healthy.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub fn set_storage_healthy(&self, healthy: bool) {
|
||||
self.storage_healthy.store(healthy, Ordering::Relaxed);
|
||||
self.update_component(
|
||||
"storage",
|
||||
healthy,
|
||||
if healthy { "connected" } else { "disconnected" },
|
||||
);
|
||||
}
|
||||
|
||||
pub fn set_stream_healthy(&self, healthy: bool) {
|
||||
self.stream_healthy.store(healthy, Ordering::Relaxed);
|
||||
self.update_component(
|
||||
"stream",
|
||||
healthy,
|
||||
if healthy { "connected" } else { "disconnected" },
|
||||
);
|
||||
}
|
||||
|
||||
fn update_component(&self, name: &str, healthy: bool, status: &str) {
|
||||
let mut components = self.components.write().unwrap();
|
||||
let health = if healthy {
|
||||
ComponentHealth::healthy(name).with_detail("status", status)
|
||||
} else {
|
||||
ComponentHealth::unhealthy(name, vec![format!("status: {}", status)])
|
||||
};
|
||||
components.insert(name.to_string(), health);
|
||||
}
|
||||
|
||||
pub fn check(&self) -> HealthStatus {
|
||||
let storage = self.storage_healthy.load(Ordering::Relaxed);
|
||||
let stream = self.stream_healthy.load(Ordering::Relaxed);
|
||||
|
||||
match (storage, stream) {
|
||||
(true, true) => HealthStatus::Healthy,
|
||||
(true, false) | (false, true) => {
|
||||
let mut issues = Vec::new();
|
||||
if !storage {
|
||||
issues.push("storage disconnected".to_string());
|
||||
}
|
||||
if !stream {
|
||||
issues.push("stream disconnected".to_string());
|
||||
}
|
||||
HealthStatus::Degraded { issues }
|
||||
}
|
||||
(false, false) => HealthStatus::Unhealthy {
|
||||
reasons: vec![
|
||||
"storage disconnected".to_string(),
|
||||
"stream disconnected".to_string(),
|
||||
],
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_ready(&self) -> bool {
|
||||
let status = self.check();
|
||||
status.is_healthy() || status.is_degraded()
|
||||
}
|
||||
|
||||
pub fn is_live(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
pub fn components(&self) -> HashMap<String, ComponentHealth> {
|
||||
self.components.read().unwrap().clone()
|
||||
}
|
||||
}
|
||||
|
||||
/// `Default` delegates to [`HealthChecker::new`] (both flags healthy).
impl Default for HealthChecker {
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Each variant's predicate is true only for its own variant.
    #[test]
    fn health_status_checks() {
        let healthy = HealthStatus::Healthy;
        assert!(healthy.is_healthy());
        assert!(!healthy.is_degraded());
        assert!(!healthy.is_unhealthy());

        let degraded = HealthStatus::Degraded {
            issues: vec!["test".to_string()],
        };
        assert!(!degraded.is_healthy());
        assert!(degraded.is_degraded());
        assert!(!degraded.is_unhealthy());

        let unhealthy = HealthStatus::Unhealthy {
            reasons: vec!["test".to_string()],
        };
        assert!(!unhealthy.is_healthy());
        assert!(!unhealthy.is_degraded());
        assert!(unhealthy.is_unhealthy());
    }

    // Builder constructors set the matching status variant.
    #[test]
    fn component_health_builders() {
        let healthy = ComponentHealth::healthy("storage");
        assert_eq!(healthy.name, "storage");
        assert!(healthy.status.is_healthy());

        let degraded = ComponentHealth::degraded("stream", vec!["slow".to_string()]);
        assert!(degraded.status.is_degraded());

        let unhealthy = ComponentHealth::unhealthy("db", vec!["down".to_string()]);
        assert!(unhealthy.status.is_unhealthy());
    }

    // Flags default to healthy.
    #[test]
    fn health_checker_starts_healthy() {
        let checker = HealthChecker::new();
        assert!(checker.check().is_healthy());
    }

    // One failing dependency degrades (does not kill) the service.
    #[test]
    fn health_checker_storage_failure() {
        let checker = HealthChecker::new();
        checker.set_storage_healthy(false);

        let status = checker.check();
        assert!(status.is_degraded());
    }

    // Both dependencies down means unhealthy.
    #[test]
    fn health_checker_all_failures() {
        let checker = HealthChecker::new();
        checker.set_storage_healthy(false);
        checker.set_stream_healthy(false);

        let status = checker.check();
        assert!(status.is_unhealthy());
    }

    // Readiness tolerates a degraded state.
    #[test]
    fn health_checker_is_ready() {
        let checker = HealthChecker::new();
        assert!(checker.is_ready());

        checker.set_storage_healthy(false);
        assert!(checker.is_ready());
    }

    // Liveness is unconditional.
    #[test]
    fn health_checker_is_live() {
        let checker = HealthChecker::new();
        assert!(checker.is_live());

        checker.set_storage_healthy(false);
        checker.set_stream_healthy(false);
        assert!(checker.is_live());
    }

    // Setters record per-component detail entries.
    #[test]
    fn health_checker_tracks_components() {
        let checker = HealthChecker::new();
        checker.set_storage_healthy(true);
        checker.set_stream_healthy(true);

        let components = checker.components();
        assert!(components.contains_key("storage"));
        assert!(components.contains_key("stream"));
    }
}
|
||||
787
aggregate/src/server/mod.rs
Normal file
787
aggregate/src/server/mod.rs
Normal file
@@ -0,0 +1,787 @@
|
||||
mod health;
|
||||
|
||||
pub use health::{HealthChecker, HealthStatus};
|
||||
|
||||
use crate::aggregate::AggregateHandler;
|
||||
use crate::observability::Observability;
|
||||
use crate::placement::{TenantPlacementManager, TenantStatus};
|
||||
use crate::types::{AggregateError, AggregateId, AggregateType, Command, Event, TenantId};
|
||||
use serde::de::DeserializeOwned;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashMap;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// An inbound command plus its transport headers, prior to conversion
/// into a domain `Command`.
#[derive(Debug, Clone)]
pub struct CommandRequest {
    /// Tenant the command targets.
    pub tenant_id: TenantId,
    /// Aggregate instance the command is routed to.
    pub aggregate_id: AggregateId,
    /// Aggregate kind (e.g. "Account").
    pub aggregate_type: AggregateType,
    /// Raw JSON command payload.
    pub payload: serde_json::Value,
    /// Transport headers (tenant, correlation, tracing).
    pub headers: HashMap<String, String>,
}
|
||||
|
||||
impl CommandRequest {
|
||||
pub fn new(
|
||||
tenant_id: TenantId,
|
||||
aggregate_id: AggregateId,
|
||||
aggregate_type: AggregateType,
|
||||
payload: serde_json::Value,
|
||||
) -> Self {
|
||||
Self {
|
||||
tenant_id,
|
||||
aggregate_id,
|
||||
aggregate_type,
|
||||
payload,
|
||||
headers: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_header(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
|
||||
self.headers.insert(key.into(), value.into());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn into_command(self) -> Command {
|
||||
let mut cmd = Command::new(
|
||||
self.tenant_id,
|
||||
self.aggregate_id,
|
||||
self.aggregate_type,
|
||||
self.payload,
|
||||
);
|
||||
if let Some(correlation_id) = self
|
||||
.headers
|
||||
.get("x-correlation-id")
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| !s.is_empty())
|
||||
{
|
||||
cmd.metadata.insert(
|
||||
"correlation_id".to_string(),
|
||||
serde_json::Value::String(correlation_id.to_string()),
|
||||
);
|
||||
}
|
||||
if let Some(traceparent) = self
|
||||
.headers
|
||||
.get("traceparent")
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| !s.is_empty())
|
||||
{
|
||||
cmd.metadata.insert(
|
||||
"traceparent".to_string(),
|
||||
serde_json::Value::String(traceparent.to_string()),
|
||||
);
|
||||
}
|
||||
cmd
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of a successfully handled command: the events it produced.
#[derive(Debug, Clone)]
pub struct CommandResponse {
    /// Tenant the command was executed for.
    pub tenant_id: TenantId,
    /// Aggregate the events belong to.
    pub aggregate_id: AggregateId,
    /// Events emitted by the aggregate's decide step.
    pub events: Vec<Event>,
}
|
||||
|
||||
/// Command-server tunables.
#[derive(Debug, Clone)]
pub struct ServerConfig {
    /// Logical service name used in diagnostics.
    pub service_name: String,
    /// When true, tenant ids are checked against the allowed charset.
    pub validate_tenant_id: bool,
}
|
||||
|
||||
/// Defaults: service name "aggregate", tenant-id validation enabled.
impl Default for ServerConfig {
    fn default() -> Self {
        Self {
            service_name: "aggregate".to_string(),
            validate_tenant_id: true,
        }
    }
}
|
||||
|
||||
/// Front door for commands: validates, traces, and dispatches requests to
/// the aggregate handler, and exposes health/metrics accessors.
pub struct CommandServer {
    // Executes the decide/apply lifecycle for commands.
    handler: AggregateHandler,
    // Spans and metrics sink, shared via Arc.
    observability: Arc<Observability>,
    // Tracks storage/stream connectivity for health endpoints.
    health_checker: HealthChecker,
    // Server tunables (see ServerConfig).
    config: ServerConfig,
}
|
||||
|
||||
impl CommandServer {
    /// Builds a server with default config and a fresh health checker.
    pub fn new(handler: AggregateHandler, observability: Observability) -> Self {
        Self {
            handler,
            observability: Arc::new(observability),
            health_checker: HealthChecker::new(),
            config: ServerConfig::default(),
        }
    }

    /// Replaces the config (builder style).
    pub fn with_config(mut self, config: ServerConfig) -> Self {
        self.config = config;
        self
    }

    /// Reads the tenant id from the "x-tenant-id" header; missing header
    /// yields the default (empty) tenant id.
    pub fn extract_tenant_id(&self, headers: &HashMap<String, String>) -> TenantId {
        headers
            .get("x-tenant-id")
            .map(TenantId::new)
            .unwrap_or_default()
    }

    /// Validates the tenant id's charset when validation is enabled.
    /// Empty ids pass (absence is handled elsewhere). Note: `is_alphanumeric`
    /// is Unicode-aware, so non-ASCII letters are accepted — presumably
    /// intentional; confirm if ASCII-only ids are required.
    pub fn validate_tenant_id(&self, tenant_id: &TenantId) -> Result<(), ServerError> {
        if !self.config.validate_tenant_id {
            return Ok(());
        }

        let id = tenant_id.as_str();
        if id.is_empty() {
            return Ok(());
        }

        if !id
            .chars()
            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
        {
            return Err(ServerError::InvalidTenantId {
                tenant_id: tenant_id.clone(),
                reason:
                    "tenant_id must contain only alphanumeric characters, hyphens, and underscores"
                        .to_string(),
            });
        }

        Ok(())
    }

    /// Validates, opens a tracing span, dispatches to the aggregate
    /// handler, and records success/failure on the span before returning.
    pub async fn handle(&self, request: CommandRequest) -> Result<CommandResponse, ServerError> {
        // Cloned up front: `request` is consumed by into_command() below.
        let tenant_id = request.tenant_id.clone();
        let aggregate_id = request.aggregate_id.clone();
        let aggregate_type = request.aggregate_type.clone();

        self.validate_tenant_id(&tenant_id)?;

        // Blank/whitespace-only tracing headers are treated as absent.
        let correlation_id = request
            .headers
            .get("x-correlation-id")
            .map(|s| s.trim())
            .filter(|s| !s.is_empty())
            .map(|s| s.to_string());
        let trace_id = request
            .headers
            .get("traceparent")
            .map(|s| s.trim())
            .filter(|s| !s.is_empty())
            .and_then(trace_id_from_traceparent);

        // Span must be opened before the command is consumed, so both the
        // success and error paths can record against it.
        let span = self.observability.start_command_span(
            &aggregate_id.to_string(),
            aggregate_type.as_str(),
            tenant_id.as_str(),
            "cmd",
            correlation_id.as_deref(),
            trace_id.as_deref(),
        );

        let command = request.into_command();

        match self.handler.handle_command(command).await {
            Ok(events) => {
                self.observability
                    .record_command_success(&span, events.len());
                Ok(CommandResponse {
                    tenant_id,
                    aggregate_id,
                    events,
                })
            }
            Err(e) => {
                self.observability.record_command_error(&span, &e);
                Err(e.into())
            }
        }
    }

    /// String-typed entry point: resolves the tenant id (explicit argument
    /// wins over the "x-tenant-id" header), parses the aggregate id, then
    /// delegates to [`handle`].
    ///
    /// # Errors
    /// `InvalidAggregateId` when `aggregate_id` fails to parse, plus any
    /// error from [`handle`].
    pub async fn handle_raw(
        &self,
        tenant_id: Option<&str>,
        aggregate_id: &str,
        aggregate_type: &str,
        payload: serde_json::Value,
        headers: HashMap<String, String>,
    ) -> Result<CommandResponse, ServerError> {
        let resolved_tenant_id = tenant_id
            .map(TenantId::new)
            .unwrap_or_else(|| self.extract_tenant_id(&headers));

        let request = CommandRequest::new(
            resolved_tenant_id,
            AggregateId::from_str(aggregate_id).map_err(|e| ServerError::InvalidAggregateId {
                id: aggregate_id.to_string(),
                reason: e.to_string(),
            })?,
            AggregateType::from(aggregate_type),
            payload,
        )
        .with_headers(headers);

        self.handle(request).await
    }

    /// Current aggregated health status.
    pub async fn health_check(&self) -> HealthStatus {
        self.health_checker.check()
    }

    /// Readiness (healthy or degraded).
    pub async fn ready_check(&self) -> bool {
        self.health_checker.is_ready()
    }

    /// Exported metrics in the observability layer's text format.
    pub fn metrics(&self) -> String {
        self.observability.export_metrics()
    }

    /// Access to the underlying health checker (for wiring probes).
    pub fn health_checker(&self) -> &HealthChecker {
        &self.health_checker
    }

    /// Shared observability handle.
    pub fn observability(&self) -> &Arc<Observability> {
        &self.observability
    }
}
|
||||
|
||||
/// JSON body returned by the admin "/health" endpoint.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthReport {
    /// Aggregated status from the health checker.
    pub status: HealthStatus,
    /// Stream (NATS) connectivity flag.
    pub nats_connected: bool,
    /// Storage connectivity flag.
    pub storage_connected: bool,
    /// Count of hosted tenants at report time.
    pub active_aggregates: usize,
}
|
||||
|
||||
/// Per-tenant summary exposed by admin endpoints.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TenantInfo {
    /// The tenant this row describes.
    pub tenant_id: TenantId,
    /// Number of aggregates hosted for the tenant.
    pub aggregate_count: usize,
    /// Timestamp of the tenant's most recent activity (UTC).
    pub last_activity: chrono::DateTime<chrono::Utc>,
}
|
||||
|
||||
/// Minimal in-process HTTP-like response used by the admin server.
#[derive(Debug, Clone)]
pub struct AdminResponse {
    // HTTP-style status code (200, 404, ...).
    status: u16,
    // Response body, already serialized.
    body: String,
}
|
||||
|
||||
impl AdminResponse {
    /// Status wrapper for success checks.
    pub fn status(&self) -> AdminStatus {
        AdminStatus { code: self.status }
    }

    /// Body as plain text. Async to mirror an HTTP client API.
    pub async fn text(&self) -> String {
        self.body.clone()
    }

    /// Body deserialized as `T`.
    /// NOTE(review): panics on malformed JSON (`unwrap`) — acceptable for
    /// admin/test usage, but not for untrusted input.
    pub async fn json<T: DeserializeOwned>(&self) -> T {
        serde_json::from_str(&self.body).unwrap()
    }
}
|
||||
|
||||
/// Wrapper around a numeric status code.
#[derive(Debug, Clone)]
pub struct AdminStatus {
    // HTTP-style status code.
    code: u16,
}
|
||||
|
||||
impl AdminStatus {
|
||||
pub fn is_success(&self) -> bool {
|
||||
(200..300).contains(&self.code)
|
||||
}
|
||||
}
|
||||
|
||||
/// Minimal consistent-hash ring mapping tenant ids to node names.
///
/// Each node occupies `replicas` pseudo-random points on a `u64` ring so
/// keys spread across nodes and only a fraction re-route when a node
/// joins or leaves. Hashing uses `DefaultHasher`, which is stable within
/// one process but not across Rust releases — fine for in-memory routing.
#[derive(Debug, Clone)]
pub struct HashRing {
    replicas: usize,
    ring: BTreeMap<u64, String>,
}

impl HashRing {
    /// Creates an empty ring; `replicas` is clamped to at least 1.
    pub fn new(replicas: usize) -> Self {
        Self {
            replicas: replicas.max(1),
            ring: BTreeMap::new(),
        }
    }

    /// Inserts `replicas` virtual points for `node`.
    pub fn add_node(&mut self, node: impl Into<String>) {
        let name = node.into();
        for replica in 0..self.replicas {
            let point = Self::hash(&(name.as_str(), replica));
            self.ring.insert(point, name.clone());
        }
    }

    /// Removes every virtual point owned by `node`.
    pub fn remove_node(&mut self, node: &str) {
        self.ring.retain(|_, owner| owner != node);
    }

    /// Routes a tenant to the first node at or after its hash point,
    /// wrapping to the ring start; `None` when the ring is empty.
    pub fn route(&self, tenant_id: &str) -> Option<&str> {
        if self.ring.is_empty() {
            return None;
        }

        let point = Self::hash(&tenant_id);
        self.ring
            .range(point..)
            .next()
            .or_else(|| self.ring.iter().next())
            .map(|(_, owner)| owner.as_str())
    }

    /// Hashes any hashable value to a point on the ring.
    fn hash<T: Hash>(value: &T) -> u64 {
        let mut hasher = DefaultHasher::new();
        value.hash(&mut hasher);
        hasher.finish()
    }
}
|
||||
|
||||
/// In-process admin surface: health/metrics reads plus tenant placement
/// mutations (drain/reload) for one shard.
pub struct AdminServer {
    // Metrics and span sink, shared with the placement manager.
    observability: Arc<Observability>,
    // Connectivity flags backing /health and /ready.
    health_checker: Arc<HealthChecker>,
    // Identifier of the shard this server administers.
    shard_id: String,
    // Tenant placement state (hosted tenants, drain, placement map).
    placement: Arc<TenantPlacementManager>,
}
|
||||
|
||||
impl AdminServer {
    /// Builds an admin server; the placement manager shares the same
    /// observability handle.
    pub fn new(
        observability: Observability,
        health_checker: HealthChecker,
        shard_id: String,
    ) -> Self {
        let observability = Arc::new(observability);
        let placement = Arc::new(TenantPlacementManager::new(observability.clone()));
        Self {
            observability,
            health_checker: Arc::new(health_checker),
            shard_id,
            placement,
        }
    }

    /// Test fixture: healthy checker, one recorded command, one hosted
    /// tenant ("test-tenant") on shard "test-shard".
    #[cfg(test)]
    pub async fn new_test() -> Self {
        let health = HealthChecker::new();
        health.set_storage_healthy(true);
        health.set_stream_healthy(true);

        let server = Self::new(Observability::default(), health, "test-shard".to_string());
        // Seed one successful command so metrics output is non-trivial.
        let span = server.observability.start_command_span(
            "agg-1",
            "Account",
            "test-tenant",
            "cmd-1",
            None,
            None,
        );
        server.observability.record_command_success(&span, 1);
        server
            .placement
            .set_hosted_tenants(vec!["test-tenant".to_string()])
            .await;
        server
    }

    /// Shared placement manager handle.
    pub fn placement_manager(&self) -> Arc<TenantPlacementManager> {
        self.placement.clone()
    }

    /// Shared observability handle.
    pub fn observability(&self) -> Arc<Observability> {
        self.observability.clone()
    }

    /// Underlying health checker (for wiring probes).
    pub fn health_checker(&self) -> &HealthChecker {
        &self.health_checker
    }

    /// Dispatches GET-style admin routes; unknown paths return 404.
    pub async fn get(&self, path: &str) -> AdminResponse {
        match path {
            "/health" => {
                let report = self.health_report().await;
                AdminResponse {
                    status: 200,
                    body: serde_json::to_string(&report).unwrap(),
                }
            }
            "/ready" => AdminResponse {
                status: 200,
                body: serde_json::to_string(&self.health_checker.is_ready()).unwrap(),
            },
            "/metrics" => AdminResponse {
                status: 200,
                body: self.observability.export_metrics(),
            },
            "/admin/tenants" => {
                let list: Vec<TenantStatus> = self.placement.all_statuses().await;
                AdminResponse {
                    status: 200,
                    body: serde_json::to_string(&list).unwrap(),
                }
            }
            _ => AdminResponse {
                status: 404,
                body: "not found".to_string(),
            },
        }
    }

    /// Dispatches POST-style admin routes; unknown paths return 404.
    /// Both known routes return 200 even when the body carries none of the
    /// expected fields (no-op success).
    pub async fn post(&self, path: &str, body: serde_json::Value) -> AdminResponse {
        match path {
            "/admin/drain" => {
                // Drain the named tenant and block until drained.
                if let Some(tenant_id) = body.get("tenant_id").and_then(|v| v.as_str()) {
                    let tenant_id = TenantId::new(tenant_id);
                    self.placement.drain_tenant(&tenant_id).await;
                    self.placement.wait_drained(&tenant_id).await;
                }
                AdminResponse {
                    status: 200,
                    body: "{}".to_string(),
                }
            }
            "/admin/reload" => {
                // Replace the hosted-tenant list when provided.
                if let Some(arr) = body.get("hosted_tenants").and_then(|v| v.as_array()) {
                    let tenants = arr
                        .iter()
                        .filter_map(|v| v.as_str().map(|s| s.to_string()))
                        .collect::<Vec<_>>();
                    self.placement.set_hosted_tenants(tenants).await;
                }

                // Apply a tenant -> shard placement map when provided;
                // non-string values are silently skipped.
                if let Some(map) = body.get("placement").and_then(|v| v.as_object()) {
                    let placement = map
                        .iter()
                        .filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_string())))
                        .collect::<HashMap<_, _>>();
                    self.placement
                        .apply_placement_map(&self.shard_id, &placement)
                        .await;
                }

                AdminResponse {
                    status: 200,
                    body: "{}".to_string(),
                }
            }
            _ => AdminResponse {
                status: 404,
                body: "not found".to_string(),
            },
        }
    }

    /// Tenants currently hosted on this shard.
    pub async fn get_hosted_tenants(&self) -> Vec<TenantId> {
        self.placement.hosted_tenants().await
    }

    /// Builds the /health response body from checker flags and placement.
    async fn health_report(&self) -> HealthReport {
        let active_aggregates = self.placement.hosted_tenants().await.len();
        HealthReport {
            status: self.health_checker.check(),
            nats_connected: self.health_checker.stream_healthy(),
            storage_connected: self.health_checker.storage_healthy(),
            active_aggregates,
        }
    }
}
|
||||
|
||||
/// Errors surfaced by the command server.
#[derive(Debug, Clone)]
pub enum ServerError {
    /// Tenant id failed charset validation.
    InvalidTenantId { tenant_id: TenantId, reason: String },
    /// Aggregate id string could not be parsed.
    InvalidAggregateId { id: String, reason: String },
    /// Propagated failure from the aggregate layer.
    AggregateError(AggregateError),
}
|
||||
|
||||
impl std::fmt::Display for ServerError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::InvalidTenantId { tenant_id, reason } => {
|
||||
write!(f, "invalid tenant_id '{}': {}", tenant_id.as_str(), reason)
|
||||
}
|
||||
Self::InvalidAggregateId { id, reason } => {
|
||||
write!(f, "invalid aggregate_id '{}': {}", id, reason)
|
||||
}
|
||||
Self::AggregateError(e) => write!(f, "{}", e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Marker impl: ServerError exposes no source() chain.
impl std::error::Error for ServerError {}
|
||||
|
||||
/// Lets `?` lift aggregate-layer failures into `ServerError`.
impl From<AggregateError> for ServerError {
    fn from(e: AggregateError) -> Self {
        Self::AggregateError(e)
    }
}
|
||||
|
||||
impl CommandRequest {
    /// Replaces the entire header map (builder style); any headers set
    /// earlier via `with_header` are discarded.
    pub fn with_headers(mut self, headers: HashMap<String, String>) -> Self {
        self.headers = headers;
        self
    }
}
|
||||
|
||||
/// Extracts the trace-id portion of a W3C `traceparent` header, delegating
/// the parsing to the shared crate; returns `None` for malformed input.
fn trace_id_from_traceparent(traceparent: &str) -> Option<String> {
    shared::trace_id_from_traceparent(traceparent).map(|s| s.to_string())
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Server unit tests: request builder, tenant-id extraction/validation,
    //! admin HTTP endpoints (via the `AdminServer::new_test` harness), and
    //! consistent-hash-ring routing properties.
    use super::*;

    #[test]
    fn command_request_builder() {
        let req = CommandRequest::new(
            TenantId::new("tenant-a"),
            AggregateId::new_v7(),
            AggregateType::from("Account"),
            serde_json::json!({"type": "deposit", "amount": 100}),
        )
        .with_header("x-request-id", "req-123");

        assert_eq!(req.tenant_id.as_str(), "tenant-a");
        assert_eq!(
            req.headers.get("x-request-id"),
            Some(&"req-123".to_string())
        );
    }

    #[test]
    fn extract_tenant_id_from_header() {
        let _config = ServerConfig::default();
        let mut headers = HashMap::new();
        headers.insert("x-tenant-id".to_string(), "acme-corp".to_string());

        let tenant_id = extract_tenant_id_static(&headers);
        assert_eq!(tenant_id.as_str(), "acme-corp");
    }

    #[test]
    fn extract_tenant_id_defaults_empty() {
        let headers = HashMap::new();

        // Missing header falls back to TenantId::default() (empty).
        let tenant_id = extract_tenant_id_static(&headers);
        assert!(tenant_id.as_str().is_empty());
    }

    #[test]
    fn validate_tenant_id_accepts_valid() {
        assert!(validate_tenant_id_static(&TenantId::new("acme-corp")).is_ok());
        assert!(validate_tenant_id_static(&TenantId::new("tenant_123")).is_ok());
        assert!(validate_tenant_id_static(&TenantId::new("my-tenant")).is_ok());
    }

    #[test]
    fn validate_tenant_id_rejects_invalid() {
        assert!(validate_tenant_id_static(&TenantId::new("tenant@corp")).is_err());
        assert!(validate_tenant_id_static(&TenantId::new("tenant name")).is_err());
    }

    #[test]
    fn server_config_defaults() {
        let config = ServerConfig::default();
        assert_eq!(config.service_name, "aggregate");
        assert!(config.validate_tenant_id);
    }

    #[test]
    fn server_error_display() {
        let err = ServerError::InvalidTenantId {
            tenant_id: TenantId::new("bad@id"),
            reason: "invalid characters".to_string(),
        };
        assert!(err.to_string().contains("bad@id"));
    }

    // Standalone copy of the header-extraction logic used by the tests above.
    fn extract_tenant_id_static(headers: &HashMap<String, String>) -> TenantId {
        headers
            .get("x-tenant-id")
            .map(TenantId::new)
            .unwrap_or_default()
    }

    // Standalone copy of tenant-id validation: empty is allowed (treated as
    // "no tenant"); otherwise only alphanumerics, '-' and '_' are accepted.
    fn validate_tenant_id_static(tenant_id: &TenantId) -> Result<(), ServerError> {
        let id = tenant_id.as_str();
        if id.is_empty() {
            return Ok(());
        }

        if !id
            .chars()
            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
        {
            return Err(ServerError::InvalidTenantId {
                tenant_id: tenant_id.clone(),
                reason:
                    "tenant_id must contain only alphanumeric characters, hyphens, and underscores"
                        .to_string(),
            });
        }

        Ok(())
    }

    #[tokio::test]
    async fn admin_health_endpoint_returns_status() {
        let server = AdminServer::new_test().await;
        let resp = server.get("/health").await;
        assert!(resp.status().is_success());

        let health: HealthReport = resp.json().await;
        assert!(health.nats_connected);
        assert!(health.storage_connected);
    }

    #[tokio::test]
    async fn admin_ready_endpoint_returns_success() {
        let server = AdminServer::new_test().await;
        let resp = server.get("/ready").await;
        assert!(resp.status().is_success());
    }

    #[tokio::test]
    async fn admin_metrics_endpoint_prometheus_format() {
        let server = AdminServer::new_test().await;
        let resp = server.get("/metrics").await;
        let body = resp.text().await;
        assert!(body.contains("aggregate_commands_total"));
        assert!(body.contains("tenant_id"));
    }

    #[tokio::test]
    async fn admin_tenants_list_returns_hosted_tenants() {
        let server = AdminServer::new_test().await;
        let resp = server.get("/admin/tenants").await;
        let tenants: Vec<TenantStatus> = resp.json().await;
        assert!(tenants
            .iter()
            .any(|t| t.tenant_id == TenantId::new("test-tenant")));
    }

    #[tokio::test]
    async fn admin_drain_waits_for_in_flight_commands() {
        use std::time::{Duration, Instant};

        let server = AdminServer::new_test().await;
        let tenant_id = TenantId::new("test-tenant");
        // Hold an in-flight command guard so the drain has something to wait on.
        let guard = server
            .placement_manager()
            .begin_command(&tenant_id)
            .await
            .unwrap();

        // Release the guard shortly after the drain request is issued.
        tokio::spawn(async move {
            tokio::time::sleep(Duration::from_millis(50)).await;
            drop(guard);
        });

        let start = Instant::now();
        let resp = server
            .post(
                "/admin/drain",
                serde_json::json!({"tenant_id": "test-tenant"}),
            )
            .await;

        // Drain must complete promptly once the guard is dropped.
        assert!(start.elapsed() < Duration::from_secs(5));
        assert!(resp.status().is_success());

        server.placement_manager().wait_drained(&tenant_id).await;
        let status = server.placement_manager().tenant_status(&tenant_id).await;
        assert!(status.draining);
        assert!(!status.accepting);
        assert_eq!(status.in_flight, 0);
    }

    #[tokio::test]
    async fn admin_config_reload_updates_routing() {
        let server = AdminServer::new_test().await;
        let resp = server
            .post(
                "/admin/reload",
                serde_json::json!({"hosted_tenants": ["new-tenant"]}),
            )
            .await;
        assert!(resp.status().is_success());

        let tenants = server.get_hosted_tenants().await;
        assert!(tenants.contains(&TenantId::new("new-tenant")));
    }

    #[test]
    fn admin_server_is_send() {
        // Compile-time check that AdminServer can cross task boundaries.
        fn assert_send<T: Send>() {}
        assert_send::<AdminServer>();
    }

    #[test]
    fn hash_ring_routes_deterministically() {
        let mut ring = HashRing::new(100);
        ring.add_node("node-a");
        ring.add_node("node-b");
        ring.add_node("node-c");

        // Same key must always route to the same node.
        let r1 = ring.route("tenant-a").unwrap().to_string();
        let r2 = ring.route("tenant-a").unwrap().to_string();
        assert_eq!(r1, r2);
    }

    #[test]
    fn hash_ring_distributes_tenants_evenly() {
        let mut ring = HashRing::new(200);
        ring.add_node("node-a");
        ring.add_node("node-b");
        ring.add_node("node-c");

        let mut counts: HashMap<String, usize> = HashMap::new();
        for i in 0..3000 {
            let tenant = format!("tenant-{}", i);
            let node = ring.route(&tenant).unwrap().to_string();
            *counts.entry(node).or_insert(0) += 1;
        }

        // Each node should receive within 25% of the ideal share.
        let avg = 3000.0 / 3.0;
        for c in counts.values() {
            let diff = (*c as f64 - avg).abs() / avg;
            assert!(diff < 0.25);
        }
    }

    #[test]
    fn hash_ring_rebalances_on_node_add() {
        let mut ring = HashRing::new(200);
        ring.add_node("node-a");
        ring.add_node("node-b");

        let mut before: HashMap<String, String> = HashMap::new();
        for i in 0..2000 {
            let tenant = format!("tenant-{}", i);
            before.insert(tenant.clone(), ring.route(&tenant).unwrap().to_string());
        }

        ring.add_node("node-c");

        // Consistent hashing: some keys move to the new node, but not all.
        let mut moved = 0usize;
        for (tenant, old) in before {
            let new = ring.route(&tenant).unwrap();
            if new != old {
                moved += 1;
            }
        }

        assert!(moved > 0);
        assert!(moved < 2000);
    }
}
|
||||
216
aggregate/src/storage/circuit_breaker.rs
Normal file
216
aggregate/src/storage/circuit_breaker.rs
Normal file
@@ -0,0 +1,216 @@
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
/// Observable breaker state. `Open` rejects calls; `HalfOpen` means the reset
/// timeout elapsed and trial calls are permitted (derived lazily in
/// `CircuitBreaker::state`, never stored).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CircuitState {
    Closed,
    Open,
    HalfOpen,
}
|
||||
|
||||
/// Circuit breaker guarding storage calls.
/// NOTE(review): near-duplicate of `stream::circuit_breaker::CircuitBreaker`
/// (which additionally tracks consecutive successes) — consider sharing one
/// implementation.
#[derive(Debug, Clone)]
pub struct CircuitBreaker {
    // Stored state is only ever Closed or Open; HalfOpen is derived.
    state: CircuitState,
    // Consecutive failures observed while Closed.
    failure_count: u32,
    // Failures that trip the breaker Open.
    failure_threshold: u32,
    // How long to stay Open before allowing half-open trials.
    reset_timeout: Duration,
    // Timestamp of the most recent failure; None once fully Closed again.
    last_failure_time: Option<Instant>,
    // Successful trial calls observed while HalfOpen.
    half_open_successes: u32,
    // Trial successes required to close the breaker again.
    half_open_threshold: u32,
}
|
||||
|
||||
impl CircuitBreaker {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
state: CircuitState::Closed,
|
||||
failure_count: 0,
|
||||
failure_threshold: 5,
|
||||
reset_timeout: Duration::from_secs(30),
|
||||
last_failure_time: None,
|
||||
half_open_successes: 0,
|
||||
half_open_threshold: 3,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_failure_threshold(mut self, threshold: u32) -> Self {
|
||||
self.failure_threshold = threshold;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_reset_timeout(mut self, timeout: Duration) -> Self {
|
||||
self.reset_timeout = timeout;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_half_open_threshold(mut self, threshold: u32) -> Self {
|
||||
self.half_open_threshold = threshold;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn state(&self) -> CircuitState {
|
||||
if self.state == CircuitState::Open {
|
||||
if let Some(last_failure) = self.last_failure_time {
|
||||
if last_failure.elapsed() >= self.reset_timeout {
|
||||
return CircuitState::HalfOpen;
|
||||
}
|
||||
}
|
||||
}
|
||||
self.state
|
||||
}
|
||||
|
||||
pub fn is_open(&self) -> bool {
|
||||
matches!(self.state(), CircuitState::Open)
|
||||
}
|
||||
|
||||
pub fn is_closed(&self) -> bool {
|
||||
matches!(self.state(), CircuitState::Closed)
|
||||
}
|
||||
|
||||
pub fn record_success(&mut self) {
|
||||
match self.state() {
|
||||
CircuitState::Closed => {
|
||||
self.failure_count = 0;
|
||||
}
|
||||
CircuitState::HalfOpen => {
|
||||
self.half_open_successes += 1;
|
||||
if self.half_open_successes >= self.half_open_threshold {
|
||||
self.state = CircuitState::Closed;
|
||||
self.failure_count = 0;
|
||||
self.half_open_successes = 0;
|
||||
self.last_failure_time = None;
|
||||
}
|
||||
}
|
||||
CircuitState::Open => {}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn record_failure(&mut self) {
|
||||
self.last_failure_time = Some(Instant::now());
|
||||
|
||||
match self.state() {
|
||||
CircuitState::Closed => {
|
||||
self.failure_count += 1;
|
||||
if self.failure_count >= self.failure_threshold {
|
||||
self.state = CircuitState::Open;
|
||||
}
|
||||
}
|
||||
CircuitState::HalfOpen => {
|
||||
self.state = CircuitState::Open;
|
||||
self.half_open_successes = 0;
|
||||
}
|
||||
CircuitState::Open => {}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn reset(&mut self) {
|
||||
self.state = CircuitState::Closed;
|
||||
self.failure_count = 0;
|
||||
self.last_failure_time = None;
|
||||
self.half_open_successes = 0;
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for CircuitBreaker {
    /// Same as `CircuitBreaker::new()` (5 failures / 30s / 3 trial successes).
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! State-machine tests for the storage circuit breaker. Timing tests use
    //! real sleeps with generous margins (10ms timeout, 15ms sleep).
    use super::*;
    use std::thread::sleep;

    #[test]
    fn circuit_breaker_starts_closed() {
        let cb = CircuitBreaker::new();
        assert!(cb.is_closed());
        assert!(!cb.is_open());
    }

    #[test]
    fn circuit_breaker_opens_after_threshold() {
        let mut cb = CircuitBreaker::new().with_failure_threshold(3);

        cb.record_failure();
        assert!(cb.is_closed());

        cb.record_failure();
        assert!(cb.is_closed());

        // Third failure reaches the threshold and trips the breaker.
        cb.record_failure();
        assert!(cb.is_open());
    }

    #[test]
    fn circuit_breaker_resets_after_timeout() {
        let mut cb = CircuitBreaker::new()
            .with_failure_threshold(1)
            .with_reset_timeout(Duration::from_millis(10));

        cb.record_failure();
        assert!(cb.is_open());

        sleep(Duration::from_millis(15));
        assert_eq!(cb.state(), CircuitState::HalfOpen);
    }

    #[test]
    fn circuit_breaker_closes_after_half_open_successes() {
        let mut cb = CircuitBreaker::new()
            .with_failure_threshold(1)
            .with_reset_timeout(Duration::from_millis(10))
            .with_half_open_threshold(2);

        cb.record_failure();
        assert!(cb.is_open());

        sleep(Duration::from_millis(15));
        assert_eq!(cb.state(), CircuitState::HalfOpen);

        cb.record_success();
        assert_eq!(cb.state(), CircuitState::HalfOpen);

        // Second trial success reaches the half-open threshold.
        cb.record_success();
        assert!(cb.is_closed());
    }

    #[test]
    fn circuit_breaker_reopens_on_half_open_failure() {
        let mut cb = CircuitBreaker::new()
            .with_failure_threshold(1)
            .with_reset_timeout(Duration::from_millis(10));

        cb.record_failure();
        assert!(cb.is_open());

        sleep(Duration::from_millis(15));
        assert_eq!(cb.state(), CircuitState::HalfOpen);

        cb.record_failure();
        assert!(cb.is_open());
    }

    #[test]
    fn circuit_breaker_success_resets_failure_count() {
        let mut cb = CircuitBreaker::new().with_failure_threshold(3);

        cb.record_failure();
        cb.record_failure();
        // Success while Closed clears the accumulated failure count.
        cb.record_success();
        assert!(cb.is_closed());

        cb.record_failure();
        assert!(cb.is_closed());
    }

    #[test]
    fn circuit_breaker_manual_reset() {
        let mut cb = CircuitBreaker::new().with_failure_threshold(1);

        cb.record_failure();
        assert!(cb.is_open());

        cb.reset();
        assert!(cb.is_closed());
    }
}
|
||||
422
aggregate/src/storage/mod.rs
Normal file
422
aggregate/src/storage/mod.rs
Normal file
@@ -0,0 +1,422 @@
|
||||
mod circuit_breaker;
|
||||
|
||||
pub use circuit_breaker::CircuitBreaker;
|
||||
|
||||
use crate::types::{AggregateError, AggregateId, AggregateType, Snapshot, TenantId, Version};
|
||||
use edge_storage::{AggregateStore, Config as EdgeConfig, EdgeStorage, WriteResult, Writer};
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
/// Client for aggregate snapshot persistence, backed by the `edge_storage`
/// crate and guarded by a per-client circuit breaker.
pub struct StorageClient {
    // Underlying storage engine handle; shared across clones.
    storage: Arc<EdgeStorage>,
    // Snapshot read/write API over the same DB handle.
    aggregate_store: AggregateStore,
    // Write pipeline shared with the aggregate store.
    writer: Arc<Writer>,
    // Per-client breaker (NOT shared across clones — see Clone impl below).
    circuit_breaker: RwLock<CircuitBreaker>,
}
|
||||
|
||||
impl std::fmt::Debug for StorageClient {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("StorageClient")
|
||||
.field("circuit_breaker", &self.circuit_breaker)
|
||||
.finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
|
||||
impl StorageClient {
    /// Opens (or creates) the storage engine at `storage_path` and wires up
    /// the snapshot store, writer, and a default circuit breaker.
    pub fn open(storage_path: impl Into<String>) -> Result<Self, StorageInitError> {
        let config = EdgeConfig::new(storage_path.into());
        let storage = EdgeStorage::open(config)?;
        // NOTE(review): the Writer is built from EdgeConfig::default(), not the
        // path-specific `config` above — confirm that is intentional.
        let writer = Arc::new(Writer::new(storage.db().clone(), &EdgeConfig::default()));
        let aggregate_store = AggregateStore::new(storage.db().clone(), writer.clone());

        Ok(Self {
            storage: Arc::new(storage),
            aggregate_store,
            writer,
            circuit_breaker: RwLock::new(CircuitBreaker::new()),
        })
    }

    /// Test helper: opens storage in a temp directory. The directory handle is
    /// deliberately leaked via `mem::forget` so it is not deleted while the
    /// client is alive; the OS reclaims it eventually. Test-only by design.
    #[cfg(test)]
    pub fn in_memory() -> Self {
        use tempfile::tempdir;
        let dir = tempdir().expect("failed to create temp dir");
        let path = dir.path().join("test.mdbx");
        std::mem::forget(dir);
        Self::open(path.to_string_lossy().to_string()).expect("failed to open in-memory storage")
    }

    /// Builder: replaces the default circuit breaker (state is reset).
    pub fn with_circuit_breaker(mut self, cb: CircuitBreaker) -> Self {
        self.circuit_breaker = RwLock::new(cb);
        self
    }

    /// Fetches the most recent snapshot for the aggregate, or `None` if the
    /// aggregate has never been snapshotted. Fails fast if the breaker is open.
    pub async fn get_snapshot(
        &self,
        tenant_id: &TenantId,
        aggregate_id: &AggregateId,
    ) -> Result<Option<Snapshot>, AggregateError> {
        self.check_circuit().await?;

        let key = Self::build_key(tenant_id, aggregate_id);

        match self.aggregate_store.get_latest_snapshot(&key) {
            Ok(Some((version, data))) => {
                // Decode failures are surfaced as StorageError but do NOT
                // count against the breaker (the engine itself succeeded).
                let snapshot = self
                    .decode_snapshot(tenant_id, aggregate_id, version, &data)
                    .map_err(|e| AggregateError::StorageError(e.to_string()))?;
                self.record_success().await;
                Ok(Some(snapshot))
            }
            Ok(None) => {
                self.record_success().await;
                Ok(None)
            }
            Err(e) => {
                self.record_failure().await;
                Err(AggregateError::StorageError(e.to_string()))
            }
        }
    }

    /// Fetches the snapshot stored at an exact version, if any.
    /// Same breaker accounting as `get_snapshot`.
    pub async fn get_snapshot_at_version(
        &self,
        tenant_id: &TenantId,
        aggregate_id: &AggregateId,
        version: Version,
    ) -> Result<Option<Snapshot>, AggregateError> {
        self.check_circuit().await?;

        let key = Self::build_key(tenant_id, aggregate_id);

        match self.aggregate_store.get_snapshot(&key, version.as_u64()) {
            Ok(Some(data)) => {
                let snapshot = self
                    .decode_snapshot(tenant_id, aggregate_id, version.as_u64(), &data)
                    .map_err(|e| AggregateError::StorageError(e.to_string()))?;
                self.record_success().await;
                Ok(Some(snapshot))
            }
            Ok(None) => {
                self.record_success().await;
                Ok(None)
            }
            Err(e) => {
                self.record_failure().await;
                Err(AggregateError::StorageError(e.to_string()))
            }
        }
    }

    /// Persists a snapshot at its version, with optimistic concurrency:
    /// a duplicate version yields `AggregateError::VersionConflict`.
    pub async fn put_snapshot(&self, snapshot: &Snapshot) -> Result<(), AggregateError> {
        self.check_circuit().await?;

        let key = Self::build_key(&snapshot.tenant_id, &snapshot.aggregate_id);
        // Only `state` is serialized; tenant/aggregate/type live in the key
        // (and the type is NOT persisted — see decode_snapshot).
        let data = serde_json::to_vec(&snapshot.state)
            .map_err(|e| AggregateError::StorageError(e.to_string()))?;

        let result = self
            .aggregate_store
            .put_snapshot_sync(&key, snapshot.version.as_u64(), &data)
            .map_err(|e| {
                // Sync context inside map_err — best-effort breaker update.
                self.record_failure_sync();
                AggregateError::StorageError(e.to_string())
            })?;

        match result {
            WriteResult::Success => {
                self.record_success().await;
                Ok(())
            }
            WriteResult::VersionConflict {
                aggregate_id: _,
                version,
            } => {
                // A conflict means the engine is healthy (caller raced), so it
                // counts as a breaker success, not a failure.
                self.record_success().await;
                Err(AggregateError::VersionConflict {
                    expected: Version::from(version).increment(),
                    actual: Version::from(version),
                })
            }
            WriteResult::Error(e) => {
                self.record_failure().await;
                Err(AggregateError::StorageError(e))
            }
        }
    }

    /// Returns the highest snapshotted version for the aggregate, if any.
    pub async fn get_latest_version(
        &self,
        tenant_id: &TenantId,
        aggregate_id: &AggregateId,
    ) -> Result<Option<Version>, AggregateError> {
        self.check_circuit().await?;

        let key = Self::build_key(tenant_id, aggregate_id);

        match self.aggregate_store.get_latest_version(&key) {
            Ok(Some(v)) => {
                self.record_success().await;
                Ok(Some(Version::from(v)))
            }
            Ok(None) => {
                self.record_success().await;
                Ok(None)
            }
            Err(e) => {
                self.record_failure().await;
                Err(AggregateError::StorageError(e.to_string()))
            }
        }
    }

    /// Always fails: snapshots are never deleted in this event-sourced model.
    /// Kept as an API stub so callers get an explicit error.
    pub async fn delete_snapshot(
        &self,
        _tenant_id: &TenantId,
        _aggregate_id: &AggregateId,
    ) -> Result<(), AggregateError> {
        self.check_circuit().await?;

        self.record_success().await;
        Err(AggregateError::StorageError(
            "Snapshot deletion not supported in event-sourced system".to_string(),
        ))
    }

    // Composite key "tenant:aggregate" — tenant isolation relies on this
    // prefix (see storage_client_isolation test below).
    fn build_key(tenant_id: &TenantId, aggregate_id: &AggregateId) -> Vec<u8> {
        format!("{}:{}", tenant_id.as_str(), aggregate_id).into_bytes()
    }

    // Rebuilds a Snapshot from raw stored state. The aggregate type is not
    // persisted alongside the state, so it is reconstructed as "Unknown".
    // NOTE(review): callers comparing aggregate_type on loaded snapshots will
    // always see "Unknown" — confirm nothing depends on the real type here.
    fn decode_snapshot(
        &self,
        tenant_id: &TenantId,
        aggregate_id: &AggregateId,
        version: u64,
        data: &[u8],
    ) -> Result<Snapshot, serde_json::Error> {
        let state = serde_json::from_slice(data)?;
        Ok(Snapshot::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            AggregateType::new("Unknown"),
            Version::from(version),
            state,
        ))
    }

    // Fail-fast guard: rejects the call outright while the breaker is open.
    async fn check_circuit(&self) -> Result<(), AggregateError> {
        let cb = self.circuit_breaker.read().await;
        if cb.is_open() {
            return Err(AggregateError::StorageError(
                "Circuit breaker is open".to_string(),
            ));
        }
        Ok(())
    }

    async fn record_success(&self) {
        let mut cb = self.circuit_breaker.write().await;
        cb.record_success();
    }

    // Best-effort failure recording from sync contexts. NOTE(review): if the
    // lock is contended, try_write fails and the failure is silently dropped,
    // so the breaker may undercount failures on this path.
    fn record_failure_sync(&self) {
        if let Ok(mut cb) = self.circuit_breaker.try_write() {
            cb.record_failure();
        }
    }

    async fn record_failure(&self) {
        let mut cb = self.circuit_breaker.write().await;
        cb.record_failure();
    }

    /// Direct access to the underlying storage engine handle.
    pub fn storage(&self) -> &Arc<EdgeStorage> {
        &self.storage
    }
}
|
||||
|
||||
impl Clone for StorageClient {
    /// Manual clone because `RwLock<CircuitBreaker>` is not `Clone`.
    /// The storage handles are shared (Arc), but the clone starts with a
    /// FRESH, closed circuit breaker — accumulated failure state does not
    /// carry over. NOTE(review): confirm this is intentional; wrapping the
    /// breaker in `Arc<RwLock<...>>` would instead propagate trips to clones.
    fn clone(&self) -> Self {
        Self {
            storage: self.storage.clone(),
            aggregate_store: self.aggregate_store.clone(),
            writer: self.writer.clone(),
            circuit_breaker: RwLock::new(CircuitBreaker::new()),
        }
    }
}
|
||||
|
||||
/// Errors raised while opening the storage engine (see `StorageClient::open`).
#[derive(Debug, thiserror::Error)]
pub enum StorageInitError {
    /// The underlying `edge_storage` engine failed to open.
    #[error("Failed to open storage: {0}")]
    OpenError(#[from] edge_storage::Error),
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! StorageClient integration tests against a real temp-dir database:
    //! round-trips, version conflicts, latest-version tracking, and tenant
    //! key isolation.
    use super::*;
    use serde_json::json;
    use tempfile::tempdir;

    // Returns the TempDir alongside the client so the directory outlives the
    // test body (dropping it would delete the database under the client).
    fn create_test_client() -> (tempfile::TempDir, StorageClient) {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.mdbx");
        let client = StorageClient::open(path.to_string_lossy().to_string()).unwrap();
        (dir, client)
    }

    #[test]
    fn storage_client_open() {
        let (_dir, _client) = create_test_client();
    }

    #[tokio::test]
    async fn storage_client_put_get_snapshot() {
        let (_dir, client) = create_test_client();

        let tenant_id = TenantId::new("acme-corp");
        let aggregate_id = AggregateId::new_v7();
        let snapshot = Snapshot::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            AggregateType::new("Account"),
            Version::from(1),
            json!({"balance": 100}),
        );

        client.put_snapshot(&snapshot).await.unwrap();

        let retrieved = client
            .get_snapshot(&tenant_id, &aggregate_id)
            .await
            .unwrap();

        assert!(retrieved.is_some());
        let retrieved = retrieved.unwrap();
        assert_eq!(retrieved.version, Version::from(1));
        assert_eq!(retrieved.state, json!({"balance": 100}));
    }

    #[tokio::test]
    async fn storage_client_version_conflict() {
        let (_dir, client) = create_test_client();

        let tenant_id = TenantId::new("acme-corp");
        let aggregate_id = AggregateId::new_v7();

        let snapshot_v1 = Snapshot::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            AggregateType::new("Account"),
            Version::from(1),
            json!({"balance": 100}),
        );

        client.put_snapshot(&snapshot_v1).await.unwrap();

        // Writing the SAME version again must be rejected as a conflict.
        let snapshot_v1_again = Snapshot::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            AggregateType::new("Account"),
            Version::from(1),
            json!({"balance": 200}),
        );

        let result = client.put_snapshot(&snapshot_v1_again).await;
        assert!(matches!(
            result,
            Err(AggregateError::VersionConflict { .. })
        ));
    }

    #[tokio::test]
    async fn storage_client_latest_version() {
        let (_dir, client) = create_test_client();

        let tenant_id = TenantId::new("acme-corp");
        let aggregate_id = AggregateId::new_v7();

        // No snapshots yet.
        let version = client
            .get_latest_version(&tenant_id, &aggregate_id)
            .await
            .unwrap();
        assert!(version.is_none());

        let snapshot_v1 = Snapshot::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            AggregateType::new("Account"),
            Version::from(1),
            json!({"balance": 100}),
        );
        client.put_snapshot(&snapshot_v1).await.unwrap();

        let version = client
            .get_latest_version(&tenant_id, &aggregate_id)
            .await
            .unwrap();
        assert_eq!(version, Some(Version::from(1)));

        // Versions need not be contiguous: 1 -> 3 is accepted.
        let snapshot_v3 = Snapshot::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            AggregateType::new("Account"),
            Version::from(3),
            json!({"balance": 300}),
        );
        client.put_snapshot(&snapshot_v3).await.unwrap();

        let version = client
            .get_latest_version(&tenant_id, &aggregate_id)
            .await
            .unwrap();
        assert_eq!(version, Some(Version::from(3)));
    }

    #[tokio::test]
    async fn storage_client_isolation() {
        let (_dir, client) = create_test_client();

        // Same aggregate id under two tenants must not collide (the key is
        // prefixed with the tenant — see build_key).
        let tenant_a = TenantId::new("tenant-a");
        let tenant_b = TenantId::new("tenant-b");
        let aggregate_id = AggregateId::new_v7();

        let snapshot_a = Snapshot::new(
            tenant_a.clone(),
            aggregate_id.clone(),
            AggregateType::new("Account"),
            Version::from(1),
            json!({"owner": "A"}),
        );
        client.put_snapshot(&snapshot_a).await.unwrap();

        let snapshot_b = Snapshot::new(
            tenant_b.clone(),
            aggregate_id.clone(),
            AggregateType::new("Account"),
            Version::from(1),
            json!({"owner": "B"}),
        );
        client.put_snapshot(&snapshot_b).await.unwrap();

        let retrieved_a = client
            .get_snapshot(&tenant_a, &aggregate_id)
            .await
            .unwrap()
            .unwrap();
        let retrieved_b = client
            .get_snapshot(&tenant_b, &aggregate_id)
            .await
            .unwrap()
            .unwrap();

        assert_eq!(retrieved_a.state["owner"], "A");
        assert_eq!(retrieved_b.state["owner"], "B");
    }

    #[test]
    fn storage_client_is_send_sync() {
        // Compile-time check for cross-task/thread sharing.
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<StorageClient>();
    }
}
|
||||
284
aggregate/src/stream/circuit_breaker.rs
Normal file
284
aggregate/src/stream/circuit_breaker.rs
Normal file
@@ -0,0 +1,284 @@
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
/// Observable breaker state for the stream module. `Open` rejects calls;
/// `HalfOpen` means the reset timeout elapsed and trial calls are permitted
/// (derived lazily in `CircuitBreaker::state`, never stored).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CircuitState {
    Closed,
    Open,
    HalfOpen,
}
|
||||
|
||||
/// Circuit breaker for stream operations.
/// NOTE(review): near-duplicate of `storage::circuit_breaker::CircuitBreaker`
/// plus consecutive-success tracking — consider sharing one implementation.
#[derive(Debug, Clone)]
pub struct CircuitBreaker {
    // Stored state is only ever Closed or Open; HalfOpen is derived.
    state: CircuitState,
    // Consecutive failures observed while Closed.
    failure_count: u32,
    // Failures that trip the breaker Open.
    failure_threshold: u32,
    // How long to stay Open before allowing half-open trials.
    reset_timeout: Duration,
    // Timestamp of the most recent failure; None once fully Closed again.
    last_failure_time: Option<Instant>,
    // Successful trial calls observed while HalfOpen.
    half_open_successes: u32,
    // Trial successes required to close the breaker again.
    half_open_threshold: u32,
    // Successes since the last failure, across all states (metrics aid).
    consecutive_successes: u32,
}
|
||||
|
||||
impl CircuitBreaker {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
state: CircuitState::Closed,
|
||||
failure_count: 0,
|
||||
failure_threshold: 5,
|
||||
reset_timeout: Duration::from_secs(30),
|
||||
last_failure_time: None,
|
||||
half_open_successes: 0,
|
||||
half_open_threshold: 3,
|
||||
consecutive_successes: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_failure_threshold(mut self, threshold: u32) -> Self {
|
||||
self.failure_threshold = threshold;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_reset_timeout(mut self, timeout: Duration) -> Self {
|
||||
self.reset_timeout = timeout;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_half_open_threshold(mut self, threshold: u32) -> Self {
|
||||
self.half_open_threshold = threshold;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn state(&self) -> CircuitState {
|
||||
if self.state == CircuitState::Open {
|
||||
if let Some(last_failure) = self.last_failure_time {
|
||||
if last_failure.elapsed() >= self.reset_timeout {
|
||||
return CircuitState::HalfOpen;
|
||||
}
|
||||
}
|
||||
}
|
||||
self.state
|
||||
}
|
||||
|
||||
pub fn is_open(&self) -> bool {
|
||||
matches!(self.state(), CircuitState::Open)
|
||||
}
|
||||
|
||||
pub fn is_closed(&self) -> bool {
|
||||
matches!(self.state(), CircuitState::Closed)
|
||||
}
|
||||
|
||||
pub fn is_half_open(&self) -> bool {
|
||||
matches!(self.state(), CircuitState::HalfOpen)
|
||||
}
|
||||
|
||||
pub fn failure_count(&self) -> u32 {
|
||||
self.failure_count
|
||||
}
|
||||
|
||||
pub fn consecutive_successes(&self) -> u32 {
|
||||
self.consecutive_successes
|
||||
}
|
||||
|
||||
pub fn record_success(&mut self) {
|
||||
self.consecutive_successes += 1;
|
||||
|
||||
match self.state() {
|
||||
CircuitState::Closed => {
|
||||
self.failure_count = 0;
|
||||
}
|
||||
CircuitState::HalfOpen => {
|
||||
self.half_open_successes += 1;
|
||||
if self.half_open_successes >= self.half_open_threshold {
|
||||
self.state = CircuitState::Closed;
|
||||
self.failure_count = 0;
|
||||
self.half_open_successes = 0;
|
||||
self.last_failure_time = None;
|
||||
}
|
||||
}
|
||||
CircuitState::Open => {}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn record_failure(&mut self) {
|
||||
self.consecutive_successes = 0;
|
||||
self.last_failure_time = Some(Instant::now());
|
||||
|
||||
match self.state() {
|
||||
CircuitState::Closed => {
|
||||
self.failure_count += 1;
|
||||
if self.failure_count >= self.failure_threshold {
|
||||
self.state = CircuitState::Open;
|
||||
}
|
||||
}
|
||||
CircuitState::HalfOpen => {
|
||||
self.state = CircuitState::Open;
|
||||
self.half_open_successes = 0;
|
||||
}
|
||||
CircuitState::Open => {}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn reset(&mut self) {
|
||||
self.state = CircuitState::Closed;
|
||||
self.failure_count = 0;
|
||||
self.last_failure_time = None;
|
||||
self.half_open_successes = 0;
|
||||
self.consecutive_successes = 0;
|
||||
}
|
||||
|
||||
pub fn time_until_reset(&self) -> Option<Duration> {
|
||||
if self.state == CircuitState::Open {
|
||||
self.last_failure_time.map(|t| {
|
||||
let elapsed = t.elapsed();
|
||||
if elapsed < self.reset_timeout {
|
||||
self.reset_timeout - elapsed
|
||||
} else {
|
||||
Duration::ZERO
|
||||
}
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for CircuitBreaker {
    /// Same as `CircuitBreaker::new()` (5 failures / 30s / 3 trial successes).
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::thread::sleep;
|
||||
|
||||
#[test]
|
||||
fn circuit_breaker_starts_closed() {
|
||||
let cb = CircuitBreaker::new();
|
||||
assert!(cb.is_closed());
|
||||
assert!(!cb.is_open());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn circuit_breaker_opens_after_threshold() {
|
||||
let mut cb = CircuitBreaker::new().with_failure_threshold(3);
|
||||
|
||||
cb.record_failure();
|
||||
assert!(cb.is_closed());
|
||||
|
||||
cb.record_failure();
|
||||
assert!(cb.is_closed());
|
||||
|
||||
cb.record_failure();
|
||||
assert!(cb.is_open());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn circuit_breaker_resets_after_timeout() {
|
||||
let mut cb = CircuitBreaker::new()
|
||||
.with_failure_threshold(1)
|
||||
.with_reset_timeout(Duration::from_millis(10));
|
||||
|
||||
cb.record_failure();
|
||||
assert!(cb.is_open());
|
||||
|
||||
sleep(Duration::from_millis(15));
|
||||
assert!(cb.is_half_open());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn circuit_breaker_closes_after_half_open_successes() {
|
||||
let mut cb = CircuitBreaker::new()
|
||||
.with_failure_threshold(1)
|
||||
.with_reset_timeout(Duration::from_millis(10))
|
||||
.with_half_open_threshold(2);
|
||||
|
||||
cb.record_failure();
|
||||
assert!(cb.is_open());
|
||||
|
||||
sleep(Duration::from_millis(15));
|
||||
assert!(cb.is_half_open());
|
||||
|
||||
cb.record_success();
|
||||
assert!(cb.is_half_open());
|
||||
|
||||
cb.record_success();
|
||||
assert!(cb.is_closed());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn circuit_breaker_reopens_on_half_open_failure() {
|
||||
let mut cb = CircuitBreaker::new()
|
||||
.with_failure_threshold(1)
|
||||
.with_reset_timeout(Duration::from_millis(10));
|
||||
|
||||
cb.record_failure();
|
||||
assert!(cb.is_open());
|
||||
|
||||
sleep(Duration::from_millis(15));
|
||||
assert!(cb.is_half_open());
|
||||
|
||||
cb.record_failure();
|
||||
assert!(cb.is_open());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn circuit_breaker_success_resets_failure_count() {
|
||||
let mut cb = CircuitBreaker::new().with_failure_threshold(3);
|
||||
|
||||
cb.record_failure();
|
||||
cb.record_failure();
|
||||
cb.record_success();
|
||||
assert!(cb.is_closed());
|
||||
assert_eq!(cb.failure_count(), 0);
|
||||
|
||||
cb.record_failure();
|
||||
assert!(cb.is_closed());
|
||||
}
|
||||
|
||||
#[test]
fn circuit_breaker_manual_reset() {
    // `reset()` force-closes the breaker and zeroes the failure count,
    // regardless of the timeout machinery.
    let mut cb = CircuitBreaker::new().with_failure_threshold(1);

    cb.record_failure();
    assert!(cb.is_open());

    cb.reset();
    assert!(cb.is_closed());
    assert_eq!(cb.failure_count(), 0);
}
|
||||
|
||||
#[test]
fn circuit_breaker_tracks_consecutive_successes() {
    // The consecutive-success counter increments per success and is
    // zeroed by any failure.
    let mut cb = CircuitBreaker::new();

    assert_eq!(cb.consecutive_successes(), 0);

    cb.record_success();
    assert_eq!(cb.consecutive_successes(), 1);

    cb.record_success();
    assert_eq!(cb.consecutive_successes(), 2);

    // A failure breaks the streak.
    cb.record_failure();
    assert_eq!(cb.consecutive_successes(), 0);
}
|
||||
|
||||
#[test]
fn circuit_breaker_time_until_reset() {
    // `time_until_reset()` is `Some(remaining)` only while the breaker is
    // open; it is `None` when closed (including after a manual reset).
    let mut cb = CircuitBreaker::new()
        .with_failure_threshold(1)
        .with_reset_timeout(Duration::from_millis(100));

    // Closed breaker: no pending reset.
    assert!(cb.time_until_reset().is_none());

    cb.record_failure();
    let remaining = cb.time_until_reset();
    assert!(remaining.is_some());
    // The clock has already started, so remaining <= the full timeout.
    assert!(remaining.unwrap() <= Duration::from_millis(100));

    cb.reset();
    assert!(cb.time_until_reset().is_none());
}
|
||||
}
|
||||
627
aggregate/src/stream/mod.rs
Normal file
627
aggregate/src/stream/mod.rs
Normal file
@@ -0,0 +1,627 @@
|
||||
mod circuit_breaker;
|
||||
|
||||
pub use circuit_breaker::CircuitBreaker;
|
||||
|
||||
use crate::types::{AggregateError, AggregateId, AggregateType, Event, TenantId, Version};
|
||||
use async_nats::jetstream::{
|
||||
self, consumer::pull::Config as PullConfig, consumer::AckPolicy, consumer::DeliverPolicy,
|
||||
consumer::ReplayPolicy, stream::Config as StreamConfig,
|
||||
};
|
||||
use futures::stream::{Stream, StreamExt};
|
||||
use serde_json;
|
||||
use std::collections::HashMap;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::sync::RwLock;
|
||||
use tokio::time::Instant;
|
||||
|
||||
const AGGREGATE_STREAM_NAME: &str = "AGGREGATE_EVENTS";
|
||||
|
||||
/// Retention and deduplication limits applied when creating (or adopting)
/// the aggregate-events JetStream stream.
#[derive(Debug)]
pub struct StreamConfigSettings {
    /// Maximum number of messages retained (-1 in JetStream means unlimited).
    pub max_messages: i64,
    /// Maximum total stream size in bytes (-1 means unlimited).
    pub max_bytes: i64,
    /// Maximum message age before JetStream discards it.
    pub max_age: Duration,
    /// Window within which `Nats-Msg-Id` duplicates are rejected.
    pub duplicate_window: Duration,
}
|
||||
|
||||
impl Default for StreamConfigSettings {
    /// Defaults: 10M messages, unlimited bytes, one-year retention,
    /// and a two-minute dedup window for publisher retries.
    fn default() -> Self {
        Self {
            max_messages: 10_000_000,
            // -1 = no byte cap.
            max_bytes: -1,
            // 365 days expressed in seconds.
            max_age: Duration::from_secs(365 * 24 * 60 * 60),
            duplicate_window: Duration::from_secs(120),
        }
    }
}
|
||||
|
||||
/// Client facade over the event stream. Cloning is cheap: the circuit
/// breaker is shared behind an `Arc<RwLock<…>>` and the backend handles
/// are themselves clonable.
#[derive(Debug, Clone)]
pub struct StreamClient {
    // Real JetStream connection or an in-memory test double.
    backend: StreamBackend,
    // Shared breaker guarding publish/fetch against a failing backend.
    circuit_breaker: Arc<RwLock<CircuitBreaker>>,
}
|
||||
|
||||
/// The two interchangeable transports behind `StreamClient`.
#[derive(Debug, Clone)]
#[allow(dead_code)] // InMemory is only constructed under #[cfg(test)].
enum StreamBackend {
    /// Production path: NATS JetStream context.
    JetStream(jetstream::Context),
    /// Test path: process-local store with broadcast fan-out.
    InMemory(Arc<InMemoryStream>),
}
|
||||
|
||||
/// In-process stand-in for JetStream used by tests.
#[derive(Debug)]
struct InMemoryStream {
    // Events keyed by (tenant_id, aggregate_id), kept sorted by version.
    events_by_tenant_aggregate: RwLock<HashMap<(String, String), Vec<Event>>>,
    // Live fan-out channel backing `subscribe_to_events`.
    updates: broadcast::Sender<Event>,
}
|
||||
|
||||
impl StreamClient {
|
||||
pub async fn new(nats_url: impl Into<String>) -> Result<Self, AggregateError> {
|
||||
let url = nats_url.into();
|
||||
let client = async_nats::connect(&url).await.map_err(|e| {
|
||||
AggregateError::StreamError(format!("Failed to connect to NATS: {}", e))
|
||||
})?;
|
||||
|
||||
let jetstream = jetstream::new(client.clone());
|
||||
|
||||
Ok(Self {
|
||||
backend: StreamBackend::JetStream(jetstream),
|
||||
circuit_breaker: Arc::new(RwLock::new(CircuitBreaker::new())),
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn in_memory() -> Self {
|
||||
let (updates, _) = broadcast::channel(1024);
|
||||
Self {
|
||||
backend: StreamBackend::InMemory(Arc::new(InMemoryStream {
|
||||
events_by_tenant_aggregate: RwLock::new(HashMap::new()),
|
||||
updates,
|
||||
})),
|
||||
circuit_breaker: Arc::new(RwLock::new(CircuitBreaker::new())),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn with_circuit_breaker(mut self, cb: CircuitBreaker) -> Self {
|
||||
self.circuit_breaker = Arc::new(RwLock::new(cb));
|
||||
self
|
||||
}
|
||||
|
||||
pub async fn setup_stream(&self) -> Result<jetstream::stream::Stream, AggregateError> {
|
||||
self.setup_stream_with_settings(StreamConfigSettings::default())
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn setup_stream_with_settings(
|
||||
&self,
|
||||
settings: StreamConfigSettings,
|
||||
) -> Result<jetstream::stream::Stream, AggregateError> {
|
||||
let jetstream = match &self.backend {
|
||||
StreamBackend::JetStream(ctx) => ctx.clone(),
|
||||
StreamBackend::InMemory(_) => {
|
||||
return Err(AggregateError::StreamError(
|
||||
"setup_stream not supported for in-memory stream".to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let config = StreamConfig {
|
||||
name: AGGREGATE_STREAM_NAME.to_string(),
|
||||
subjects: vec!["tenant.*.aggregate.*.*".to_string()],
|
||||
max_messages: settings.max_messages,
|
||||
max_bytes: settings.max_bytes,
|
||||
max_age: settings.max_age,
|
||||
duplicate_window: settings.duplicate_window,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let stream = jetstream
|
||||
.get_or_create_stream(config)
|
||||
.await
|
||||
.map_err(|e| AggregateError::StreamError(format!("Failed to create stream: {}", e)))?;
|
||||
|
||||
Ok(stream)
|
||||
}
|
||||
|
||||
pub async fn publish_events(&self, events: Vec<Event>) -> Result<(), AggregateError> {
|
||||
if events.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if self.circuit_breaker.read().await.is_open() {
|
||||
return Err(AggregateError::StreamError(
|
||||
"Circuit breaker is open".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
match &self.backend {
|
||||
StreamBackend::JetStream(jetstream) => {
|
||||
for event in &events {
|
||||
let subject =
|
||||
build_subject(&event.tenant_id, &event.aggregate_type, &event.aggregate_id);
|
||||
let payload = serde_json::to_vec(event).map_err(|e| {
|
||||
AggregateError::StreamError(format!("Serialization error: {}", e))
|
||||
})?;
|
||||
|
||||
let mut headers = async_nats::HeaderMap::new();
|
||||
headers.insert("Nats-Msg-Id", event.event_id.to_string().as_str());
|
||||
headers.insert("aggregate-version", event.version.to_string().as_str());
|
||||
headers.insert("tenant-id", event.tenant_id.as_str());
|
||||
headers.insert("aggregate-type", event.aggregate_type.as_str());
|
||||
headers.insert("event-type", event.event_type.as_str());
|
||||
if let Some(correlation_id) = event.correlation_id.as_deref() {
|
||||
headers.insert("x-correlation-id", correlation_id);
|
||||
headers.insert("correlation-id", correlation_id);
|
||||
}
|
||||
if let Some(traceparent) = event.traceparent.as_deref() {
|
||||
headers.insert("traceparent", traceparent);
|
||||
if let Some(trace_id) = shared::trace_id_from_traceparent(traceparent) {
|
||||
headers.insert("trace-id", trace_id);
|
||||
}
|
||||
}
|
||||
|
||||
let result = jetstream
|
||||
.publish_with_headers(subject.clone(), headers.clone(), payload.into())
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(_) => {
|
||||
self.circuit_breaker.write().await.record_success();
|
||||
}
|
||||
Err(e) => {
|
||||
self.circuit_breaker.write().await.record_failure();
|
||||
return Err(AggregateError::StreamError(format!(
|
||||
"Failed to publish event: {}",
|
||||
e
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
StreamBackend::InMemory(mem) => {
|
||||
for event in events {
|
||||
let key = (
|
||||
event.tenant_id.as_str().to_string(),
|
||||
event.aggregate_id.to_string(),
|
||||
);
|
||||
|
||||
let mut map = mem.events_by_tenant_aggregate.write().await;
|
||||
let bucket = map.entry(key).or_default();
|
||||
|
||||
if bucket.iter().any(|e| e.command_id == event.command_id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let current_max = bucket.last().map(|e| e.version.as_u64()).unwrap_or(0);
|
||||
let expected = current_max + 1;
|
||||
if event.version.as_u64() != expected {
|
||||
return Err(AggregateError::VersionConflict {
|
||||
expected: Version::from(current_max).increment(),
|
||||
actual: event.version,
|
||||
});
|
||||
}
|
||||
|
||||
bucket.push(event.clone());
|
||||
bucket.sort_by_key(|e| e.version);
|
||||
let _ = mem.updates.send(event);
|
||||
}
|
||||
|
||||
self.circuit_breaker.write().await.record_success();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn fetch_events(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
aggregate_id: &AggregateId,
|
||||
after_version: Version,
|
||||
) -> Result<Vec<Event>, AggregateError> {
|
||||
if self.circuit_breaker.read().await.is_open() {
|
||||
return Err(AggregateError::StreamError(
|
||||
"Circuit breaker is open".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
match &self.backend {
|
||||
StreamBackend::JetStream(jetstream) => {
|
||||
let stream = jetstream
|
||||
.get_stream(AGGREGATE_STREAM_NAME)
|
||||
.await
|
||||
.map_err(|e| AggregateError::StreamError(format!("Stream not found: {}", e)))?;
|
||||
|
||||
let subject = format!("tenant.{}.aggregate.*.{}", tenant_id.as_str(), aggregate_id);
|
||||
|
||||
let consumer_name = format!(
|
||||
"fetch_{}_{}_{}",
|
||||
tenant_id.as_str(),
|
||||
aggregate_id,
|
||||
uuid::Uuid::now_v7()
|
||||
);
|
||||
|
||||
let consumer_config = PullConfig {
|
||||
durable_name: Some(consumer_name.clone()),
|
||||
filter_subject: subject.clone(),
|
||||
deliver_policy: DeliverPolicy::All,
|
||||
ack_policy: AckPolicy::Explicit,
|
||||
replay_policy: ReplayPolicy::Instant,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let consumer = stream
|
||||
.get_or_create_consumer(&consumer_name, consumer_config)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
AggregateError::StreamError(format!("Consumer creation failed: {}", e))
|
||||
})?;
|
||||
|
||||
let mut events = Vec::new();
|
||||
let mut messages = consumer.messages().await.map_err(|e| {
|
||||
AggregateError::StreamError(format!("Message stream error: {}", e))
|
||||
})?;
|
||||
|
||||
let idle_timeout = Duration::from_millis(250);
|
||||
let max_total_wait = Duration::from_secs(2);
|
||||
let started = Instant::now();
|
||||
|
||||
loop {
|
||||
if started.elapsed() >= max_total_wait {
|
||||
break;
|
||||
}
|
||||
|
||||
match tokio::time::timeout(idle_timeout, messages.next()).await {
|
||||
Ok(Some(Ok(msg))) => {
|
||||
let event: Event =
|
||||
serde_json::from_slice(&msg.payload).map_err(|e| {
|
||||
AggregateError::StreamError(format!(
|
||||
"Deserialization error: {}",
|
||||
e
|
||||
))
|
||||
})?;
|
||||
|
||||
if event.version > after_version {
|
||||
events.push(event);
|
||||
}
|
||||
|
||||
msg.ack().await.ok();
|
||||
}
|
||||
Ok(Some(Err(e))) => {
|
||||
return Err(AggregateError::StreamError(format!(
|
||||
"Message error: {}",
|
||||
e
|
||||
)));
|
||||
}
|
||||
Ok(None) => break,
|
||||
Err(_) => break,
|
||||
}
|
||||
}
|
||||
|
||||
let _ = stream.delete_consumer(&consumer_name).await;
|
||||
events.sort_by_key(|e| e.version);
|
||||
self.circuit_breaker.write().await.record_success();
|
||||
Ok(events)
|
||||
}
|
||||
StreamBackend::InMemory(mem) => {
|
||||
let key = (tenant_id.as_str().to_string(), aggregate_id.to_string());
|
||||
let map = mem.events_by_tenant_aggregate.read().await;
|
||||
let mut out = map
|
||||
.get(&key)
|
||||
.map(|bucket| {
|
||||
bucket
|
||||
.iter()
|
||||
.filter(|e| e.version > after_version)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
out.sort_by_key(|e| e.version);
|
||||
self.circuit_breaker.write().await.record_success();
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn subscribe_to_events(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
aggregate_type: AggregateType,
|
||||
aggregate_id: AggregateId,
|
||||
) -> Result<Pin<Box<dyn Stream<Item = Event> + Send>>, AggregateError> {
|
||||
match &self.backend {
|
||||
StreamBackend::JetStream(jetstream) => {
|
||||
let subject = format!(
|
||||
"tenant.{}.aggregate.{}.{}",
|
||||
tenant_id.as_str(),
|
||||
aggregate_type.as_str(),
|
||||
aggregate_id
|
||||
);
|
||||
|
||||
let stream = jetstream
|
||||
.get_stream(AGGREGATE_STREAM_NAME)
|
||||
.await
|
||||
.map_err(|e| AggregateError::StreamError(format!("Stream not found: {}", e)))?;
|
||||
|
||||
let consumer_name = format!("sub_{}_{}", tenant_id.as_str(), aggregate_id);
|
||||
let consumer_config = PullConfig {
|
||||
filter_subject: subject,
|
||||
deliver_policy: DeliverPolicy::New,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let consumer = stream
|
||||
.get_or_create_consumer(&consumer_name, consumer_config)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
AggregateError::StreamError(format!("Consumer creation failed: {}", e))
|
||||
})?;
|
||||
|
||||
let messages = consumer.messages().await.map_err(|e| {
|
||||
AggregateError::StreamError(format!("Message stream error: {}", e))
|
||||
})?;
|
||||
|
||||
let event_stream = messages.filter_map(move |msg| async move {
|
||||
match msg {
|
||||
Ok(m) => {
|
||||
let event: Result<Event, _> = serde_json::from_slice(&m.payload);
|
||||
match event {
|
||||
Ok(e) => {
|
||||
m.ack().await.ok();
|
||||
Some(e)
|
||||
}
|
||||
Err(_) => None,
|
||||
}
|
||||
}
|
||||
Err(_) => None,
|
||||
}
|
||||
});
|
||||
|
||||
let boxed: Pin<Box<dyn Stream<Item = Event> + Send>> = Box::pin(event_stream);
|
||||
Ok(boxed)
|
||||
}
|
||||
StreamBackend::InMemory(mem) => {
|
||||
let tenant_id = tenant_id.as_str().to_string();
|
||||
let aggregate_type = aggregate_type.as_str().to_string();
|
||||
let aggregate_id = aggregate_id.to_string();
|
||||
|
||||
let receiver = mem.updates.subscribe();
|
||||
let boxed: Pin<Box<dyn Stream<Item = Event> + Send>> =
|
||||
Box::pin(futures::stream::unfold(
|
||||
(receiver, tenant_id, aggregate_type, aggregate_id),
|
||||
|(mut receiver, tenant_id, aggregate_type, aggregate_id)| async move {
|
||||
loop {
|
||||
match receiver.recv().await {
|
||||
Ok(event) => {
|
||||
if event.tenant_id.as_str() == tenant_id
|
||||
&& event.aggregate_type.as_str() == aggregate_type
|
||||
&& event.aggregate_id.to_string() == aggregate_id
|
||||
{
|
||||
return Some((
|
||||
event,
|
||||
(receiver, tenant_id, aggregate_type, aggregate_id),
|
||||
));
|
||||
}
|
||||
}
|
||||
Err(broadcast::error::RecvError::Lagged(_)) => continue,
|
||||
Err(broadcast::error::RecvError::Closed) => return None,
|
||||
}
|
||||
}
|
||||
},
|
||||
));
|
||||
Ok(boxed)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_stream_info(&self) -> Result<Option<jetstream::stream::Info>, AggregateError> {
|
||||
match &self.backend {
|
||||
StreamBackend::JetStream(jetstream) => {
|
||||
match jetstream.get_stream(AGGREGATE_STREAM_NAME).await {
|
||||
Ok(mut stream) => {
|
||||
let info = stream.info().await.map_err(|e| {
|
||||
AggregateError::StreamError(format!("Stream info error: {}", e))
|
||||
})?;
|
||||
Ok(Some(info.clone()))
|
||||
}
|
||||
Err(_) => Ok(None),
|
||||
}
|
||||
}
|
||||
StreamBackend::InMemory(_) => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn health_check(&self) -> Result<bool, AggregateError> {
|
||||
match &self.backend {
|
||||
StreamBackend::JetStream(jetstream) => {
|
||||
match jetstream.get_stream(AGGREGATE_STREAM_NAME).await {
|
||||
Ok(_) => {
|
||||
self.circuit_breaker.write().await.record_success();
|
||||
Ok(true)
|
||||
}
|
||||
Err(e) => {
|
||||
self.circuit_breaker.write().await.record_failure();
|
||||
Err(AggregateError::StreamError(format!(
|
||||
"Health check failed: {}",
|
||||
e
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
StreamBackend::InMemory(_) => {
|
||||
self.circuit_breaker.write().await.record_success();
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn circuit_breaker_state(&self) -> circuit_breaker::CircuitState {
|
||||
futures::executor::block_on(async { self.circuit_breaker.read().await.state() })
|
||||
}
|
||||
|
||||
pub async fn delete_consumer(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
aggregate_id: &AggregateId,
|
||||
) -> Result<(), AggregateError> {
|
||||
let consumer_name = format!("sub_{}_{}", tenant_id.as_str(), aggregate_id);
|
||||
|
||||
match &self.backend {
|
||||
StreamBackend::JetStream(jetstream) => {
|
||||
let stream = jetstream
|
||||
.get_stream(AGGREGATE_STREAM_NAME)
|
||||
.await
|
||||
.map_err(|e| AggregateError::StreamError(format!("Stream not found: {}", e)))?;
|
||||
|
||||
stream.delete_consumer(&consumer_name).await.map_err(|e| {
|
||||
AggregateError::StreamError(format!("Consumer deletion failed: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
StreamBackend::InMemory(_) => Ok(()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build_subject(
|
||||
tenant_id: &TenantId,
|
||||
aggregate_type: &AggregateType,
|
||||
aggregate_id: &AggregateId,
|
||||
) -> String {
|
||||
format!(
|
||||
"tenant.{}.aggregate.{}.{}",
|
||||
tenant_id.as_str(),
|
||||
aggregate_type.as_str(),
|
||||
aggregate_id
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // Compile-time guarantee that the client can cross task boundaries.
    #[test]
    fn stream_client_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<StreamClient>();
    }

    // Subjects must embed the tenant so NATS-level filtering isolates tenants.
    #[test]
    fn subject_naming_includes_tenant() {
        let tenant_id = TenantId::new("acme-corp");
        let aggregate_type = AggregateType::from("Account");
        let aggregate_id = AggregateId::new_v7();

        let subject = build_subject(&tenant_id, &aggregate_type, &aggregate_id);
        assert!(subject.starts_with("tenant.acme-corp.aggregate."));
    }

    // Pin the documented default message cap.
    #[test]
    fn stream_config_settings_defaults() {
        let settings = StreamConfigSettings::default();
        assert_eq!(settings.max_messages, 10_000_000);
    }

    // A fresh breaker starts closed (runtime built manually since this is
    // a plain #[test], not #[tokio::test]).
    #[test]
    fn circuit_breaker_accessible() {
        let rt = tokio::runtime::Runtime::new().unwrap();
        rt.block_on(async {
            let cb = CircuitBreaker::new();
            assert!(cb.is_closed());
        });
    }

    // Round-trip: two published events come back from fetch_events.
    #[tokio::test]
    async fn publish_and_fetch_events_with_tenant() {
        let stream = StreamClient::in_memory();
        let tenant_id = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();
        let aggregate_type = AggregateType::from("Account");

        let e1 = Event::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            aggregate_type.clone(),
            Version::from(1),
            "deposited",
            json!({"amount": 10}),
            uuid::Uuid::now_v7(),
        );
        let e2 = Event::new(
            tenant_id.clone(),
            aggregate_id.clone(),
            aggregate_type.clone(),
            Version::from(2),
            "deposited",
            json!({"amount": 20}),
            uuid::Uuid::now_v7(),
        );

        stream.publish_events(vec![e1, e2]).await.unwrap();
        let fetched = stream
            .fetch_events(&tenant_id, &aggregate_id, Version::initial())
            .await
            .unwrap();
        assert_eq!(fetched.len(), 2);
    }

    // after_version acts as an exclusive lower bound on returned events.
    #[tokio::test]
    async fn fetch_with_version_filter() {
        let stream = StreamClient::in_memory();
        let tenant_id = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();
        let aggregate_type = AggregateType::from("Account");

        // Publish versions 1 through 4 in order.
        let mut events = Vec::new();
        for v in 1..=4 {
            events.push(Event::new(
                tenant_id.clone(),
                aggregate_id.clone(),
                aggregate_type.clone(),
                Version::from(v),
                "deposited",
                json!({"amount": v}),
                uuid::Uuid::now_v7(),
            ));
        }
        stream.publish_events(events).await.unwrap();

        // Fetch after version 2: only versions 3 and 4 remain.
        let fetched = stream
            .fetch_events(&tenant_id, &aggregate_id, Version::from(2))
            .await
            .unwrap();
        assert_eq!(fetched.len(), 2);
        assert!(fetched.iter().all(|e| e.version > Version::from(2)));
    }

    // Events published under tenant A are invisible to tenant B even for
    // the same aggregate id.
    #[tokio::test]
    async fn tenant_isolation_fetch_returns_empty() {
        let stream = StreamClient::in_memory();
        let tenant_a = TenantId::new("tenant-a");
        let tenant_b = TenantId::new("tenant-b");
        let aggregate_id = AggregateId::new_v7();
        let aggregate_type = AggregateType::from("Account");

        let e1 = Event::new(
            tenant_a.clone(),
            aggregate_id.clone(),
            aggregate_type.clone(),
            Version::from(1),
            "deposited",
            json!({"amount": 10}),
            uuid::Uuid::now_v7(),
        );

        stream.publish_events(vec![e1]).await.unwrap();
        let fetched = stream
            .fetch_events(&tenant_b, &aggregate_id, Version::initial())
            .await
            .unwrap();
        assert!(fetched.is_empty());
    }
}
|
||||
332
aggregate/src/swarm.rs
Normal file
332
aggregate/src/swarm.rs
Normal file
@@ -0,0 +1,332 @@
|
||||
use futures::StreamExt;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
|
||||
/// Root of the tenant-placement document (loaded from KV or a YAML/JSON
/// fallback file; see `TenantPlacementKvClient::load_config_with_fallback`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct TenantPlacementConfig {
    /// Virtual nodes per physical node — presumably a consistent-hash
    /// ring parameter; TODO confirm against the hashing code.
    pub virtual_nodes_per_node: usize,
    /// Physical nodes and the tenant hash range each one serves.
    pub nodes: Vec<NodePlacement>,
    /// Explicit tenant-id -> node-id pin overrides.
    pub tenants: std::collections::HashMap<String, String>,
}
|
||||
|
||||
/// One Swarm node and the tenant hash range it hosts.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct NodePlacement {
    /// Node identifier, e.g. "node-a".
    pub node_id: String,
    /// Hash range label, e.g. "00-3f"; matched against the Swarm node
    /// label `tenant_range`.
    pub tenant_range: String,
}
|
||||
|
||||
/// Renders a Docker Swarm placement constraint pinning a service to nodes
/// whose `tenant_range` label equals the given range (e.g. "00-3f").
pub fn placement_constraint_for_tenant_range(tenant_range: &str) -> String {
    ["node.labels.tenant_range == ", tenant_range].concat()
}
|
||||
|
||||
pub fn placement_constraints_for_node(node: &NodePlacement) -> Vec<String> {
|
||||
vec![placement_constraint_for_tenant_range(&node.tenant_range)]
|
||||
}
|
||||
|
||||
/// Ordered set of steps to move one tenant between nodes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MigrationPlan {
    /// Tenant being moved.
    pub tenant_id: String,
    /// Node currently hosting the tenant.
    pub from_node: String,
    /// Destination node.
    pub to_node: String,
    /// Steps in execution order (drain, repoint, reload).
    pub actions: Vec<MigrationAction>,
}
|
||||
|
||||
/// A single step inside a `MigrationPlan`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MigrationAction {
    /// Stop routing new work to the tenant on its current node.
    DrainTenant { tenant_id: String },
    /// Repoint the tenant's placement entry at the destination node.
    UpdatePlacement { tenant_id: String, node_id: String },
    /// Make running services pick up the new placement config.
    ReloadConfig,
}
|
||||
|
||||
pub fn plan_graceful_tenant_migration(
|
||||
tenant_id: impl Into<String>,
|
||||
from_node: impl Into<String>,
|
||||
to_node: impl Into<String>,
|
||||
) -> MigrationPlan {
|
||||
let tenant_id = tenant_id.into();
|
||||
let from_node = from_node.into();
|
||||
let to_node = to_node.into();
|
||||
|
||||
MigrationPlan {
|
||||
tenant_id: tenant_id.clone(),
|
||||
from_node,
|
||||
to_node: to_node.clone(),
|
||||
actions: vec![
|
||||
MigrationAction::DrainTenant {
|
||||
tenant_id: tenant_id.clone(),
|
||||
},
|
||||
MigrationAction::UpdatePlacement {
|
||||
tenant_id,
|
||||
node_id: to_node,
|
||||
},
|
||||
MigrationAction::ReloadConfig,
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// Failures from the placement KV client.
#[derive(Debug, Error)]
pub enum TenantPlacementKvError {
    /// Could not reach NATS (includes connect timeouts).
    #[error("NATS connection error: {0}")]
    Connection(String),
    /// The KV bucket operation itself failed (also wraps fallback-file
    /// I/O errors in `load_config_with_fallback`).
    #[error("KV error: {0}")]
    Kv(String),
    /// Stored bytes were not valid JSON/YAML.
    #[error("Config parse error: {0}")]
    Parse(String),
    /// A KV operation kind this client does not handle.
    #[error("Unsupported key operation")]
    UnsupportedOperation,
}
|
||||
|
||||
/// Thin wrapper around a JetStream KV bucket holding tenant-placement
/// configuration.
#[derive(Debug, Clone)]
pub struct TenantPlacementKvClient {
    // Handle to the (possibly auto-created) KV bucket.
    kv: async_nats::jetstream::kv::Store,
}
|
||||
|
||||
impl TenantPlacementKvClient {
    /// Connects with a default 2-second connect timeout.
    pub async fn connect(
        nats_url: impl Into<String>,
        bucket: impl Into<String>,
    ) -> Result<Self, TenantPlacementKvError> {
        Self::connect_with_timeout(nats_url, bucket, std::time::Duration::from_secs(2)).await
    }

    /// Connects to NATS (bounded by `timeout`) and opens the KV bucket,
    /// creating it with default settings if it does not exist.
    pub async fn connect_with_timeout(
        nats_url: impl Into<String>,
        bucket: impl Into<String>,
        timeout: std::time::Duration,
    ) -> Result<Self, TenantPlacementKvError> {
        let nats_url = nats_url.into();
        let bucket = bucket.into();

        // Outer timeout guards the TCP/NATS handshake; inner error is the
        // connect failure itself.
        let client = tokio::time::timeout(timeout, async_nats::connect(nats_url))
            .await
            .map_err(|_| TenantPlacementKvError::Connection("connect timeout".to_string()))?
            .map_err(|e| TenantPlacementKvError::Connection(e.to_string()))?;

        let jetstream = async_nats::jetstream::new(client);

        // Get-or-create: any lookup error is treated as "missing bucket".
        let kv = match jetstream.get_key_value(&bucket).await {
            Ok(kv) => kv,
            Err(_) => jetstream
                .create_key_value(async_nats::jetstream::kv::Config {
                    bucket: bucket.clone(),
                    ..Default::default()
                })
                .await
                .map_err(|e| TenantPlacementKvError::Kv(e.to_string()))?,
        };

        Ok(Self { kv })
    }

    /// Reads `key` and parses it as JSON. `Ok(None)` when the key is absent.
    pub async fn get_json(
        &self,
        key: &str,
    ) -> Result<Option<serde_json::Value>, TenantPlacementKvError> {
        let entry = self
            .kv
            .entry(key)
            .await
            .map_err(|e| TenantPlacementKvError::Kv(e.to_string()))?;

        match entry {
            Some(entry) => serde_json::from_slice::<serde_json::Value>(&entry.value)
                .map(Some)
                .map_err(|e| TenantPlacementKvError::Parse(e.to_string())),
            None => Ok(None),
        }
    }

    /// Serializes `value` as JSON and writes it under `key`.
    pub async fn put_json(
        &self,
        key: &str,
        value: &serde_json::Value,
    ) -> Result<(), TenantPlacementKvError> {
        let bytes =
            serde_json::to_vec(value).map_err(|e| TenantPlacementKvError::Parse(e.to_string()))?;
        self.kv
            .put(key, bytes.into())
            .await
            .map_err(|e| TenantPlacementKvError::Kv(e.to_string()))?;
        Ok(())
    }

    /// Watches keys matching `pattern`, yielding the JSON value of each
    /// Put. Delete/Purge operations are filtered out; parse failures are
    /// surfaced as `Parse` errors on the stream.
    pub async fn watch_json(
        &self,
        pattern: &str,
    ) -> Result<
        std::pin::Pin<
            Box<
                dyn futures::Stream<Item = Result<serde_json::Value, TenantPlacementKvError>>
                    + Send,
            >,
        >,
        TenantPlacementKvError,
    > {
        let watch = self
            .kv
            .watch(pattern)
            .await
            .map_err(|e| TenantPlacementKvError::Kv(e.to_string()))?;

        Ok(Box::pin(watch.filter_map(|entry| async move {
            match entry {
                Ok(entry) => match entry.operation {
                    async_nats::jetstream::kv::Operation::Put => {
                        match serde_json::from_slice::<serde_json::Value>(&entry.value) {
                            Ok(v) => Some(Ok(v)),
                            Err(e) => Some(Err(TenantPlacementKvError::Parse(e.to_string()))),
                        }
                    }
                    // Removals are not config updates — drop them.
                    async_nats::jetstream::kv::Operation::Delete
                    | async_nats::jetstream::kv::Operation::Purge => None,
                },
                Err(e) => Some(Err(TenantPlacementKvError::Kv(e.to_string()))),
            }
        })))
    }

    /// Loads config from KV, falling back to a local file when KV is
    /// unreachable (300ms budget) or the key is missing/unreadable.
    /// Files ending in `.json` parse as JSON; anything else as YAML
    /// converted to JSON.
    ///
    /// NOTE(review): file I/O and KV-miss errors are both reported via
    /// the `Kv` variant; the original KV error is discarded.
    pub async fn load_config_with_fallback(
        nats_url: impl Into<String>,
        bucket: impl Into<String>,
        key: &str,
        fallback_path: &str,
    ) -> Result<serde_json::Value, TenantPlacementKvError> {
        // Short timeout: config loading must not stall startup on a dead
        // NATS endpoint.
        let try_kv = match Self::connect_with_timeout(
            nats_url,
            bucket,
            std::time::Duration::from_millis(300),
        )
        .await
        {
            Ok(client) => match client.get_json(key).await {
                Ok(Some(v)) => Ok(v),
                Ok(None) => Err(TenantPlacementKvError::Kv("missing key".to_string())),
                Err(e) => Err(e),
            },
            Err(e) => Err(e),
        };

        match try_kv {
            Ok(v) => Ok(v),
            Err(_) => {
                // Any KV failure routes to the local file.
                let raw = std::fs::read_to_string(fallback_path)
                    .map_err(|e| TenantPlacementKvError::Kv(e.to_string()))?;
                if fallback_path.ends_with(".json") {
                    serde_json::from_str(&raw)
                        .map_err(|e| TenantPlacementKvError::Parse(e.to_string()))
                } else {
                    // Normalize YAML to JSON so callers see one shape.
                    let yaml: serde_yaml::Value = serde_yaml::from_str(&raw)
                        .map_err(|e| TenantPlacementKvError::Parse(e.to_string()))?;
                    let json = serde_json::to_value(yaml)
                        .map_err(|e| TenantPlacementKvError::Parse(e.to_string()))?;
                    Ok(json)
                }
            }
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use futures::StreamExt;

    // The checked-in Swarm stack file must at least parse as YAML.
    #[test]
    fn stack_file_is_valid_yaml() {
        let raw = std::fs::read_to_string("../swarm/stacks/platform.yml").unwrap();
        let _: serde_yaml::Value = serde_yaml::from_str(&raw).unwrap();
    }

    // The stack must declare the three core services.
    #[test]
    fn stack_services_count() {
        let raw = std::fs::read_to_string("../swarm/stacks/platform.yml").unwrap();
        let doc: serde_yaml::Value = serde_yaml::from_str(&raw).unwrap();
        let services = doc.get("services").and_then(|v| v.as_mapping()).unwrap();
        assert!(services.contains_key(serde_yaml::Value::String("nats".to_string())));
        assert!(services.contains_key(serde_yaml::Value::String("gateway".to_string())));
        assert!(services.contains_key(serde_yaml::Value::String("aggregate".to_string())));
    }

    // The checked-in placement file must deserialize into the typed config.
    #[test]
    fn tenant_placement_config_loads() {
        let raw = std::fs::read_to_string("../swarm/tenant-placement.yaml").unwrap();
        let cfg: TenantPlacementConfig = serde_yaml::from_str(&raw).unwrap();
        assert_eq!(cfg.virtual_nodes_per_node, 200);
        assert!(cfg.nodes.iter().any(|n| n.node_id == "node-a"));
        assert_eq!(cfg.tenants.get("tenant-a").unwrap(), "node-a");
    }

    // Pin the exact Swarm constraint string format.
    #[test]
    fn placement_constraint_generated_correctly() {
        let node = NodePlacement {
            node_id: "node-a".to_string(),
            tenant_range: "00-3f".to_string(),
        };
        let constraints = placement_constraints_for_node(&node);
        assert_eq!(constraints, vec!["node.labels.tenant_range == 00-3f"]);
    }

    // Drain must precede placement update, which must precede reload.
    #[test]
    fn graceful_tenant_migration_plan_is_ordered() {
        let plan = plan_graceful_tenant_migration("tenant-a", "node-a", "node-b");
        assert_eq!(plan.tenant_id, "tenant-a");
        assert_eq!(
            plan.actions,
            vec![
                MigrationAction::DrainTenant {
                    tenant_id: "tenant-a".to_string(),
                },
                MigrationAction::UpdatePlacement {
                    tenant_id: "tenant-a".to_string(),
                    node_id: "node-b".to_string(),
                },
                MigrationAction::ReloadConfig,
            ]
        );
    }

    // With NATS pointed at an unroutable port, the loader must fall back
    // to the local YAML file and normalize it to JSON.
    #[tokio::test]
    async fn tenant_placement_kv_falls_back_to_local_file() {
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("placement.yaml");
        std::fs::write(
            &path,
            r#"
virtual_nodes_per_node: 100
nodes:
  - node_id: "node-a"
    tenant_range: "00-ff"
tenants:
  tenant-a: "node-a"
"#,
        )
        .unwrap();

        // Port 1 is never listening, so the 300ms KV attempt fails fast.
        let cfg = TenantPlacementKvClient::load_config_with_fallback(
            "nats://127.0.0.1:1",
            "TENANT_PLACEMENT",
            "placement",
            path.to_string_lossy().as_ref(),
        )
        .await
        .unwrap();

        assert_eq!(cfg["virtual_nodes_per_node"], 100);
        assert_eq!(cfg["tenants"]["tenant-a"], "node-a");
    }

    // Connection to a dead endpoint errors; the stream item type still
    // composes with futures combinators.
    #[tokio::test]
    async fn tenant_placement_kv_watch_returns_stream() {
        let result = TenantPlacementKvClient::connect_with_timeout(
            "nats://127.0.0.1:1",
            "TENANT_PLACEMENT",
            std::time::Duration::from_millis(50),
        )
        .await;
        assert!(result.is_err());

        let mut stream =
            futures::stream::empty::<Result<serde_json::Value, TenantPlacementKvError>>();
        assert!(stream.next().await.is_none());
    }
}
|
||||
65
aggregate/src/types/command.rs
Normal file
65
aggregate/src/types/command.rs
Normal file
@@ -0,0 +1,65 @@
|
||||
use crate::types::{AggregateId, AggregateType, TenantId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use std::collections::HashMap;
|
||||
use uuid::Uuid;
|
||||
|
||||
/// A tenant-scoped command targeting one aggregate instance.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Command {
    /// Tenant issuing the command; used for isolation and routing.
    pub tenant_id: TenantId,
    /// Unique id (UUIDv7 when built via `Command::new`).
    pub command_id: Uuid,
    /// Target aggregate instance.
    pub aggregate_id: AggregateId,
    /// Target aggregate kind (e.g. "Account" in this file's tests).
    pub aggregate_type: AggregateType,
    /// Command-specific body; schema is owned by the aggregate.
    pub payload: Value,
    /// Free-form key/value context attached via `with_metadata`.
    pub metadata: HashMap<String, Value>,
}
|
||||
|
||||
impl Command {
|
||||
pub fn new(
|
||||
tenant_id: TenantId,
|
||||
aggregate_id: AggregateId,
|
||||
aggregate_type: AggregateType,
|
||||
payload: Value,
|
||||
) -> Self {
|
||||
Self {
|
||||
tenant_id,
|
||||
command_id: Uuid::now_v7(),
|
||||
aggregate_id,
|
||||
aggregate_type,
|
||||
payload,
|
||||
metadata: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_metadata(mut self, key: impl Into<String>, value: Value) -> Self {
|
||||
self.metadata.insert(key.into(), value);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
|
||||
fn command_serialization() {
|
||||
let cmd = Command::new(
|
||||
TenantId::new("acme-corp"),
|
||||
AggregateId::new_v7(),
|
||||
AggregateType::new("Account"),
|
||||
json!({"type": "deposit", "amount": 100}),
|
||||
);
|
||||
let json = serde_json::to_string(&cmd).unwrap();
|
||||
let decoded: Command = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(cmd.command_id, decoded.command_id);
|
||||
assert_eq!(cmd.aggregate_id, decoded.aggregate_id);
|
||||
assert_eq!(cmd.tenant_id, decoded.tenant_id);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn command_is_send_sync() {
|
||||
fn assert_send_sync<T: Send + Sync>() {}
|
||||
assert_send_sync::<Command>();
|
||||
}
|
||||
}
|
||||
58
aggregate/src/types/error.rs
Normal file
58
aggregate/src/types/error.rs
Normal file
@@ -0,0 +1,58 @@
|
||||
use crate::types::{AggregateId, TenantId, Version};
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Clone, Error)]
|
||||
pub enum AggregateError {
|
||||
#[error("Tenant access denied for tenant: {tenant_id}")]
|
||||
TenantAccessDenied { tenant_id: TenantId },
|
||||
|
||||
#[error("Tenant not hosted on this shard: {tenant_id}")]
|
||||
TenantNotHosted { tenant_id: TenantId },
|
||||
|
||||
#[error("Tenant is draining: {tenant_id}")]
|
||||
TenantDraining { tenant_id: TenantId },
|
||||
|
||||
#[error("Validation error: {0}")]
|
||||
ValidationError(String),
|
||||
|
||||
#[error("Version conflict: expected {expected}, actual {actual}")]
|
||||
VersionConflict { expected: Version, actual: Version },
|
||||
|
||||
#[error("Storage error: {0}")]
|
||||
StorageError(String),
|
||||
|
||||
#[error("Stream error: {0}")]
|
||||
StreamError(String),
|
||||
|
||||
#[error("Rehydration error: {0}")]
|
||||
RehydrationError(String),
|
||||
|
||||
#[error("Decide error: {0}")]
|
||||
DecideError(String),
|
||||
|
||||
#[error("Apply error: {0}")]
|
||||
ApplyError(String),
|
||||
|
||||
#[error("Not found: {0}")]
|
||||
NotFound(AggregateId),
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn error_implements_traits() {
|
||||
let err = AggregateError::TenantAccessDenied {
|
||||
tenant_id: TenantId::new("other"),
|
||||
};
|
||||
let _ = format!("{}", err);
|
||||
let _: &dyn std::error::Error = &err;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn error_is_send_sync() {
|
||||
fn assert_send_sync<T: Send + Sync>() {}
|
||||
assert_send_sync::<AggregateError>();
|
||||
}
|
||||
}
|
||||
78
aggregate/src/types/event.rs
Normal file
78
aggregate/src/types/event.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
use crate::types::{AggregateId, AggregateType, TenantId, Version};
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use uuid::Uuid;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Event {
|
||||
pub tenant_id: TenantId,
|
||||
pub event_id: Uuid,
|
||||
pub aggregate_id: AggregateId,
|
||||
pub aggregate_type: AggregateType,
|
||||
pub version: Version,
|
||||
pub event_type: String,
|
||||
pub payload: Value,
|
||||
pub command_id: Uuid,
|
||||
pub timestamp: DateTime<Utc>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub correlation_id: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub traceparent: Option<String>,
|
||||
}
|
||||
|
||||
impl Event {
|
||||
pub fn new(
|
||||
tenant_id: TenantId,
|
||||
aggregate_id: AggregateId,
|
||||
aggregate_type: AggregateType,
|
||||
version: Version,
|
||||
event_type: impl Into<String>,
|
||||
payload: Value,
|
||||
command_id: Uuid,
|
||||
) -> Self {
|
||||
Self {
|
||||
tenant_id,
|
||||
event_id: Uuid::now_v7(),
|
||||
aggregate_id,
|
||||
aggregate_type,
|
||||
version,
|
||||
event_type: event_type.into(),
|
||||
payload,
|
||||
command_id,
|
||||
timestamp: Utc::now(),
|
||||
correlation_id: None,
|
||||
traceparent: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
|
||||
fn event_serialization() {
|
||||
let event = Event::new(
|
||||
TenantId::new("acme-corp"),
|
||||
AggregateId::new_v7(),
|
||||
AggregateType::new("Account"),
|
||||
Version::from(1),
|
||||
"Deposited",
|
||||
json!({"amount": 100}),
|
||||
Uuid::now_v7(),
|
||||
);
|
||||
let json = serde_json::to_string(&event).unwrap();
|
||||
let decoded: Event = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(event.event_id, decoded.event_id);
|
||||
assert_eq!(event.version, decoded.version);
|
||||
assert_eq!(event.tenant_id, decoded.tenant_id);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn event_is_send_sync() {
|
||||
fn assert_send_sync<T: Send + Sync>() {}
|
||||
assert_send_sync::<Event>();
|
||||
}
|
||||
}
|
||||
157
aggregate/src/types/id.rs
Normal file
157
aggregate/src/types/id.rs
Normal file
@@ -0,0 +1,157 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::str::FromStr;
|
||||
|
||||
pub type TenantId = shared::TenantId;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct AggregateId(uuid::Uuid);
|
||||
|
||||
impl AggregateId {
|
||||
pub fn new_v7() -> Self {
|
||||
Self(uuid::Uuid::now_v7())
|
||||
}
|
||||
|
||||
pub fn from_uuid(uuid: uuid::Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
|
||||
pub fn as_uuid(&self) -> &uuid::Uuid {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for AggregateId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for AggregateId {
|
||||
type Err = uuid::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
Ok(Self(uuid::Uuid::parse_str(s)?))
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for AggregateId {
|
||||
fn default() -> Self {
|
||||
Self::new_v7()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct AggregateType(String);
|
||||
|
||||
impl AggregateType {
|
||||
pub fn new(ty: impl Into<String>) -> Self {
|
||||
Self(ty.into())
|
||||
}
|
||||
|
||||
pub fn as_str(&self) -> &str {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for AggregateType {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&str> for AggregateType {
|
||||
fn from(s: &str) -> Self {
|
||||
Self(s.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<String> for AggregateType {
|
||||
fn from(s: String) -> Self {
|
||||
Self(s)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
pub struct Version(u64);
|
||||
|
||||
impl Version {
|
||||
pub const fn initial() -> Self {
|
||||
Self(0)
|
||||
}
|
||||
|
||||
pub const fn from_u64(v: u64) -> Self {
|
||||
Self(v)
|
||||
}
|
||||
|
||||
pub const fn as_u64(self) -> u64 {
|
||||
self.0
|
||||
}
|
||||
|
||||
pub fn increment(self) -> Self {
|
||||
Self(self.0 + 1)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Version {
|
||||
fn default() -> Self {
|
||||
Self::initial()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u64> for Version {
|
||||
fn from(v: u64) -> Self {
|
||||
Self(v)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Version {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn tenant_id_serialization_roundtrip() {
|
||||
let id = TenantId::new("acme-corp");
|
||||
let json = serde_json::to_string(&id).unwrap();
|
||||
let decoded: TenantId = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(id, decoded);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tenant_id_default() {
|
||||
let id = TenantId::default();
|
||||
assert!(id.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn aggregate_id_serialization_roundtrip() {
|
||||
let id = AggregateId::new_v7();
|
||||
let json = serde_json::to_string(&id).unwrap();
|
||||
let decoded: AggregateId = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(id, decoded);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn version_increment() {
|
||||
let v = Version::initial();
|
||||
assert_eq!(v.as_u64(), 0);
|
||||
let v2 = v.increment();
|
||||
assert_eq!(v2.as_u64(), 1);
|
||||
assert_eq!(v.as_u64(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn types_are_send_sync() {
|
||||
fn assert_send_sync<T: Send + Sync>() {}
|
||||
assert_send_sync::<TenantId>();
|
||||
assert_send_sync::<AggregateId>();
|
||||
assert_send_sync::<AggregateType>();
|
||||
assert_send_sync::<Version>();
|
||||
}
|
||||
}
|
||||
61
aggregate/src/types/manifest.rs
Normal file
61
aggregate/src/types/manifest.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
use crate::types::AggregateType;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ProgramRef {
|
||||
pub decide_program: String,
|
||||
pub apply_program: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct AggregateManifest {
|
||||
pub aggregate_type: AggregateType,
|
||||
pub programs: ProgramRef,
|
||||
pub snapshot_threshold: Option<u64>,
|
||||
}
|
||||
|
||||
impl AggregateManifest {
|
||||
pub fn new(aggregate_type: AggregateType, programs: ProgramRef) -> Self {
|
||||
Self {
|
||||
aggregate_type,
|
||||
programs,
|
||||
snapshot_threshold: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_snapshot_threshold(mut self, threshold: u64) -> Self {
|
||||
self.snapshot_threshold = Some(threshold);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct ManifestRegistry {
|
||||
aggregates: HashMap<String, AggregateManifest>,
|
||||
}
|
||||
|
||||
impl ManifestRegistry {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
aggregates: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn register(&mut self, manifest: AggregateManifest) {
|
||||
self.aggregates
|
||||
.insert(manifest.aggregate_type.as_str().to_string(), manifest);
|
||||
}
|
||||
|
||||
pub fn get(&self, aggregate_type: &AggregateType) -> Option<&AggregateManifest> {
|
||||
self.aggregates.get(aggregate_type.as_str())
|
||||
}
|
||||
|
||||
pub fn load_from_yaml(yaml: &str) -> Result<Self, serde_yaml::Error> {
|
||||
serde_yaml::from_str(yaml)
|
||||
}
|
||||
|
||||
pub fn load_from_json(json: &str) -> Result<Self, serde_json::Error> {
|
||||
serde_json::from_str(json)
|
||||
}
|
||||
}
|
||||
13
aggregate/src/types/mod.rs
Normal file
13
aggregate/src/types/mod.rs
Normal file
@@ -0,0 +1,13 @@
|
||||
mod command;
|
||||
mod error;
|
||||
mod event;
|
||||
mod id;
|
||||
mod manifest;
|
||||
mod snapshot;
|
||||
|
||||
pub use command::*;
|
||||
pub use error::*;
|
||||
pub use event::*;
|
||||
pub use id::*;
|
||||
pub use manifest::*;
|
||||
pub use snapshot::*;
|
||||
61
aggregate/src/types/snapshot.rs
Normal file
61
aggregate/src/types/snapshot.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
use crate::types::{AggregateId, AggregateType, TenantId, Version};
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Snapshot {
|
||||
pub tenant_id: TenantId,
|
||||
pub aggregate_id: AggregateId,
|
||||
pub aggregate_type: AggregateType,
|
||||
pub version: Version,
|
||||
pub state: Value,
|
||||
pub created_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl Snapshot {
|
||||
pub fn new(
|
||||
tenant_id: TenantId,
|
||||
aggregate_id: AggregateId,
|
||||
aggregate_type: AggregateType,
|
||||
version: Version,
|
||||
state: Value,
|
||||
) -> Self {
|
||||
Self {
|
||||
tenant_id,
|
||||
aggregate_id,
|
||||
aggregate_type,
|
||||
version,
|
||||
state,
|
||||
created_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
|
||||
fn snapshot_serialization() {
|
||||
let snap = Snapshot::new(
|
||||
TenantId::new("acme-corp"),
|
||||
AggregateId::new_v7(),
|
||||
AggregateType::new("Account"),
|
||||
Version::from(5),
|
||||
json!({"balance": 100}),
|
||||
);
|
||||
let json = serde_json::to_string(&snap).unwrap();
|
||||
let decoded: Snapshot = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(snap.aggregate_id, decoded.aggregate_id);
|
||||
assert_eq!(snap.version, decoded.version);
|
||||
assert_eq!(snap.tenant_id, decoded.tenant_id);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn snapshot_is_send_sync() {
|
||||
fn assert_send_sync<T: Send + Sync>() {}
|
||||
assert_send_sync::<Snapshot>();
|
||||
}
|
||||
}
|
||||
682
aggregate/tests/integration.rs
Normal file
682
aggregate/tests/integration.rs
Normal file
@@ -0,0 +1,682 @@
|
||||
use aggregate::observability::Observability;
|
||||
use aggregate::runtime::RuntimeExecutor;
|
||||
#[cfg(feature = "runtime-v8")]
|
||||
use aggregate::runtime::{execute_apply_program, execute_decide_program};
|
||||
use aggregate::server::{CommandRequest, HealthChecker};
|
||||
use aggregate::storage::StorageClient;
|
||||
use aggregate::types::{
|
||||
AggregateError, AggregateId, AggregateType, Command, Event, TenantId, Version,
|
||||
};
|
||||
use serde_json::json;
|
||||
use std::time::Duration;
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn create_test_storage() -> (TempDir, StorageClient) {
|
||||
let dir = TempDir::new().expect("failed to create temp dir");
|
||||
let path = dir.path().join("test.mdbx");
|
||||
let storage =
|
||||
StorageClient::open(path.to_string_lossy().to_string()).expect("failed to open storage");
|
||||
(dir, storage)
|
||||
}
|
||||
|
||||
#[cfg(feature = "runtime-v8")]
|
||||
fn create_test_decide_program() -> &'static str {
|
||||
r#"
|
||||
function decide(state, command) {
|
||||
if (command.type === "deposit") {
|
||||
return [{ type: "deposited", amount: command.amount }];
|
||||
}
|
||||
if (command.type === "withdraw") {
|
||||
if (state.balance < command.amount) {
|
||||
return [{ type: "error", message: "insufficient funds" }];
|
||||
}
|
||||
return [{ type: "withdrawn", amount: command.amount }];
|
||||
}
|
||||
if (command.type === "open_account") {
|
||||
return [{ type: "account_opened", initial_balance: command.initial_balance || 0 }];
|
||||
}
|
||||
return [];
|
||||
}
|
||||
"#
|
||||
}
|
||||
|
||||
#[cfg(feature = "runtime-v8")]
|
||||
fn create_test_apply_program() -> &'static str {
|
||||
r#"
|
||||
function apply(state, event) {
|
||||
if (event.type === "account_opened") {
|
||||
return { balance: event.initial_balance };
|
||||
}
|
||||
if (event.type === "deposited") {
|
||||
return { balance: (state.balance || 0) + event.amount };
|
||||
}
|
||||
if (event.type === "withdrawn") {
|
||||
return { balance: state.balance - event.amount };
|
||||
}
|
||||
return state;
|
||||
}
|
||||
"#
|
||||
}
|
||||
|
||||
#[test]
fn storage_tenant_isolation() {
    let (_dir, storage) = create_test_storage();

    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async {
        use aggregate::types::Snapshot;

        let tenant_a = TenantId::new("tenant-a");
        let tenant_b = TenantId::new("tenant-b");
        let aggregate_id = AggregateId::new_v7();

        // Write a snapshot under tenant A only.
        let snapshot_a = Snapshot::new(
            tenant_a.clone(),
            aggregate_id.clone(),
            AggregateType::from("Account"),
            Version::from(1),
            json!({"balance": 100}),
        );
        storage.put_snapshot(&snapshot_a).await.unwrap();

        // Tenant A sees it; tenant B must not, even with the same aggregate id.
        let seen_by_a = storage
            .get_snapshot(&tenant_a, &aggregate_id)
            .await
            .unwrap();
        let seen_by_b = storage
            .get_snapshot(&tenant_b, &aggregate_id)
            .await
            .unwrap();
        assert!(seen_by_a.is_some());
        assert!(seen_by_b.is_none());
    });
}

#[test]
fn storage_version_conflict() {
    let (_dir, storage) = create_test_storage();

    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async {
        use aggregate::types::Snapshot;

        let tenant = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();

        let snapshot = Snapshot::new(
            tenant.clone(),
            aggregate_id.clone(),
            AggregateType::from("Account"),
            Version::from(1),
            json!({"balance": 100}),
        );
        storage.put_snapshot(&snapshot).await.unwrap();

        // Writing the same version twice must be rejected as a conflict.
        let second_write = storage.put_snapshot(&snapshot).await;
        assert!(matches!(
            second_write,
            Err(AggregateError::VersionConflict { .. })
        ));
    });
}

#[test]
fn storage_latest_version() {
    let (_dir, storage) = create_test_storage();

    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async {
        use aggregate::types::Snapshot;

        let tenant = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();

        // No snapshots yet -> no latest version.
        let latest = storage
            .get_latest_version(&tenant, &aggregate_id)
            .await
            .unwrap();
        assert!(latest.is_none());

        let snapshot_v1 = Snapshot::new(
            tenant.clone(),
            aggregate_id.clone(),
            AggregateType::from("Account"),
            Version::from(1),
            json!({"balance": 100}),
        );
        storage.put_snapshot(&snapshot_v1).await.unwrap();

        let latest = storage
            .get_latest_version(&tenant, &aggregate_id)
            .await
            .unwrap();
        assert_eq!(latest, Some(Version::from(1)));

        // Versions need not be contiguous; the latest simply advances.
        let snapshot_v3 = Snapshot::new(
            tenant.clone(),
            aggregate_id.clone(),
            AggregateType::from("Account"),
            Version::from(3),
            json!({"balance": 300}),
        );
        storage.put_snapshot(&snapshot_v3).await.unwrap();

        let latest = storage
            .get_latest_version(&tenant, &aggregate_id)
            .await
            .unwrap();
        assert_eq!(latest, Some(Version::from(3)));
    });
}

#[test]
fn storage_none_for_nonexistent_aggregate() {
    let (_dir, storage) = create_test_storage();

    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async {
        let tenant = TenantId::new("tenant-a");
        let aggregate_id = AggregateId::new_v7();

        // An aggregate that was never written yields None, not an error.
        let missing = storage
            .get_snapshot(&tenant, &aggregate_id)
            .await
            .unwrap();
        assert!(missing.is_none());
    });
}
|
||||
|
||||
#[cfg(feature = "runtime-v8")]
#[test]
fn runtime_decide_deposit() {
    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async {
        let account_state = json!({"balance": 100});
        let deposit_cmd = json!({"type": "deposit", "amount": 50});

        // A deposit always succeeds and yields exactly one event.
        let events = execute_decide_program(
            &account_state,
            &deposit_cmd,
            create_test_decide_program(),
            1_000_000,
            Duration::from_secs(5),
        )
        .await
        .unwrap();
        assert_eq!(events.len(), 1);
        assert_eq!(events[0]["type"], "deposited");
        assert_eq!(events[0]["amount"], 50);
    });
}

#[cfg(feature = "runtime-v8")]
#[test]
fn runtime_decide_withdraw_insufficient() {
    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async {
        let account_state = json!({"balance": 10});
        let withdraw_cmd = json!({"type": "withdraw", "amount": 100});

        // Overdrawing produces an error event rather than a withdrawal.
        let events = execute_decide_program(
            &account_state,
            &withdraw_cmd,
            create_test_decide_program(),
            1_000_000,
            Duration::from_secs(5),
        )
        .await
        .unwrap();
        assert_eq!(events.len(), 1);
        assert_eq!(events[0]["type"], "error");
    });
}

#[cfg(feature = "runtime-v8")]
#[test]
fn runtime_apply_transitions_state() {
    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async {
        let account_state = json!({"balance": 100});
        let deposited = json!({"type": "deposited", "amount": 50});

        // Folding a deposited event adds the amount to the balance.
        let next_state = execute_apply_program(
            &account_state,
            &deposited,
            create_test_apply_program(),
            1_000_000,
            Duration::from_secs(5),
        )
        .await
        .unwrap();
        assert_eq!(next_state["balance"], 150);
    });
}

#[cfg(feature = "runtime-v8")]
#[test]
fn runtime_determinism() {
    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async {
        let account_state = json!({"balance": 100});
        let deposit_cmd = json!({"type": "deposit", "amount": 50});

        // Running the same program on the same inputs twice must produce
        // identical output — decide() is required to be deterministic.
        let first = execute_decide_program(
            &account_state,
            &deposit_cmd,
            create_test_decide_program(),
            1_000_000,
            Duration::from_secs(5),
        )
        .await
        .unwrap();
        let second = execute_decide_program(
            &account_state,
            &deposit_cmd,
            create_test_decide_program(),
            1_000_000,
            Duration::from_secs(5),
        )
        .await
        .unwrap();
        assert_eq!(first, second);
    });
}
|
||||
|
||||
#[test]
fn command_request_tenant_extraction() {
    let tenant = TenantId::new("acme-corp");
    let aggregate_id = AggregateId::new_v7();

    // Even if a caller smuggles an "x-tenant-id" header, the request's
    // tenant is whatever was set at construction — headers never override it.
    let request = CommandRequest::new(
        tenant.clone(),
        aggregate_id.clone(),
        AggregateType::from("Account"),
        json!({"type": "deposit", "amount": 100}),
    )
    .with_header("x-request-id", "req-123")
    .with_header("x-tenant-id", "override-tenant");

    assert_eq!(request.tenant_id, tenant);
    assert_eq!(
        request.headers.get("x-request-id"),
        Some(&"req-123".to_string())
    );
}

#[test]
fn health_checker_tracks_state() {
    let checker = HealthChecker::new();

    // A fresh checker reports fully healthy.
    assert!(checker.check().is_healthy());
    assert!(checker.is_ready());
    assert!(checker.is_live());

    // Marking dependencies unhealthy flips readiness off...
    checker.set_storage_healthy(false);
    checker.set_stream_healthy(false);
    assert!(!checker.is_ready());

    // ...and restoring them flips it back on.
    checker.set_storage_healthy(true);
    checker.set_stream_healthy(true);
    assert!(checker.is_ready());
}

#[test]
fn observability_metrics_export() {
    let obs = Observability::default();

    // Record one successful command so the counters have data.
    let span = obs.start_command_span("agg-123", "Account", "tenant-a", "deposit", None, None);
    obs.record_command_success(&span, 2);

    // The exported text must include the command metrics.
    let metrics = obs.export_metrics();
    assert!(metrics.contains("commands_total"));
    assert!(metrics.contains("command_duration"));
}
|
||||
|
||||
#[test]
fn version_increment_and_ordering() {
    let v0 = Version::initial();
    assert_eq!(v0.as_u64(), 0);

    // increment() is by-value: the original version is never mutated.
    let v1 = v0.increment();
    assert_eq!(v1.as_u64(), 1);
    assert_eq!(v0.as_u64(), 0);

    let v2 = v1.increment();
    assert_eq!(v2.as_u64(), 2);

    // Versions order naturally.
    assert!(v0 < v1);
    assert!(v1 < v2);
}

#[test]
fn tenant_id_validation() {
    // Character-level check: tenant ids may only use [A-Za-z0-9_-].
    let is_clean = |tenant: &TenantId| {
        tenant
            .as_str()
            .chars()
            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
    };

    for id in ["acme-corp", "tenant_123", "my-tenant", "Tenant1"] {
        let tenant = TenantId::new(id);
        assert!(is_clean(&tenant), "Expected {} to be valid", id);
    }

    for id in ["tenant@corp", "tenant name", "tenant/id"] {
        let tenant = TenantId::new(id);
        assert!(!is_clean(&tenant), "Expected {} to be invalid", id);
    }
}

#[test]
fn aggregate_id_generation() {
    // Two generated ids never collide.
    let first = AggregateId::new_v7();
    let second = AggregateId::new_v7();
    assert_ne!(first, second);

    // Display renders a non-empty textual form.
    assert!(!format!("{}", first).is_empty());
}

#[test]
fn event_creation() {
    let tenant = TenantId::new("tenant-a");
    let aggregate_id = AggregateId::new_v7();
    let command_id = uuid::Uuid::now_v7();

    let event = Event::new(
        tenant.clone(),
        aggregate_id.clone(),
        AggregateType::from("Account"),
        Version::from(1),
        "deposited".to_string(),
        json!({"amount": 100}),
        command_id,
    );

    // The constructor copies every identifying field through unchanged.
    assert_eq!(event.tenant_id, tenant);
    assert_eq!(event.aggregate_id, aggregate_id);
    assert_eq!(event.version, Version::from(1));
    assert_eq!(event.event_type, "deposited");
}

#[test]
fn command_creation() {
    let tenant = TenantId::new("tenant-a");
    let aggregate_id = AggregateId::new_v7();

    let command = Command::new(
        tenant.clone(),
        aggregate_id.clone(),
        AggregateType::from("Account"),
        json!({"type": "deposit", "amount": 100}),
    );

    assert_eq!(command.tenant_id, tenant);
    assert_eq!(command.aggregate_id, aggregate_id);
    assert_eq!(command.payload["type"], "deposit");
}

#[test]
fn snapshot_creation() {
    let tenant = TenantId::new("tenant-a");
    let aggregate_id = AggregateId::new_v7();

    let snapshot = aggregate::types::Snapshot::new(
        tenant.clone(),
        aggregate_id.clone(),
        AggregateType::from("Account"),
        Version::from(5),
        json!({"balance": 500}),
    );

    assert_eq!(snapshot.tenant_id, tenant);
    assert_eq!(snapshot.aggregate_id, aggregate_id);
    assert_eq!(snapshot.version, Version::from(5));
    assert_eq!(snapshot.state["balance"], 500);
}
|
||||
|
||||
#[test]
fn circuit_breaker_pattern() {
    use aggregate::storage::CircuitBreaker;

    let mut breaker = CircuitBreaker::new()
        .with_failure_threshold(3)
        .with_reset_timeout(Duration::from_millis(50));

    // Fresh breaker starts closed.
    assert!(breaker.is_closed());

    // Hitting the failure threshold trips it open.
    for _ in 0..3 {
        breaker.record_failure();
    }
    assert!(breaker.is_open());

    // After the reset timeout it is neither closed nor open
    // (i.e. half-open, probing).
    std::thread::sleep(Duration::from_millis(60));
    assert!(!breaker.is_closed());
    assert!(!breaker.is_open());
}

#[test]
fn error_types_are_send_sync() {
    fn assert_send_sync<T: Send + Sync>() {}

    assert_send_sync::<AggregateError>();
    assert_send_sync::<aggregate::server::ServerError>();
}

#[test]
fn all_types_are_send_sync() {
    fn assert_send_sync<T: Send + Sync>() {}

    assert_send_sync::<TenantId>();
    assert_send_sync::<AggregateId>();
    assert_send_sync::<AggregateType>();
    assert_send_sync::<Version>();
    assert_send_sync::<Command>();
    assert_send_sync::<Event>();
    assert_send_sync::<StorageClient>();
    assert_send_sync::<RuntimeExecutor>();
    assert_send_sync::<Observability>();
    assert_send_sync::<HealthChecker>();
}

#[test]
fn concurrent_storage_operations() {
    let (_dir, storage) = create_test_storage();

    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async {
        use aggregate::types::Snapshot;
        use std::sync::Arc;
        use tokio::task::JoinSet;

        let storage = Arc::new(storage);
        let mut writers = JoinSet::new();

        // Ten concurrent writers spread over three tenants; each writes one
        // snapshot and reads it back.
        for i in 0..10 {
            let storage = storage.clone();
            writers.spawn(async move {
                let tenant = TenantId::new(format!("tenant-{}", i % 3));
                let aggregate_id = AggregateId::new_v7();

                let snapshot = Snapshot::new(
                    tenant.clone(),
                    aggregate_id.clone(),
                    AggregateType::from("Account"),
                    Version::from(1),
                    json!({"balance": i * 100}),
                );
                storage.put_snapshot(&snapshot).await.unwrap();

                let loaded = storage
                    .get_snapshot(&tenant, &aggregate_id)
                    .await
                    .unwrap();
                assert!(loaded.is_some());
                loaded.unwrap()
            });
        }

        // Every writer must complete successfully.
        let mut completed = Vec::new();
        while let Some(outcome) = writers.join_next().await {
            completed.push(outcome.unwrap());
        }
        assert_eq!(completed.len(), 10);
    });
}
|
||||
|
||||
#[test]
fn tenant_isolation_e2e() {
    let (_dir, storage) = create_test_storage();

    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async {
        use aggregate::types::Snapshot;

        let tenant_a = TenantId::new("tenant-a");
        let tenant_b = TenantId::new("tenant-b");
        // Deliberately reuse one aggregate id across both tenants.
        let aggregate_id = AggregateId::new_v7();

        let alice = Snapshot::new(
            tenant_a.clone(),
            aggregate_id.clone(),
            AggregateType::from("Account"),
            Version::from(1),
            json!({"balance": 1000, "owner": "Alice"}),
        );
        let bob = Snapshot::new(
            tenant_b.clone(),
            aggregate_id.clone(),
            AggregateType::from("Account"),
            Version::from(1),
            json!({"balance": 500, "owner": "Bob"}),
        );
        storage.put_snapshot(&alice).await.unwrap();
        storage.put_snapshot(&bob).await.unwrap();

        // Each tenant reads back exactly its own state — no cross-talk.
        let loaded_a = storage
            .get_snapshot(&tenant_a, &aggregate_id)
            .await
            .unwrap()
            .unwrap();
        let loaded_b = storage
            .get_snapshot(&tenant_b, &aggregate_id)
            .await
            .unwrap()
            .unwrap();

        assert_eq!(loaded_a.state["owner"], "Alice");
        assert_eq!(loaded_a.state["balance"], 1000);
        assert_eq!(loaded_b.state["owner"], "Bob");
        assert_eq!(loaded_b.state["balance"], 500);
    });
}

#[test]
fn bank_account_full_scenario() {
    let (_dir, storage) = create_test_storage();

    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async {
        use aggregate::types::Snapshot;

        let tenant = TenantId::new("bank-test");
        let aggregate_id = AggregateId::new_v7();

        // open -> deposit 100 -> withdraw 50, one snapshot per step.
        for (version, balance) in [(1u64, 0), (2, 100), (3, 50)] {
            let snapshot = Snapshot::new(
                tenant.clone(),
                aggregate_id.clone(),
                AggregateType::from("BankAccount"),
                Version::from(version),
                json!({"balance": balance}),
            );
            storage.put_snapshot(&snapshot).await.unwrap();
        }

        // Reading back returns the newest snapshot.
        let loaded = storage
            .get_snapshot(&tenant, &aggregate_id)
            .await
            .unwrap()
            .unwrap();
        assert_eq!(loaded.version, Version::from(3));
        assert_eq!(loaded.state["balance"], 50);

        let latest = storage
            .get_latest_version(&tenant, &aggregate_id)
            .await
            .unwrap();
        assert_eq!(latest, Some(Version::from(3)));
    });
}

#[test]
fn version_sequence_integrity() {
    let (_dir, storage) = create_test_storage();

    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async {
        use aggregate::types::Snapshot;

        let tenant = TenantId::new("version-test");
        let aggregate_id = AggregateId::new_v7();

        // Write versions 1..=5 in order.
        for v in 1..=5 {
            let snapshot = Snapshot::new(
                tenant.clone(),
                aggregate_id.clone(),
                AggregateType::from("Counter"),
                Version::from(v),
                json!({"count": v}),
            );
            storage.put_snapshot(&snapshot).await.unwrap();
        }

        // The latest snapshot wins.
        let loaded = storage
            .get_snapshot(&tenant, &aggregate_id)
            .await
            .unwrap()
            .unwrap();
        assert_eq!(loaded.version, Version::from(5));
        assert_eq!(loaded.state["count"], 5);

        // Re-writing an already-used version must be rejected.
        let duplicate = Snapshot::new(
            tenant.clone(),
            aggregate_id.clone(),
            AggregateType::from("Counter"),
            Version::from(5),
            json!({"count": 999}),
        );
        let rewrite = storage.put_snapshot(&duplicate).await;
        assert!(matches!(
            rewrite,
            Err(AggregateError::VersionConflict { .. })
        ));
    });
}
|
||||
43
control/.gitignore
vendored
Normal file
43
control/.gitignore
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
/target/
|
||||
/target-*/
|
||||
**/target/
|
||||
*.rs.bk
|
||||
*.pdb
|
||||
*.dSYM/
|
||||
*.orig
|
||||
*.rej
|
||||
*.log
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
.DS_Store
|
||||
.idea/
|
||||
.vscode/
|
||||
|
||||
.env
|
||||
.env.*
|
||||
.envrc
|
||||
.direnv/
|
||||
|
||||
docker-compose.override.yml
|
||||
|
||||
*.mdbx
|
||||
*.mdbx-*
|
||||
*.mdbx-lock
|
||||
*.mdbx.dat
|
||||
*.mdbx.lck
|
||||
*.mdb
|
||||
*.db
|
||||
/data/
|
||||
/tmp/
|
||||
|
||||
/ui/node_modules/
|
||||
/ui/dist/
|
||||
/ui/dist-ssr/
|
||||
/ui/.eslintcache
|
||||
/ui/.vite/
|
||||
|
||||
/coverage/
|
||||
lcov.info
|
||||
*.profraw
|
||||
*.profdata
|
||||
341
control/DEVELOPMENT_PLAN.md
Normal file
341
control/DEVELOPMENT_PLAN.md
Normal file
@@ -0,0 +1,341 @@
|
||||
# Development Plan: Control Plane (Admin UI + Observability + Production Ops)
|
||||
|
||||
## Overview
|
||||
|
||||
This plan breaks down the Control Plane implementation into milestones ordered by dependency. Each milestone includes:
|
||||
- **Tasks** with clear deliverables
|
||||
- **Test Requirements** (unit tests + tautological tests + integration tests where applicable)
|
||||
- **Dependencies** on previous milestones
|
||||
|
||||
**Development Approach:**
|
||||
1. Complete one milestone at a time
|
||||
2. Write tests before implementation (TDD where applicable)
|
||||
3. All tests must pass before moving to the next milestone
|
||||
4. Mark tasks complete with `[x]` as you progress
|
||||
|
||||
This plan is intentionally aligned with the style and gating discipline used in sibling repos (see: [gateway/DEVELOPMENT_PLAN.md](file:///Users/vlad/Developer/cloudlysis/gateway/DEVELOPMENT_PLAN.md), [runner/DEVELOPMENT_PLAN.md](file:///Users/vlad/Developer/cloudlysis/runner/DEVELOPMENT_PLAN.md)).
|
||||
|
||||
---
|
||||
|
||||
## Milestone 0: Repo Bootstrap (Dev Ergonomics + Guardrails)
|
||||
|
||||
**Goal:** Establish canonical commands, CI entrypoints, and integration-test gating so later milestones can be executed and verified consistently.
|
||||
|
||||
### Tasks
|
||||
- [x] **0.1** Define canonical local commands for the repo
|
||||
- UI:
|
||||
- `npm run lint`
|
||||
- `npm run typecheck`
|
||||
- `npm run test`
|
||||
- `npm run build`
|
||||
- Control Plane API:
|
||||
- `cargo test`
|
||||
- `cargo fmt --check`
|
||||
- `cargo clippy -- -D warnings`
|
||||
- `cargo run -- --help`
|
||||
- Docker/Swarm:
|
||||
- `docker compose config` validation for local stacks (if used)
|
||||
- `docker stack deploy ...` smoke validation for Swarm (gated, see Tests)
|
||||
- [x] **0.2** Add a minimal CI workflow that runs the same commands as **0.1**
|
||||
- [x] **0.3** Define integration-test gating conventions
|
||||
- Docker/Swarm integration tests:
|
||||
- Mark as ignored by default and run only when `CONTROL_TEST_DOCKER=1` is set
|
||||
- Example: `CONTROL_TEST_DOCKER=1 cargo test -- --ignored`
|
||||
- NATS-dependent integration tests:
|
||||
- Mark as ignored by default and run only when `CONTROL_TEST_NATS_URL` is set
|
||||
- Example: `CONTROL_TEST_NATS_URL=nats://127.0.0.1:4222 cargo test -- --ignored`
|
||||
- [x] **0.4** Define baseline operational invariants (checklist for later milestones)
|
||||
- No privileged action without RBAC + audit event
|
||||
- No multi-step operation without idempotency key + job record
|
||||
- Always propagate `tenant_id` (when applicable) end-to-end
|
||||
- Always propagate request/flow identifiers end-to-end (logs + downstream calls):
|
||||
- `x-request-id` (per HTTP request)
|
||||
- `x-correlation-id` (per user-visible flow/job; generated by the Gateway when missing)
|
||||
- `traceparent` (W3C trace context; started by the Gateway when missing)
|
||||
- Secrets never appear in logs (Authorization headers, tokens, credentials, Grafana admin creds)
|
||||
- No tenant-level metrics without bounded cardinality rules
|
||||
|
||||
### Tests
|
||||
- [x] **T0.1** Tautological test: test harness runs for both subprojects (UI + API)
|
||||
- [x] **T0.2** Lint + typecheck + unit tests pass
|
||||
- [x] **T0.3** Docker config validation passes (compose/stack linting tests)
|
||||
|
||||
---
|
||||
|
||||
## Milestone 1: Admin UI Foundation (UltraBase UX Reuse)
|
||||
|
||||
**Goal:** Bring up the Admin UI with the UltraBase component system and navigation skeleton, adapted to Cloudlysis page structure.
|
||||
|
||||
### Dependencies
|
||||
- Milestone 0 (repo bootstrap)
|
||||
|
||||
### Exit Criteria
|
||||
- Admin UI builds successfully and passes unit/type checks
|
||||
- UI navigation skeleton matches the PRD information architecture
|
||||
|
||||
### Tasks
|
||||
- [x] **1.1** Initialize Admin UI project (Vite + React + TypeScript)
|
||||
- Choose and wire lint/typecheck/test/build tooling to match the canonical commands in **0.1**
|
||||
- Adopt the baseline dependencies used by UltraBase control-plane admin UI where available
|
||||
- Establish UI module layout for: components, pages, routes, API client, auth/session utilities
|
||||
- [x] **1.2** Reuse UltraBase UI primitives and styling tokens (adapted, not forked blindly)
|
||||
- Buttons, inputs, tables, dropdowns, modal, toast, breadcrumbs
|
||||
- [x] **1.3** Implement navigation skeleton and empty pages (route wiring only)
|
||||
- Overview
|
||||
- Tenants
|
||||
- Users
|
||||
- Sessions
|
||||
- Roles & Permissions
|
||||
- Config
|
||||
- Definitions
|
||||
- Scale & Placement
|
||||
- Deployments
|
||||
- Observability
|
||||
- Audit Log
|
||||
- Settings
|
||||
- [x] **1.3a** Add correlation-first investigation affordances in the UI skeleton
|
||||
- Global search box that accepts `x-request-id`, `x-correlation-id`, or `trace_id`
|
||||
- “Investigate” links that open Grafana Explore prefilled for:
|
||||
- Loki query scoped to `x-correlation-id` (and `x-request-id` when available)
|
||||
- Tempo trace view when a `trace_id` is present
|
||||
- Ensure jobs and audit log rows display and copy the relevant ids
|
||||
- [x] **1.4** Implement API client stub with consistent error handling and request-id propagation
|
||||
- Send `x-request-id` on every request (generate one when missing)
|
||||
- Send `x-correlation-id` when continuing an existing UI flow; otherwise omit and use the Gateway-generated value returned in responses
|
||||
- Send `traceparent` when continuing an existing trace; otherwise omit and use the Gateway-started trace
|
||||
- Echo `x-request-id` and `x-correlation-id` on responses and surface them in error UX
|
||||
- Persist the most recent ids in the UI so operators can copy/paste them into support tickets
|
||||
|
||||
### Tests
|
||||
- [x] **T1.1** UI typecheck passes
|
||||
- [x] **T1.2** UI build passes
|
||||
- [x] **T1.3** Routing smoke test: each route renders without runtime errors (headless DOM test)
|
||||
|
||||
---
|
||||
|
||||
## Milestone 2: Control Plane API Foundation (BFF / Admin API)
|
||||
|
||||
**Goal:** Provide the minimal API surface required for the Admin UI to authenticate, read core state, and display health/metrics.
|
||||
|
||||
### Dependencies
|
||||
- Milestone 0 (repo bootstrap)
|
||||
|
||||
### Exit Criteria
|
||||
- Control plane API runs as a container and exposes `/health`, `/ready`, `/metrics`
|
||||
- Auth integration contract is defined (Gateway as source of truth) and enforced on admin endpoints
|
||||
|
||||
### Tasks
|
||||
- [x] **2.1** Initialize Control Plane API service
|
||||
- Rust (Axum + Tokio + tracing) to align with node ecosystem
|
||||
- Baseline endpoints: `GET /health`, `GET /ready`, `GET /metrics`
|
||||
- [x] **2.2** Add request logging and correlation identifiers
|
||||
- `x-request-id` propagation and structured logs (match Gateway conventions)
|
||||
- Propagate `x-correlation-id` and `traceparent` on outbound calls
|
||||
- Log fields: `request_id`, `correlation_id`, `trace_id`, `principal_id`, `tenant_id` (when applicable)
|
||||
- Never log Authorization headers or tokens
|
||||
- [x] **2.3** Implement authentication and authorization boundary
|
||||
- Validate Gateway-issued access tokens (same signing config as Gateway; Control does not mint tokens)
|
||||
- Extract principal identity from token claims (at minimum: `sub`, `session_id`)
|
||||
- Enforce permissions at the API boundary (deny-by-default, rights strings stored in Gateway IAM state)
|
||||
- Align `x-tenant-id` semantics with Gateway:
|
||||
- Tenant-scoped endpoints require `x-tenant-id` and must reject missing/invalid values with 400
|
||||
- Platform-scoped endpoints must not depend on `x-tenant-id`
|
||||
- Prefer proxying to Gateway for IAM CRUD instead of duplicating identity/RBAC state:
|
||||
- Control API may expose a thin BFF surface, but must preserve Gateway status codes and error text for pass-through routes
|
||||
- [x] **2.4** Define “job” model for multi-step operations (API contract)
|
||||
- `POST /admin/v1/jobs/*` returns `job_id`
|
||||
- `GET /admin/v1/jobs/{job_id}` returns status + structured steps + errors
|
||||
- Require an idempotency key for job creation (`Idempotency-Key` header), and make repeated creates safe
|
||||
|
||||
### Tests
|
||||
- [x] **T2.1** `GET /health` and `GET /ready` return 200
|
||||
- [x] **T2.2** Unauthorized admin calls return 401/403 consistently
|
||||
- [x] **T2.3** `x-tenant-id` behavior matches Gateway rules (400 on missing/invalid for tenant-scoped routes)
|
||||
- [x] **T2.4** Tautological tests: core state types are Send + Sync
|
||||
|
||||
---
|
||||
|
||||
## Milestone 3: Observability Stack Baseline (VM + Loki + Grafana)
|
||||
|
||||
**Goal:** Include a production-grade observability stack with version-controlled provisioning and Cloudlysis dashboard placeholders wired to existing service metrics.
|
||||
|
||||
### Dependencies
|
||||
- Milestone 0 (repo bootstrap)
|
||||
|
||||
### Exit Criteria
|
||||
- Grafana starts with provisioned datasources and dashboards
|
||||
- vmagent scrapes platform services and VictoriaMetrics can query ingested series
|
||||
- Loki is available for log queries (when logs are enabled)
|
||||
|
||||
### Tasks
|
||||
- [x] **3.1** Add observability deployment assets modeled after UltraBase
|
||||
- Grafana provisioning for datasources and dashboards
|
||||
- vmagent scrape configs for Cloudlysis services + node/Swarm exporters (where applicable)
|
||||
- Loki configuration (and optional promtail)
|
||||
- [x] **3.1a** Add distributed tracing backend and wiring
|
||||
- Tempo (or compatible tracing backend) as a Grafana datasource
|
||||
- OTLP receiver path (collector/agent) so platform services can emit traces
|
||||
- Grafana Explore is provisioned so operators can jump from logs to traces
|
||||
- Require the Gateway to accept and propagate `x-correlation-id` and `traceparent` to upstreams, and to include `correlation_id` and `trace_id` in request spans/log fields
|
||||
- [x] **3.2** Implement the base dashboard set from the PRD
|
||||
- Operations overview
|
||||
- HTTP detail (Gateway route-level)
|
||||
- Logs (Loki)
|
||||
- Traces (Tempo)
|
||||
- Event bus / JetStream
|
||||
- Workers (Runner)
|
||||
- Storage (libmdbx + node disk)
|
||||
- Cluster / Orchestrator
|
||||
- [x] **3.3** Add the chosen production-operability dashboards and document required instrumentation
|
||||
- Noisy Neighbor & Tenant Health
|
||||
- API Regression & Deployment
|
||||
- Storage & Event Bus Bottlenecks
|
||||
- Infrastructure Exhaustion
|
||||
- Standardize build/version labeling across services for correlation (`*_build_info{service,version,git_sha}=1`)
|
||||
|
||||
### Tests
|
||||
- [x] **T3.1** Grafana provisioning files are syntactically valid
|
||||
- [x] **T3.2** vmagent config parses and includes all required scrape jobs
|
||||
- [x] **T3.3** Tempo (or chosen tracing backend) reaches healthy state in the stack smoke test (gated)
|
||||
- [x] **T3.4** Container startup smoke test (compose or Swarm, gated): Grafana + VictoriaMetrics + Loki reach healthy state
|
||||
|
||||
---
|
||||
|
||||
## Milestone 4: Tenant + Placement Visibility (Read-Only Ops First)
|
||||
|
||||
**Goal:** Provide safe, read-only visibility into tenant placement and runtime health across Aggregate/Projection/Runner/Gateway, matching existing placement semantics.
|
||||
|
||||
### Dependencies
|
||||
- Milestone 1 (Admin UI foundation)
|
||||
- Milestone 2 (Control Plane API foundation)
|
||||
|
||||
### Exit Criteria
|
||||
- Admin UI can list tenants and show current placement per service kind
|
||||
- Placement is sourced from the production control-plane substrate (NATS KV) with a development fallback
|
||||
|
||||
### Tasks
|
||||
- [x] **4.1** Implement placement read APIs
|
||||
- Read effective placement from NATS KV (and fallback file for development)
|
||||
- Match the Gateway routing config model (placement maps + shard directories + revision semantics)
|
||||
- Support per-service-kind placement maps (Aggregate, Projection, Runner) using the same naming conventions used elsewhere (`aggregate_placement`, `projection_placement`, `runner_placement`)
|
||||
- [x] **4.2** Implement fleet “health snapshot” APIs
|
||||
- Query `/health`, `/ready`, `/metrics` from each service endpoint
|
||||
- Normalize into a stable UI response shape
|
||||
- [x] **4.3** Implement Admin UI pages:
|
||||
- Scale & Placement (read-only)
|
||||
- Tenants (read-only with placement summary)
|
||||
- Fleet/Topology views (read-only)
|
||||
|
||||
### Tests
|
||||
- [x] **T4.1** Placement config parsing and snapshot endpoints work
|
||||
- [x] **T4.2** KV watcher hot-reload swaps placement atomically
|
||||
- [x] **T4.3** UI pages render with mocked API responses (component-level tests)
|
||||
|
||||
---
|
||||
|
||||
## Milestone 5: Safe Mutations (Drain, Migrate, Reload) via Idempotent Jobs
|
||||
|
||||
**Goal:** Implement the first high-impact operational workflows with strict guardrails: tenant drain, placement update, and reload.
|
||||
|
||||
### Dependencies
|
||||
- Milestone 4 (read-only ops)
|
||||
|
||||
### Exit Criteria
|
||||
- All operational mutations are executed as jobs with audit events
|
||||
- Every mutation supports preflight planning and clear post-conditions
|
||||
|
||||
### Tasks
|
||||
- [x] **5.1** Implement job orchestration primitives in the API
|
||||
- step model, retries, cancellation, timeouts
|
||||
- per-tenant locking to avoid concurrent conflicting operations
|
||||
- [x] **5.2** Implement drain workflow (per service kind where supported)
|
||||
- Runner tenant drain semantics (stop acquiring new work, wait for inflight to converge)
|
||||
- Aggregate/projection drain semantics via admin endpoints where available
|
||||
- Align drain/readiness semantics with the rebalancing contract in [external_prd.md](file:///Users/vlad/Developer/cloudlysis/gateway/external_prd.md)
|
||||
- [x] **5.3** Implement migration workflow
|
||||
- Plan: drain tenant → update placement → reload routing/config
|
||||
- Block unsafe migrations (health/lag/inflight thresholds)
|
||||
- [x] **5.4** Implement UI mutation flows
|
||||
- modal confirmation + reason required
|
||||
- job progress view and audit linkage
|
||||
|
||||
### Tests
|
||||
- [x] **T5.1** Job idempotency: repeated calls with same idempotency key do not duplicate effects
|
||||
- [x] **T5.2** Migration plan preflight produces a deterministic action plan
|
||||
- [x] **T5.3** Safety gates prevent drain/migrate when invariants fail
|
||||
|
||||
---
|
||||
|
||||
## Milestone 6: Deployments + Regression Tooling (Swarm-Aware)
|
||||
|
||||
**Goal:** Make deployments and regressions observable and controllable from the control plane, with strong “what changed when” correlation.
|
||||
|
||||
### Dependencies
|
||||
- Milestone 3 (observability baseline)
|
||||
- Milestone 5 (job orchestration)
|
||||
|
||||
### Exit Criteria
|
||||
- Deployments can be initiated (or at least observed) via the control plane
|
||||
- Grafana shows deploy markers; dashboards can compare old vs new versions
|
||||
|
||||
### Tasks
|
||||
- [x] **6.1** Implement Swarm integration (read-only first, then mutations)
|
||||
- list services, tasks, images, versions
|
||||
- watch update events (start/finish/fail)
|
||||
- [x] **6.2** Implement deployment annotations/events
|
||||
- write Grafana annotations (or emit a deploy event metric) for vertical markers
|
||||
- [x] **6.3** Implement “API Regression & Deployment” dashboard wiring prerequisites
|
||||
- enforce build/version labeling (`*_build_info{service,version,git_sha}=1` pattern)
|
||||
- ensure scrape relabeling includes `image_tag` where possible
|
||||
- [x] **6.4** UI pages
|
||||
- Deployments list + detail
|
||||
- Per-service “what changed” and “rollback” actions (guarded)
|
||||
|
||||
### Tests
|
||||
- [x] **T6.1** Swarm client abstraction can be mocked and produces deterministic results
|
||||
- [x] **T6.2** Annotation writer produces expected Grafana payloads
|
||||
- [x] **T6.3** Version labels are present on all services in a metrics snapshot test
|
||||
|
||||
---
|
||||
|
||||
## Milestone 7: Full Docker Swarm Deployment (Platform + Observability + Control Plane)
|
||||
|
||||
**Goal:** Provide a complete Swarm deployment definition for the platform: services in `../` plus the control plane components and the observability stack.
|
||||
|
||||
### Dependencies
|
||||
- Milestone 1 (Admin UI foundation)
|
||||
- Milestone 2 (Control Plane API foundation)
|
||||
- Milestone 3 (Observability baseline)
|
||||
- Milestone 5 (safe mutations baseline)
|
||||
|
||||
### Exit Criteria
|
||||
- `docker stack deploy` brings up:
|
||||
- Gateway + Aggregate + Projection + Runner (from `../`)
|
||||
- Control Plane API + Admin UI
|
||||
- VictoriaMetrics + vmagent + Grafana + Loki (+ optional promtail)
|
||||
- All services are reachable via overlay networks and pass health checks
|
||||
- Smoke and integration tests pass end-to-end (gated, but required before milestone completion)
|
||||
|
||||
### Tasks
|
||||
- [x] **7.1** Define Swarm networks, secrets, and configs
|
||||
- overlay network segmentation (public vs internal)
|
||||
- secrets for auth/signing keys, NATS credentials (if used), Grafana admin creds (or provisioning)
|
||||
- [x] **7.2** Define Swarm stack files
|
||||
- base platform stack (gateway/aggregate/projection/runner)
|
||||
- control plane stack (api + ui)
|
||||
- observability stack (vm/vmagent/grafana/loki/promtail)
|
||||
- [x] **7.3** Define placement constraints and scaling defaults
|
||||
- node labels for tenant ranges and infrastructure roles
|
||||
- replica defaults and update policies
|
||||
- [x] **7.4** Define deployment verification and rollback playbooks (as executable checks)
|
||||
- post-deploy checks: `/health`, `/ready`, `/metrics`, dashboard provisioning
|
||||
- rollbacks: service update rollback hooks and job safety checks
|
||||
|
||||
### Tests
|
||||
- [x] **T7.1** Stack YAML parses and validates (unit test)
|
||||
- [x] **T7.2** Swarm smoke test (requires `CONTROL_TEST_DOCKER=1`)
|
||||
- deploy stacks
|
||||
- wait for healthy state
|
||||
- verify Grafana dashboards provisioned and VictoriaMetrics receives samples
|
||||
- [x] **T7.3** End-to-end “control plane can see the fleet” test (requires docker)
|
||||
- UI/API can query placement + health snapshots for all services
|
||||
25
control/api/Cargo.toml
Normal file
25
control/api/Cargo.toml
Normal file
@@ -0,0 +1,25 @@
|
||||
[package]
|
||||
name = "api"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
publish = ["madapes"]
|
||||
|
||||
[dependencies]
|
||||
axum = "0.8.6"
|
||||
clap = { version = "4.5.48", features = ["derive", "env"] }
|
||||
jsonwebtoken = "9.3.1"
|
||||
metrics = "0.23.0"
|
||||
metrics-exporter-prometheus = "0.16.0"
|
||||
reqwest = { version = "0.12.23", default-features = false, features = ["json", "rustls-tls"] }
|
||||
serde = { version = "1.0.228", features = ["derive"] }
|
||||
serde_json = "1.0.149"
|
||||
thiserror = "2.0.16"
|
||||
tokio = { version = "1.45.0", features = ["macros", "net", "process", "rt-multi-thread", "signal", "time"] }
|
||||
tower-http = { version = "0.6.6", features = ["trace"] }
|
||||
tracing = "0.1.41"
|
||||
tracing-subscriber = { version = "0.3.20", features = ["env-filter"] }
|
||||
uuid = { version = "1.18.1", features = ["serde", "v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
serde_yaml = "0.9.34"
|
||||
tower = "0.5.2"
|
||||
417
control/api/src/admin.rs
Normal file
417
control/api/src/admin.rs
Normal file
@@ -0,0 +1,417 @@
|
||||
use crate::{
|
||||
AppState, RequestIds,
|
||||
auth::{Principal, has_permission},
|
||||
fleet,
|
||||
job_engine::{JobEngine, StartJobError},
|
||||
jobs::{Job, JobStatus, JobStep},
|
||||
placement::{PlacementResponse, ServiceKind},
|
||||
swarm::{SwarmService, SwarmTask},
|
||||
};
|
||||
use axum::{
|
||||
Json, Router,
|
||||
extract::{Extension, Path, State},
|
||||
http::{HeaderMap, StatusCode},
|
||||
response::IntoResponse,
|
||||
routing::{get, post},
|
||||
};
|
||||
use serde::Deserialize;
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
use uuid::Uuid;
|
||||
|
||||
const HEADER_IDEMPOTENCY_KEY: &str = "idempotency-key";
|
||||
const HEADER_TENANT_ID: &str = "x-tenant-id";
|
||||
|
||||
pub fn admin_router() -> Router<AppState> {
|
||||
Router::new()
|
||||
.route("/whoami", get(whoami))
|
||||
.route("/platform/info", get(platform_info))
|
||||
.route("/fleet/snapshot", get(fleet_snapshot))
|
||||
.route("/tenants", get(list_tenants))
|
||||
.route("/placement/{kind}", get(get_placement))
|
||||
.route("/tenants/echo", get(tenant_echo))
|
||||
.route("/jobs/echo", post(create_echo_job))
|
||||
.route("/jobs/{job_id}", get(get_job))
|
||||
.route("/jobs/{job_id}/cancel", post(cancel_job))
|
||||
.route("/jobs/tenant/drain", post(start_tenant_drain))
|
||||
.route("/jobs/tenant/migrate", post(start_tenant_migrate))
|
||||
.route("/plan/tenant/migrate", post(plan_tenant_migrate))
|
||||
.route("/audit", get(list_audit))
|
||||
.route("/swarm/services", get(list_swarm_services))
|
||||
.route("/swarm/services/{name}/tasks", get(list_swarm_tasks))
|
||||
}
|
||||
|
||||
async fn whoami(Extension(principal): Extension<Principal>) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({
|
||||
"sub": principal.sub,
|
||||
"session_id": principal.session_id,
|
||||
"permissions": principal.permissions,
|
||||
})),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn platform_info(Extension(principal): Extension<Principal>) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({
|
||||
"service": "control-api",
|
||||
})),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn fleet_snapshot(
|
||||
State(state): State<AppState>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
Extension(request_ids): Extension<RequestIds>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let services =
|
||||
fleet::snapshot_with_context(&state.http, &state.fleet_services, Some(&request_ids)).await;
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "services": services })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn get_placement(
|
||||
State(state): State<AppState>,
|
||||
Path(kind): Path<String>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let kind = match kind.as_str() {
|
||||
"aggregate" => ServiceKind::Aggregate,
|
||||
"projection" => ServiceKind::Projection,
|
||||
"runner" => ServiceKind::Runner,
|
||||
_ => return StatusCode::NOT_FOUND.into_response(),
|
||||
};
|
||||
|
||||
let resp: PlacementResponse = state.placement.get_for_kind(kind);
|
||||
|
||||
(StatusCode::OK, Json(resp)).into_response()
|
||||
}
|
||||
|
||||
async fn list_tenants(
|
||||
State(state): State<AppState>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let tenants = state.placement.tenant_summaries();
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "tenants": tenants })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn tenant_echo(
|
||||
headers: HeaderMap,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let tenant_id = headers
|
||||
.get(HEADER_TENANT_ID)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.ok_or(StatusCode::BAD_REQUEST)
|
||||
.and_then(|s| Uuid::parse_str(s).map_err(|_| StatusCode::BAD_REQUEST));
|
||||
|
||||
match tenant_id {
|
||||
Ok(tenant_id) => (
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({
|
||||
"tenant_id": tenant_id,
|
||||
})),
|
||||
)
|
||||
.into_response(),
|
||||
Err(status) => status.into_response(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn create_echo_job(
|
||||
State(state): State<AppState>,
|
||||
headers: HeaderMap,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:write") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let key = headers
|
||||
.get(HEADER_IDEMPOTENCY_KEY)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.ok_or(StatusCode::BAD_REQUEST);
|
||||
|
||||
let key = match key {
|
||||
Ok(k) if !k.is_empty() => k,
|
||||
_ => return StatusCode::BAD_REQUEST.into_response(),
|
||||
};
|
||||
|
||||
let now = now_ms();
|
||||
let job_id = Uuid::new_v4();
|
||||
let job = Job {
|
||||
job_id,
|
||||
status: JobStatus::Succeeded,
|
||||
steps: vec![JobStep {
|
||||
name: "echo".to_string(),
|
||||
status: JobStatus::Succeeded,
|
||||
attempts: 1,
|
||||
error: None,
|
||||
}],
|
||||
error: None,
|
||||
created_at_ms: now,
|
||||
started_at_ms: Some(now),
|
||||
finished_at_ms: Some(now),
|
||||
};
|
||||
|
||||
let job_id = state.jobs.insert_idempotent(key, job);
|
||||
state.audit.record(crate::audit::AuditEvent {
|
||||
ts_ms: now,
|
||||
principal_sub: principal.sub.clone(),
|
||||
action: "job.echo".to_string(),
|
||||
tenant_id: None,
|
||||
reason: "echo".to_string(),
|
||||
job_id: Some(job_id),
|
||||
});
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({
|
||||
"job_id": job_id,
|
||||
})),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn get_job(
|
||||
State(state): State<AppState>,
|
||||
Path(job_id): Path<Uuid>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
match state.jobs.get(job_id) {
|
||||
Some(job) => (StatusCode::OK, Json(job)).into_response(),
|
||||
None => StatusCode::NOT_FOUND.into_response(),
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct TenantDrainRequest {
|
||||
tenant_id: Uuid,
|
||||
reason: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct TenantMigrateRequest {
|
||||
tenant_id: Uuid,
|
||||
runner_target: String,
|
||||
reason: String,
|
||||
}
|
||||
|
||||
async fn start_tenant_drain(
|
||||
State(state): State<AppState>,
|
||||
headers: HeaderMap,
|
||||
Extension(principal): Extension<Principal>,
|
||||
Json(body): Json<TenantDrainRequest>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:write") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let key = headers
|
||||
.get(HEADER_IDEMPOTENCY_KEY)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.ok_or(StatusCode::BAD_REQUEST);
|
||||
let key = match key {
|
||||
Ok(k) if !k.is_empty() => k,
|
||||
_ => return StatusCode::BAD_REQUEST.into_response(),
|
||||
};
|
||||
|
||||
let engine = JobEngine::new(
|
||||
state.jobs.clone(),
|
||||
state.audit.clone(),
|
||||
state.tenant_locks.clone(),
|
||||
);
|
||||
let job_id = match engine.start_tenant_drain(
|
||||
state.clone(),
|
||||
&principal,
|
||||
body.tenant_id,
|
||||
body.reason,
|
||||
key,
|
||||
) {
|
||||
Ok(id) => id,
|
||||
Err(StartJobError::TenantLocked) => return StatusCode::CONFLICT.into_response(),
|
||||
};
|
||||
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "job_id": job_id })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn start_tenant_migrate(
|
||||
State(state): State<AppState>,
|
||||
headers: HeaderMap,
|
||||
Extension(principal): Extension<Principal>,
|
||||
Json(body): Json<TenantMigrateRequest>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:write") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let key = headers
|
||||
.get(HEADER_IDEMPOTENCY_KEY)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.ok_or(StatusCode::BAD_REQUEST);
|
||||
let key = match key {
|
||||
Ok(k) if !k.is_empty() => k,
|
||||
_ => return StatusCode::BAD_REQUEST.into_response(),
|
||||
};
|
||||
|
||||
let engine = JobEngine::new(
|
||||
state.jobs.clone(),
|
||||
state.audit.clone(),
|
||||
state.tenant_locks.clone(),
|
||||
);
|
||||
let job_id = match engine.start_tenant_migrate(
|
||||
state.clone(),
|
||||
&principal,
|
||||
body.tenant_id,
|
||||
body.runner_target,
|
||||
body.reason,
|
||||
key,
|
||||
) {
|
||||
Ok(id) => id,
|
||||
Err(StartJobError::TenantLocked) => return StatusCode::CONFLICT.into_response(),
|
||||
};
|
||||
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "job_id": job_id })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn cancel_job(
|
||||
State(state): State<AppState>,
|
||||
Path(job_id): Path<Uuid>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:write") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
if state.jobs.request_cancel(job_id) {
|
||||
state.audit.record(crate::audit::AuditEvent {
|
||||
ts_ms: now_ms(),
|
||||
principal_sub: principal.sub.clone(),
|
||||
action: "job.cancel".to_string(),
|
||||
tenant_id: None,
|
||||
reason: "cancel requested".to_string(),
|
||||
job_id: Some(job_id),
|
||||
});
|
||||
StatusCode::OK.into_response()
|
||||
} else {
|
||||
StatusCode::NOT_FOUND.into_response()
|
||||
}
|
||||
}
|
||||
|
||||
/// Milliseconds since the Unix epoch; 0 if the system clock reads
/// earlier than the epoch.
fn now_ms() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_millis() as u64,
        Err(_) => 0,
    }
}
|
||||
|
||||
async fn list_audit(
|
||||
State(state): State<AppState>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let events = state.audit.list_recent(200);
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "events": events })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn plan_tenant_migrate(
|
||||
Extension(principal): Extension<Principal>,
|
||||
Json(body): Json<TenantMigrateRequest>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:write") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let _ = (body.tenant_id, body.runner_target, body.reason);
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({
|
||||
"steps": ["preflight", "drain", "update_placement", "reload", "verify"]
|
||||
})),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn list_swarm_services(
|
||||
State(state): State<AppState>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let services: Vec<SwarmService> = state.swarm.list_services();
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "services": services })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn list_swarm_tasks(
|
||||
State(state): State<AppState>,
|
||||
Path(name): Path<String>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let tasks: Vec<SwarmTask> = state.swarm.list_tasks(&name);
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "service": name, "tasks": tasks })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
31
control/api/src/audit.rs
Normal file
31
control/api/src/audit.rs
Normal file
@@ -0,0 +1,31 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// One audit-log entry describing an admin action.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AuditEvent {
    // Event timestamp, milliseconds since the Unix epoch.
    pub ts_ms: u64,
    // `sub` claim of the principal that performed the action.
    pub principal_sub: String,
    // Action identifier, e.g. "tenant.drain" or "job.cancel".
    pub action: String,
    // Tenant the action targeted, when applicable.
    pub tenant_id: Option<Uuid>,
    // Operator-supplied justification.
    pub reason: String,
    // Job spawned by (or targeted by) the action, when applicable.
    pub job_id: Option<Uuid>,
}
|
||||
|
||||
/// Thread-safe, in-memory, append-only store of audit events.
/// Clones share the same underlying Vec (Arc-backed).
#[derive(Clone, Default)]
pub struct AuditStore {
    inner: Arc<Mutex<Vec<AuditEvent>>>,
}
|
||||
|
||||
impl AuditStore {
|
||||
pub fn record(&self, event: AuditEvent) {
|
||||
let mut events = self.inner.lock().expect("audit lock poisoned");
|
||||
events.push(event);
|
||||
}
|
||||
|
||||
pub fn list_recent(&self, limit: usize) -> Vec<AuditEvent> {
|
||||
let events = self.inner.lock().expect("audit lock poisoned");
|
||||
let start = events.len().saturating_sub(limit);
|
||||
events[start..].to_vec()
|
||||
}
|
||||
}
|
||||
78
control/api/src/auth.rs
Normal file
78
control/api/src/auth.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
use crate::AppState;
|
||||
use axum::{
|
||||
extract::State,
|
||||
http::{Request, StatusCode},
|
||||
middleware::Next,
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use jsonwebtoken::{Algorithm, DecodingKey, Validation, decode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Verification settings for incoming bearer tokens.
#[derive(Clone)]
pub struct AuthConfig {
    // HS256 signing secret; when None, `authenticate` answers 503 to
    // every request on protected routes.
    pub hs256_secret: Option<Vec<u8>>,
}

/// The authenticated caller, extracted from a validated JWT and
/// attached to the request via axum extensions.
#[derive(Clone, Debug)]
pub struct Principal {
    pub sub: String,
    pub session_id: String,
    pub permissions: Vec<String>,
}

/// Claim set expected inside the JWT; mirrors `Principal` plus `exp`.
#[derive(Debug, Serialize, Deserialize)]
struct Claims {
    sub: String,
    session_id: String,
    permissions: Vec<String>,
    exp: usize,
}
|
||||
|
||||
pub async fn auth_middleware(
|
||||
State(state): State<AppState>,
|
||||
mut req: Request<axum::body::Body>,
|
||||
next: Next,
|
||||
) -> Response {
|
||||
match authenticate(
|
||||
&state.auth,
|
||||
req.headers().get(axum::http::header::AUTHORIZATION),
|
||||
) {
|
||||
Ok(principal) => {
|
||||
req.extensions_mut().insert(principal);
|
||||
next.run(req).await
|
||||
}
|
||||
Err(status) => status.into_response(),
|
||||
}
|
||||
}
|
||||
|
||||
fn authenticate(
|
||||
cfg: &AuthConfig,
|
||||
auth_header: Option<&axum::http::HeaderValue>,
|
||||
) -> Result<Principal, StatusCode> {
|
||||
let secret = cfg
|
||||
.hs256_secret
|
||||
.as_ref()
|
||||
.ok_or(StatusCode::SERVICE_UNAVAILABLE)?;
|
||||
let header = auth_header.ok_or(StatusCode::UNAUTHORIZED)?;
|
||||
let header_str = header.to_str().map_err(|_| StatusCode::UNAUTHORIZED)?;
|
||||
|
||||
let token = header_str
|
||||
.strip_prefix("Bearer ")
|
||||
.ok_or(StatusCode::UNAUTHORIZED)?;
|
||||
|
||||
let mut validation = Validation::new(Algorithm::HS256);
|
||||
validation.required_spec_claims.insert("exp".to_string());
|
||||
|
||||
let data = decode::<Claims>(token, &DecodingKey::from_secret(secret), &validation)
|
||||
.map_err(|_| StatusCode::UNAUTHORIZED)?;
|
||||
|
||||
Ok(Principal {
|
||||
sub: data.claims.sub,
|
||||
session_id: data.claims.session_id,
|
||||
permissions: data.claims.permissions,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn has_permission(principal: &Principal, permission: &str) -> bool {
|
||||
principal.permissions.iter().any(|p| p == permission)
|
||||
}
|
||||
57
control/api/src/build_info.rs
Normal file
57
control/api/src/build_info.rs
Normal file
@@ -0,0 +1,57 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// A `*_build_info` sample parsed out of Prometheus exposition text.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct BuildInfo {
    pub service: String,
    pub version: String,
    pub git_sha: String,
}
|
||||
|
||||
pub fn extract_build_info(metrics: &str) -> Vec<BuildInfo> {
|
||||
let mut out = Vec::new();
|
||||
for line in metrics.lines() {
|
||||
let line = line.trim();
|
||||
if line.is_empty() || line.starts_with('#') {
|
||||
continue;
|
||||
}
|
||||
let Some((metric_and_labels, value)) = line.split_once(' ') else {
|
||||
continue;
|
||||
};
|
||||
if value.trim() != "1" {
|
||||
continue;
|
||||
}
|
||||
if !metric_and_labels.ends_with('}') {
|
||||
continue;
|
||||
}
|
||||
let Some((name, labels)) = metric_and_labels.split_once('{') else {
|
||||
continue;
|
||||
};
|
||||
if !name.ends_with("_build_info") {
|
||||
continue;
|
||||
}
|
||||
let labels = labels.trim_end_matches('}');
|
||||
let mut service = None;
|
||||
let mut version = None;
|
||||
let mut git_sha = None;
|
||||
for part in labels.split(',') {
|
||||
let Some((k, v)) = part.split_once('=') else {
|
||||
continue;
|
||||
};
|
||||
let v = v.trim().trim_matches('"');
|
||||
match k.trim() {
|
||||
"service" => service = Some(v.to_string()),
|
||||
"version" => version = Some(v.to_string()),
|
||||
"git_sha" => git_sha = Some(v.to_string()),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
if let (Some(service), Some(version), Some(git_sha)) = (service, version, git_sha) {
|
||||
out.push(BuildInfo {
|
||||
service,
|
||||
version,
|
||||
git_sha,
|
||||
});
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
42
control/api/src/deployments.rs
Normal file
42
control/api/src/deployments.rs
Normal file
@@ -0,0 +1,42 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Payload shape for a Grafana annotation.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GrafanaAnnotation {
    // Annotation time, milliseconds since the Unix epoch.
    pub time: i64,
    pub tags: Vec<String>,
    pub text: String,
}
|
||||
|
||||
pub fn build_grafana_deploy_annotation(args: DeployAnnotationArgs) -> GrafanaAnnotation {
|
||||
let mut tags = vec![
|
||||
"cloudlysis".to_string(),
|
||||
"deploy".to_string(),
|
||||
format!("service:{}", args.service),
|
||||
];
|
||||
if let Some(v) = args.version {
|
||||
tags.push(format!("version:{v}"));
|
||||
}
|
||||
if let Some(sha) = args.git_sha {
|
||||
tags.push(format!("git_sha:{sha}"));
|
||||
}
|
||||
|
||||
let text = match (args.version, args.git_sha) {
|
||||
(Some(v), Some(sha)) => format!("deploy {} v={} git_sha={sha}", args.service, v),
|
||||
(Some(v), None) => format!("deploy {} v={}", args.service, v),
|
||||
(None, Some(sha)) => format!("deploy {} git_sha={sha}", args.service),
|
||||
(None, None) => format!("deploy {}", args.service),
|
||||
};
|
||||
|
||||
GrafanaAnnotation {
|
||||
time: args.time_ms,
|
||||
tags,
|
||||
text,
|
||||
}
|
||||
}
|
||||
|
||||
/// Inputs for `build_grafana_deploy_annotation`. `version` and
/// `git_sha` are optional and shape both the tag list and the text.
pub struct DeployAnnotationArgs<'a> {
    pub service: &'a str,
    pub version: Option<&'a str>,
    pub git_sha: Option<&'a str>,
    // Annotation time, milliseconds since the Unix epoch.
    pub time_ms: i64,
}
|
||||
67
control/api/src/fleet.rs
Normal file
67
control/api/src/fleet.rs
Normal file
@@ -0,0 +1,67 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::RequestIds;
|
||||
|
||||
/// A service the control plane probes, addressed by its base URL.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FleetService {
    pub name: String,
    pub base_url: String,
}

/// Result of probing one service's /health, /ready and /metrics
/// endpoints; each flag is true when the request succeeded with a
/// 2xx status.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FleetServiceSnapshot {
    pub name: String,
    pub base_url: String,
    pub health_ok: bool,
    pub ready_ok: bool,
    pub metrics_ok: bool,
}
|
||||
|
||||
/// Probe every fleet service without forwarding tracing headers.
/// Convenience wrapper over `snapshot_with_context` with `ctx = None`.
pub async fn snapshot(
    client: &reqwest::Client,
    services: &[FleetService],
) -> Vec<FleetServiceSnapshot> {
    snapshot_with_context(client, services, None).await
}
|
||||
|
||||
pub async fn snapshot_with_context(
|
||||
client: &reqwest::Client,
|
||||
services: &[FleetService],
|
||||
ctx: Option<&RequestIds>,
|
||||
) -> Vec<FleetServiceSnapshot> {
|
||||
let mut out = Vec::with_capacity(services.len());
|
||||
for svc in services {
|
||||
let base = svc.base_url.trim_end_matches('/');
|
||||
let health_ok = get_ok(client, &format!("{base}/health"), ctx).await;
|
||||
let ready_ok = get_ok(client, &format!("{base}/ready"), ctx).await;
|
||||
let metrics_ok = get_ok(client, &format!("{base}/metrics"), ctx).await;
|
||||
out.push(FleetServiceSnapshot {
|
||||
name: svc.name.clone(),
|
||||
base_url: svc.base_url.clone(),
|
||||
health_ok,
|
||||
ready_ok,
|
||||
metrics_ok,
|
||||
});
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
async fn get_ok(client: &reqwest::Client, url: &str, ctx: Option<&RequestIds>) -> bool {
|
||||
let mut req = client.get(url).timeout(Duration::from_secs(2));
|
||||
if let Some(ctx) = ctx {
|
||||
req = req.header("x-request-id", &ctx.request_id);
|
||||
if let Some(cid) = &ctx.correlation_id {
|
||||
req = req.header("x-correlation-id", cid);
|
||||
}
|
||||
if let Some(tp) = &ctx.traceparent {
|
||||
req = req.header("traceparent", tp);
|
||||
}
|
||||
}
|
||||
|
||||
let res = req.send().await;
|
||||
match res {
|
||||
Ok(r) => r.status().is_success(),
|
||||
Err(_) => false,
|
||||
}
|
||||
}
|
||||
348
control/api/src/job_engine.rs
Normal file
348
control/api/src/job_engine.rs
Normal file
@@ -0,0 +1,348 @@
|
||||
use crate::{
|
||||
AppState, Principal,
|
||||
audit::{AuditEvent, AuditStore},
|
||||
fleet,
|
||||
jobs::{Job, JobStatus, JobStep, JobStore},
|
||||
};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, Mutex},
|
||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||
};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Per-tenant mutual exclusion: maps each locked tenant to the job
/// that holds it. Clones share the same state (Arc-backed).
#[derive(Clone, Default)]
pub struct TenantLocks {
    inner: Arc<Mutex<HashMap<Uuid, Uuid>>>,
}
|
||||
|
||||
impl TenantLocks {
|
||||
pub fn try_lock(&self, tenant_id: Uuid, job_id: Uuid) -> bool {
|
||||
let mut map = self.inner.lock().expect("tenant locks poisoned");
|
||||
if map.contains_key(&tenant_id) {
|
||||
return false;
|
||||
}
|
||||
map.insert(tenant_id, job_id);
|
||||
true
|
||||
}
|
||||
|
||||
pub fn unlock(&self, tenant_id: Uuid, job_id: Uuid) {
|
||||
let mut map = self.inner.lock().expect("tenant locks poisoned");
|
||||
if map.get(&tenant_id).copied() == Some(job_id) {
|
||||
map.remove(&tenant_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Orchestrates background tenant jobs over the shared stores.
/// Cheap to clone; a clone is moved into each spawned job task.
#[derive(Clone)]
pub struct JobEngine {
    pub jobs: JobStore,
    pub audit: AuditStore,
    pub tenant_locks: TenantLocks,
    // Budget applied to every individual job step.
    pub step_timeout: Duration,
}
|
||||
|
||||
impl JobEngine {
    /// Assemble an engine over the shared stores; each job step gets a
    /// 500ms budget before it is failed with "step timeout".
    pub fn new(jobs: JobStore, audit: AuditStore, tenant_locks: TenantLocks) -> Self {
        Self {
            jobs,
            audit,
            tenant_locks,
            step_timeout: Duration::from_millis(500),
        }
    }

    /// Start a background drain job for `tenant_id` (steps:
    /// preflight -> drain -> verify) and return its job id.
    ///
    /// Replaying an `idempotency_key` returns the original job id
    /// without spawning anything; `TenantLocked` is returned when
    /// another job currently holds the tenant lock.
    ///
    /// NOTE(review): two concurrent first-time requests with the same
    /// key can both pass `get_idempotent`; the loser of `try_lock`
    /// then sees `TenantLocked` rather than a replay — confirm callers
    /// treat 409 as retryable.
    pub fn start_tenant_drain(
        &self,
        state: AppState,
        principal: &Principal,
        tenant_id: Uuid,
        reason: String,
        idempotency_key: &str,
    ) -> Result<Uuid, StartJobError> {
        // Idempotency replay: hand back the previously created job.
        if let Some(existing) = self.jobs.get_idempotent(idempotency_key) {
            return Ok(existing);
        }

        // Only one mutating job per tenant may run at a time.
        let job_id = Uuid::new_v4();
        if !self.tenant_locks.try_lock(tenant_id, job_id) {
            return Err(StartJobError::TenantLocked);
        }

        let now = now_ms();
        let job = Job {
            job_id,
            status: JobStatus::Pending,
            steps: vec![step("preflight"), step("drain"), step("verify")],
            error: None,
            created_at_ms: now,
            started_at_ms: None,
            finished_at_ms: None,
        };

        // `inserted` may differ from `job_id` if another request won
        // the idempotency race inside the store.
        let inserted = self.jobs.insert_idempotent(idempotency_key, job);
        self.audit.record(AuditEvent {
            ts_ms: now,
            principal_sub: principal.sub.clone(),
            action: "tenant.drain".to_string(),
            tenant_id: Some(tenant_id),
            reason,
            job_id: Some(inserted),
        });

        // Execute the job off the request path.
        let engine = self.clone();
        tokio::spawn(async move {
            engine
                .run_job(state, inserted, Some(tenant_id), RunSpec::Drain)
                .await;
        });

        Ok(inserted)
    }

    /// Start a background migrate job for `tenant_id` (steps:
    /// preflight -> drain -> update_placement -> reload -> verify)
    /// targeting `runner_target`. Same idempotency and tenant-lock
    /// semantics (and the same race caveat) as `start_tenant_drain`.
    pub fn start_tenant_migrate(
        &self,
        state: AppState,
        principal: &Principal,
        tenant_id: Uuid,
        runner_target: String,
        reason: String,
        idempotency_key: &str,
    ) -> Result<Uuid, StartJobError> {
        if let Some(existing) = self.jobs.get_idempotent(idempotency_key) {
            return Ok(existing);
        }

        let job_id = Uuid::new_v4();
        if !self.tenant_locks.try_lock(tenant_id, job_id) {
            return Err(StartJobError::TenantLocked);
        }

        let now = now_ms();
        let job = Job {
            job_id,
            status: JobStatus::Pending,
            steps: vec![
                step("preflight"),
                step("drain"),
                step("update_placement"),
                step("reload"),
                step("verify"),
            ],
            error: None,
            created_at_ms: now,
            started_at_ms: None,
            finished_at_ms: None,
        };

        let inserted = self.jobs.insert_idempotent(idempotency_key, job);
        self.audit.record(AuditEvent {
            ts_ms: now,
            principal_sub: principal.sub.clone(),
            action: "tenant.migrate".to_string(),
            tenant_id: Some(tenant_id),
            reason,
            job_id: Some(inserted),
        });

        let engine = self.clone();
        tokio::spawn(async move {
            engine
                .run_job(
                    state,
                    inserted,
                    Some(tenant_id),
                    RunSpec::Migrate { runner_target },
                )
                .await;
        });

        Ok(inserted)
    }

    /// Drive every step of `job_id` to completion: honor cancel
    /// requests between steps, enforce the per-step timeout, and
    /// finally release the tenant lock (if any) regardless of outcome.
    async fn run_job(&self, state: AppState, job_id: Uuid, tenant_id: Option<Uuid>, spec: RunSpec) {
        self.jobs.update(job_id, |j| {
            j.status = JobStatus::Running;
            j.started_at_ms = Some(now_ms());
        });

        let mut ok = true;
        for idx in 0.. {
            // Cooperative cancellation, checked before every step.
            if self.jobs.cancel_requested(job_id) {
                ok = false;
                self.jobs.update(job_id, |j| {
                    j.status = JobStatus::Cancelled;
                    j.finished_at_ms = Some(now_ms());
                    j.error = Some("cancelled".to_string());
                    // Every unfinished step is marked cancelled too.
                    for step in &mut j.steps {
                        if step.status == JobStatus::Pending || step.status == JobStatus::Running {
                            step.status = JobStatus::Cancelled;
                        }
                    }
                });
                break;
            }

            // Look up the next step. Running past the last step — or
            // the job record disappearing — exits the loop with `ok`
            // still true (i.e. treated as success).
            let step_name = {
                let Some(job) = self.jobs.get(job_id) else {
                    break;
                };
                let Some(step) = job.steps.get(idx) else {
                    break;
                };
                step.name.clone()
            };

            self.jobs.update(job_id, |j| {
                if let Some(step) = j.steps.get_mut(idx) {
                    step.status = JobStatus::Running;
                    step.attempts += 1;
                }
            });

            // Each step gets at most `step_timeout` wall-clock time.
            let r = tokio::time::timeout(
                self.step_timeout,
                run_step(&state, &spec, &step_name, tenant_id),
            )
            .await;
            match r {
                Ok(Ok(())) => {
                    self.jobs.update(job_id, |j| {
                        if let Some(step) = j.steps.get_mut(idx) {
                            step.status = JobStatus::Succeeded;
                            step.error = None;
                        }
                    });
                }
                // Step reported a failure: record it on step and job.
                Ok(Err(e)) => {
                    ok = false;
                    self.jobs.update(job_id, |j| {
                        if let Some(step) = j.steps.get_mut(idx) {
                            step.status = JobStatus::Failed;
                            step.error = Some(e.clone());
                        }
                        j.status = JobStatus::Failed;
                        j.error = Some(e);
                        j.finished_at_ms = Some(now_ms());
                    });
                    break;
                }
                // Step exceeded `step_timeout`.
                Err(_) => {
                    ok = false;
                    self.jobs.update(job_id, |j| {
                        if let Some(step) = j.steps.get_mut(idx) {
                            step.status = JobStatus::Failed;
                            step.error = Some("step timeout".to_string());
                        }
                        j.status = JobStatus::Failed;
                        j.error = Some("step timeout".to_string());
                        j.finished_at_ms = Some(now_ms());
                    });
                    break;
                }
            }

            if !ok {
                break;
            }

            // Stop once the final step has run.
            let done = match self.jobs.get(job_id) {
                Some(job) => idx + 1 >= job.steps.len(),
                None => true,
            };
            if done {
                break;
            }
        }

        if ok {
            self.jobs.update(job_id, |j| {
                j.status = JobStatus::Succeeded;
                j.finished_at_ms = Some(now_ms());
            });
        }

        // Always release the tenant lock, even on failure or cancel.
        if let Some(tid) = tenant_id {
            self.tenant_locks.unlock(tid, job_id);
        }
    }
}
|
||||
|
||||
/// Why a job could not be started.
#[derive(Debug)]
pub enum StartJobError {
    // Another job currently holds this tenant's lock.
    TenantLocked,
}

/// Which workflow a spawned job executes.
#[derive(Clone)]
enum RunSpec {
    // Drain only: preflight -> drain -> verify.
    Drain,
    // Full migration of the tenant onto `runner_target`.
    Migrate { runner_target: String },
}
|
||||
|
||||
fn step(name: &str) -> JobStep {
|
||||
JobStep {
|
||||
name: name.to_string(),
|
||||
status: JobStatus::Pending,
|
||||
attempts: 0,
|
||||
error: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Milliseconds since the Unix epoch; 0 if the clock predates the epoch.
fn now_ms() -> u64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |elapsed| elapsed.as_millis() as u64)
}
|
||||
|
||||
/// Execute one named job step against live state. Unknown step names
/// are no-ops, so step plans can evolve without breaking older jobs.
async fn run_step(
    state: &AppState,
    spec: &RunSpec,
    step: &str,
    tenant_id: Option<Uuid>,
) -> Result<(), String> {
    match step {
        // Gate on the whole fleet reporting ready before mutating anything.
        "preflight" => {
            let snapshots = fleet::snapshot(&state.http, &state.fleet_services).await;
            if snapshots.iter().any(|s| !s.ready_ok) {
                return Err("preflight failed: fleet not ready".to_string());
            }
            Ok(())
        }
        // Placeholder drain: a fixed 50ms pause stands in for real
        // connection draining. NOTE(review): confirm intent before
        // relying on this in production.
        "drain" => {
            tokio::time::sleep(Duration::from_millis(50)).await;
            Ok(())
        }
        // Point the tenant at its new runner; only meaningful for
        // migrations, a no-op for drains.
        "update_placement" => match spec {
            RunSpec::Migrate { runner_target } => {
                let tenant_id = tenant_id.ok_or_else(|| "missing tenant_id".to_string())?;
                state
                    .placement
                    .update_runner_target(tenant_id, runner_target.clone())
                    .map(|_| ())
            }
            _ => Ok(()),
        },
        // Re-read placement summaries; the result itself is discarded.
        "reload" => {
            let _ = state.placement.tenant_summaries();
            Ok(())
        }
        // Confirm the placement store now lists the new runner target.
        "verify" => match spec {
            RunSpec::Migrate { runner_target } => {
                let tenant_id = tenant_id.ok_or_else(|| "missing tenant_id".to_string())?;
                let summaries = state.placement.tenant_summaries();
                let found = summaries
                    .iter()
                    .find(|t| t.tenant_id == tenant_id)
                    .map(|t| t.runner_targets.iter().any(|x| x == runner_target))
                    .unwrap_or(false);
                if !found {
                    return Err("verify failed: placement not updated".to_string());
                }
                Ok(())
            }
            _ => Ok(()),
        },
        _ => Ok(()),
    }
}
|
||||
122
control/api/src/jobs.rs
Normal file
122
control/api/src/jobs.rs
Normal file
@@ -0,0 +1,122 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{
|
||||
Arc, Mutex,
|
||||
atomic::{AtomicBool, Ordering},
|
||||
},
|
||||
};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Lifecycle state shared by jobs and their individual steps.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum JobStatus {
    Pending,
    Running,
    Succeeded,
    Failed,
    Cancelled,
}

/// A multi-step background job and its full execution record.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Job {
    pub job_id: Uuid,
    pub status: JobStatus,
    // Ordered steps, executed front to back.
    pub steps: Vec<JobStep>,
    // Terminal error message when the job failed or was cancelled.
    pub error: Option<String>,
    // Millisecond Unix timestamps for the job lifecycle.
    pub created_at_ms: u64,
    pub started_at_ms: Option<u64>,
    pub finished_at_ms: Option<u64>,
}

/// One named unit of work within a job.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct JobStep {
    pub name: String,
    pub status: JobStatus,
    // How many times this step has been started.
    pub attempts: u32,
    pub error: Option<String>,
}
|
||||
|
||||
/// Internal record: the mutable job plus its cancel flag. The flag is
/// atomic so it can be read without taking the job mutex.
struct JobRecord {
    job: Mutex<Job>,
    cancel: AtomicBool,
}

/// Shared, thread-safe registry of jobs plus the idempotency-key map.
/// Clones share the same underlying state (Arc-backed).
#[derive(Clone, Default)]
pub struct JobStore {
    inner: Arc<Inner>,
}

#[derive(Default)]
struct Inner {
    // All known jobs by id.
    jobs: Mutex<HashMap<Uuid, Arc<JobRecord>>>,
    // Idempotency key -> job id, used for request replay.
    idempotency: Mutex<HashMap<String, Uuid>>,
}
|
||||
|
||||
impl JobStore {
    /// Snapshot a job by id. Returns None for unknown ids — and also,
    /// silently, when either lock is poisoned (note the `.ok()?` here
    /// vs the `expect` panics in the mutating methods below —
    /// NOTE(review): confirm this asymmetry is intentional).
    pub fn get(&self, job_id: Uuid) -> Option<Job> {
        let jobs = self.inner.jobs.lock().ok()?;
        let rec = jobs.get(&job_id)?.clone();
        rec.job.lock().ok().map(|j| j.clone())
    }

    /// Look up the job previously registered under `key`, if any.
    pub fn get_idempotent(&self, key: &str) -> Option<Uuid> {
        let map = self.inner.idempotency.lock().ok()?;
        map.get(key).copied()
    }

    /// Register `job` under `key`, returning the id actually bound to
    /// the key: the new job's id, or the pre-existing one when the key
    /// was already taken (in which case `job` is dropped).
    ///
    /// Lock order is idempotency -> jobs, and the idempotency lock is
    /// held for the whole call, making check-then-insert atomic.
    pub fn insert_idempotent(&self, key: &str, job: Job) -> Uuid {
        let mut idempotency = self
            .inner
            .idempotency
            .lock()
            .expect("idempotency lock poisoned")
        if let Some(existing) = idempotency.get(key) {
            return *existing;
        }

        let job_id = job.job_id;
        let rec = Arc::new(JobRecord {
            job: Mutex::new(job),
            cancel: AtomicBool::new(false),
        });
        self.inner
            .jobs
            .lock()
            .expect("jobs lock poisoned")
            .insert(job_id, rec);

        idempotency.insert(key.to_string(), job_id);
        job_id
    }

    /// Set the cancel flag for a job; false when the id is unknown.
    /// Cancellation is cooperative: the engine polls `cancel_requested`.
    pub fn request_cancel(&self, job_id: Uuid) -> bool {
        let jobs = self.inner.jobs.lock().expect("jobs lock poisoned");
        let Some(rec) = jobs.get(&job_id) else {
            return false;
        };
        rec.cancel.store(true, Ordering::SeqCst);
        true
    }

    /// True when cancellation has been requested for an existing job.
    pub fn cancel_requested(&self, job_id: Uuid) -> bool {
        let jobs = self.inner.jobs.lock().expect("jobs lock poisoned");
        let Some(rec) = jobs.get(&job_id) else {
            return false;
        };
        rec.cancel.load(Ordering::SeqCst)
    }

    /// Apply `f` to the job under its record lock; false when the id
    /// is unknown. The map lock is held for the duration of `f`.
    pub fn update<F>(&self, job_id: Uuid, f: F) -> bool
    where
        F: FnOnce(&mut Job),
    {
        let jobs = self.inner.jobs.lock().expect("jobs lock poisoned");
        let Some(rec) = jobs.get(&job_id) else {
            return false;
        };
        let mut job = rec.job.lock().expect("job lock poisoned");
        f(&mut job);
        true
    }
}
|
||||
692
control/api/src/lib.rs
Normal file
692
control/api/src/lib.rs
Normal file
@@ -0,0 +1,692 @@
|
||||
mod admin;
|
||||
mod audit;
|
||||
mod auth;
|
||||
mod build_info;
|
||||
mod deployments;
|
||||
mod fleet;
|
||||
mod job_engine;
|
||||
mod jobs;
|
||||
mod placement;
|
||||
mod swarm;
|
||||
|
||||
pub use audit::AuditStore;
|
||||
pub use auth::{AuthConfig, Principal};
|
||||
use axum::{
|
||||
Router,
|
||||
extract::State,
|
||||
http::{HeaderName, HeaderValue, Request, StatusCode},
|
||||
middleware::{Next, from_fn, from_fn_with_state},
|
||||
response::{IntoResponse, Response},
|
||||
routing::get,
|
||||
};
|
||||
pub use build_info::{BuildInfo, extract_build_info};
|
||||
pub use deployments::{DeployAnnotationArgs, GrafanaAnnotation, build_grafana_deploy_annotation};
|
||||
pub use fleet::FleetService;
|
||||
pub use job_engine::TenantLocks;
|
||||
pub use jobs::JobStore;
|
||||
use metrics_exporter_prometheus::PrometheusHandle;
|
||||
pub use placement::PlacementStore;
|
||||
pub use placement::ServiceKind;
|
||||
use std::time::Instant;
|
||||
pub use swarm::SwarmStore;
|
||||
use tower_http::trace::TraceLayer;
|
||||
use tracing::{Span, field};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Shared application state, cloned per request by axum. Every field
/// is itself a cheap handle (Arc-backed store or client).
#[derive(Clone)]
pub struct AppState {
    // Handle used to render /metrics.
    pub prometheus: PrometheusHandle,
    // JWT verification settings.
    pub auth: AuthConfig,
    // In-memory job records and idempotency map.
    pub jobs: JobStore,
    // Append-only audit log.
    pub audit: AuditStore,
    // Per-tenant mutual exclusion for mutating jobs.
    pub tenant_locks: TenantLocks,
    // Outbound HTTP client used for fleet probes.
    pub http: reqwest::Client,
    // Tenant placement backing store.
    pub placement: PlacementStore,
    // Services polled by fleet snapshots.
    pub fleet_services: Vec<FleetService>,
    // Swarm service/task inventory.
    pub swarm: SwarmStore,
}
|
||||
|
||||
/// Correlation identifiers extracted from (or generated for) a
/// request, exposed to handlers via axum extensions.
#[derive(Clone, Debug)]
pub struct RequestIds {
    // Always present; generated when the client sent none.
    pub request_id: String,
    // Forwarded verbatim when the client supplied one.
    pub correlation_id: Option<String>,
    // W3C trace-context header, forwarded verbatim when present.
    pub traceparent: Option<String>,
}

// Header names used for request correlation.
const HEADER_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
const HEADER_CORRELATION_ID: HeaderName = HeaderName::from_static("x-correlation-id");
const HEADER_TRACEPARENT: HeaderName = HeaderName::from_static("traceparent");
|
||||
|
||||
/// Assemble the full router: public probes (/health, /ready,
/// /metrics), the JWT-protected /admin/v1 surface, HTTP tracing, and
/// the request-id middleware (outermost, so every request is tagged).
pub fn build_app(state: AppState) -> Router {
    // The span carries method/path plus correlation ids up front;
    // status and duration are recorded by on_response afterwards.
    let trace = TraceLayer::new_for_http()
        .make_span_with(|req: &Request<_>| {
            let request_id = req
                .headers()
                .get(&HEADER_REQUEST_ID)
                .and_then(|v| v.to_str().ok())
                .unwrap_or("")
                .to_owned();

            let correlation_id = req
                .headers()
                .get(&HEADER_CORRELATION_ID)
                .and_then(|v| v.to_str().ok())
                .unwrap_or("")
                .to_owned();

            tracing::info_span!(
                "http_request",
                request.method = %req.method(),
                request.path = %req.uri().path(),
                request_id = %request_id,
                correlation_id = %correlation_id,
                trace_id = "",
                status = field::Empty,
                duration_ms = field::Empty,
            )
        })
        .on_response(
            |res: &Response, latency: std::time::Duration, span: &Span| {
                span.record("status", field::display(res.status()));
                span.record("duration_ms", field::display(latency.as_millis()));
                tracing::info!("response");
            },
        );

    // Everything nested under /admin/v1 must pass auth_middleware.
    let admin =
        admin::admin_router().layer(from_fn_with_state(state.clone(), auth::auth_middleware));

    Router::new()
        .route("/health", get(health))
        .route("/ready", get(ready))
        .route("/metrics", get(metrics))
        .nest("/admin/v1", admin)
        .with_state(state)
        .layer(trace)
        .layer(from_fn(request_id_middleware))
}
|
||||
|
||||
/// Liveness probe: always 200 while the process can serve requests.
async fn health() -> impl IntoResponse {
    (StatusCode::OK, "ok")
}

/// Readiness probe: this service has no warm-up phase, so it reports
/// ready whenever it is alive.
async fn ready() -> impl IntoResponse {
    (StatusCode::OK, "ready")
}

/// Prometheus scrape endpoint: render the shared registry as text.
async fn metrics(State(state): State<AppState>) -> impl IntoResponse {
    (StatusCode::OK, state.prometheus.render())
}
|
||||
|
||||
/// Outermost middleware: ensure every request has an x-request-id
/// (generating one when absent), expose the ids to handlers via the
/// `RequestIds` extension, echo the ids back on the response, and
/// record one latency histogram sample per request.
async fn request_id_middleware(mut req: Request<axum::body::Body>, next: Next) -> Response {
    // Reuse the caller's id when present so traces line up end to end.
    let request_id = req
        .headers()
        .get(&HEADER_REQUEST_ID)
        .and_then(|v| v.to_str().ok())
        .map(|s| s.to_owned())
        .unwrap_or_else(|| Uuid::new_v4().to_string());

    let correlation_id = req
        .headers()
        .get(&HEADER_CORRELATION_ID)
        .and_then(|v| v.to_str().ok())
        .map(|s| s.to_owned());

    let traceparent = req
        .headers()
        .get(&HEADER_TRACEPARENT)
        .and_then(|v| v.to_str().ok())
        .map(|s| s.to_owned());

    // Inject the generated id so downstream layers see it too.
    if req.headers().get(&HEADER_REQUEST_ID).is_none()
        && let Ok(v) = HeaderValue::from_str(&request_id)
    {
        req.headers_mut().insert(HEADER_REQUEST_ID.clone(), v);
    }

    req.extensions_mut().insert(RequestIds {
        request_id: request_id.clone(),
        correlation_id: correlation_id.clone(),
        traceparent: traceparent.clone(),
    });

    let start = Instant::now();
    let mut res = next.run(req).await;

    // Echo the ids back so clients can correlate responses with logs.
    if let Ok(v) = HeaderValue::from_str(&request_id) {
        res.headers_mut().insert(HEADER_REQUEST_ID.clone(), v);
    }

    if let Some(correlation_id) = correlation_id
        && let Ok(v) = HeaderValue::from_str(&correlation_id)
    {
        res.headers_mut().insert(HEADER_CORRELATION_ID.clone(), v);
    }

    metrics::histogram!("http_request_duration_ms").record(start.elapsed().as_millis() as f64);
    res
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! In-process integration tests: every test builds the full axum `Router`
    //! via `build_app` and drives it with `tower::ServiceExt::oneshot`, so no
    //! TCP listener or external process is involved.
    use super::*;
    use crate::jobs::JobStatus;
    use axum::{
        body::Body,
        http::{Request, StatusCode, header},
    };
    use jsonwebtoken::{EncodingKey, Header, encode};
    use metrics_exporter_prometheus::PrometheusBuilder;
    use serde::Serialize;
    use std::fs;
    use std::path::PathBuf;
    use std::sync::OnceLock;
    use tower::ServiceExt;
    use uuid::Uuid;

    // A metrics recorder can be installed only once per process, so all tests
    // share one PrometheusHandle through this OnceLock.
    static HANDLE: OnceLock<PrometheusHandle> = OnceLock::new();

    // Minimal claim set matching what the auth middleware reads from a JWT.
    #[derive(Serialize)]
    struct TestClaims {
        sub: String,
        session_id: String,
        permissions: Vec<String>,
        exp: usize,
    }

    // App with an empty fleet (no downstream services to preflight against).
    fn test_app() -> Router {
        test_app_with_fleet(vec![])
    }

    // Build a fully wired app: shared prometheus handle, HS256 test secret,
    // fresh in-memory stores, and a per-test copy of the dev placement file
    // so tests that mutate placement do not interfere with each other.
    fn test_app_with_fleet(fleet_services: Vec<FleetService>) -> Router {
        let handle = HANDLE
            .get_or_init(|| {
                PrometheusBuilder::new()
                    .install_recorder()
                    .expect("failed to install prometheus recorder")
            })
            .clone();

        let placement_path = temp_placement_file();

        build_app(AppState {
            prometheus: handle,
            auth: AuthConfig {
                hs256_secret: Some(b"test_secret".to_vec()),
            },
            jobs: JobStore::default(),
            audit: AuditStore::default(),
            tenant_locks: TenantLocks::default(),
            http: reqwest::Client::new(),
            placement: PlacementStore::new(placement_path),
            fleet_services,
            swarm: SwarmStore::new(repo_root().join("swarm/dev.json")),
        })
    }

    // Repository root, two levels above this crate's manifest directory
    // (<root>/control/api -> <root>).
    fn repo_root() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .and_then(|p| p.parent())
            .expect("api crate should live under repo root")
            .to_path_buf()
    }

    // Copy placement/dev.json into a unique temp file (pid + uuid in the
    // name) so each test gets an isolated, writable placement store.
    fn temp_placement_file() -> PathBuf {
        let root = repo_root();
        let src = root.join("placement/dev.json");
        let mut dst = std::env::temp_dir();
        dst.push(format!(
            "cloudlysis-control-placement-{}-{}.json",
            std::process::id(),
            Uuid::new_v4()
        ));
        let raw = fs::read_to_string(src).expect("missing placement/dev.json");
        fs::write(&dst, raw).expect("failed to write temp placement file");
        dst
    }

    // Compile-time check helper: only compiles when T: Send + Sync.
    fn assert_send_sync<T: Send + Sync>() {}

    #[test]
    fn core_state_types_are_send_sync() {
        assert_send_sync::<AppState>();
        assert_send_sync::<JobStore>();
        assert_send_sync::<AuthConfig>();
    }

    #[tokio::test]
    async fn health_returns_200() {
        let res = test_app()
            .oneshot(
                Request::builder()
                    .uri("/health")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res.status(), StatusCode::OK);
    }

    #[tokio::test]
    async fn ready_returns_200() {
        let res = test_app()
            .oneshot(
                Request::builder()
                    .uri("/ready")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res.status(), StatusCode::OK);
    }

    #[tokio::test]
    async fn metrics_returns_200() {
        let res = test_app()
            .oneshot(
                Request::builder()
                    .uri("/metrics")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res.status(), StatusCode::OK);
    }

    // Mint an HS256 token (signed with the test secret above) carrying the
    // given permissions, valid for 60 seconds.
    fn make_token(perms: &[&str]) -> String {
        let exp = (std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_secs()
            + 60) as usize;

        encode(
            &Header::default(),
            &TestClaims {
                sub: "user_1".to_string(),
                session_id: "sess_1".to_string(),
                permissions: perms.iter().map(|p| (*p).to_string()).collect(),
                exp,
            },
            &EncodingKey::from_secret(b"test_secret"),
        )
        .unwrap()
    }

    // No Authorization header at all -> 401.
    #[tokio::test]
    async fn unauthorized_admin_calls_return_401() {
        let res = test_app()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/platform/info")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res.status(), StatusCode::UNAUTHORIZED);
    }

    // Valid token but missing the write permission -> 403.
    #[tokio::test]
    async fn forbidden_admin_calls_return_403() {
        let token = make_token(&["control:read"]);
        let res = test_app()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/echo")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "k1")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res.status(), StatusCode::FORBIDDEN);
    }

    // Tenant-scoped routes reject requests lacking the tenant header.
    #[tokio::test]
    async fn tenant_scoped_endpoints_require_x_tenant_id() {
        let token = make_token(&["control:read"]);
        let res = test_app()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/tenants/echo")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res.status(), StatusCode::BAD_REQUEST);
    }

    // Same idempotency-key twice -> same job_id both times.
    #[tokio::test]
    async fn job_create_is_idempotent() {
        let token = make_token(&["control:write"]);
        let app = test_app();
        let res1 = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/echo")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "same-key")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res1.status(), StatusCode::OK);
        let body1 = axum::body::to_bytes(res1.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let v1: serde_json::Value = serde_json::from_slice(&body1).unwrap();
        let id1 = Uuid::parse_str(v1.get("job_id").unwrap().as_str().unwrap()).unwrap();

        let res2 = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/echo")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "same-key")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res2.status(), StatusCode::OK);
        let body2 = axum::body::to_bytes(res2.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let v2: serde_json::Value = serde_json::from_slice(&body2).unwrap();
        let id2 = Uuid::parse_str(v2.get("job_id").unwrap().as_str().unwrap()).unwrap();

        assert_eq!(id1, id2);
    }

    // Poll the job endpoint every 10ms until the job leaves
    // Pending/Running; give up after 500ms and report Failed so a stuck
    // job cannot hang the test suite.
    async fn wait_for_terminal_status(app: Router, job_id: Uuid) -> JobStatus {
        let start = tokio::time::Instant::now();
        loop {
            let res = app
                .clone()
                .oneshot(
                    Request::builder()
                        .uri(format!("/admin/v1/jobs/{job_id}"))
                        .header(
                            header::AUTHORIZATION,
                            format!("Bearer {}", make_token(&["control:read"])),
                        )
                        .body(Body::empty())
                        .unwrap(),
                )
                .await
                .unwrap();

            if res.status() == StatusCode::OK {
                let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
                    .await
                    .unwrap();
                let job: crate::jobs::Job = serde_json::from_slice(&body).unwrap();
                if job.status != JobStatus::Pending && job.status != JobStatus::Running {
                    return job.status;
                }
            }

            if start.elapsed() > std::time::Duration::from_millis(500) {
                return JobStatus::Failed;
            }
            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
        }
    }

    // Replaying a tenant mutation with the same idempotency-key must
    // return the original job rather than enqueueing a second one.
    #[tokio::test]
    async fn tenant_job_idempotency_does_not_duplicate_effects() {
        let token = make_token(&["control:write", "control:read"]);
        let app = test_app();
        let tenant_id = Uuid::new_v4();

        let body = serde_json::json!({
            "tenant_id": tenant_id,
            "reason": "test",
        });

        let res1 = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/tenant/drain")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "same-key")
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(body.to_string()))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res1.status(), StatusCode::OK);

        let res2 = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/tenant/drain")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "same-key")
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(body.to_string()))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res2.status(), StatusCode::OK);

        let b1 = axum::body::to_bytes(res1.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let b2 = axum::body::to_bytes(res2.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let v1: serde_json::Value = serde_json::from_slice(&b1).unwrap();
        let v2: serde_json::Value = serde_json::from_slice(&b2).unwrap();
        assert_eq!(v1.get("job_id"), v2.get("job_id"));
    }

    // While a drain holds the tenant lock, a migrate for the same tenant
    // (different idempotency-key) must be rejected with 409.
    #[tokio::test]
    async fn tenant_lock_prevents_concurrent_mutations() {
        let token = make_token(&["control:write", "control:read"]);
        let app = test_app();
        let tenant_id = Uuid::new_v4();

        let res1 = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/tenant/drain")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "k1")
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(
                        serde_json::json!({ "tenant_id": tenant_id, "reason": "r" }).to_string(),
                    ))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res1.status(), StatusCode::OK);

        let res2 = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/tenant/migrate")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "k2")
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(
                        serde_json::json!({
                            "tenant_id": tenant_id,
                            "runner_target": "node-2",
                            "reason": "r2"
                        })
                        .to_string(),
                    ))
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res2.status(), StatusCode::CONFLICT);
    }

    // With an unreachable fleet service configured, the migrate job is
    // accepted (200) but its preflight step fails asynchronously.
    #[tokio::test]
    async fn migrate_preflight_fails_when_fleet_not_ready() {
        let token = make_token(&["control:write", "control:read"]);
        let app = test_app_with_fleet(vec![FleetService {
            name: "unreachable".to_string(),
            // Port 1 on loopback: connection is refused immediately.
            base_url: "http://127.0.0.1:1".to_string(),
        }]);

        let tenant_id = Uuid::new_v4();
        let res = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/tenant/migrate")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "k3")
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(
                        serde_json::json!({
                            "tenant_id": tenant_id,
                            "runner_target": "node-2",
                            "reason": "r"
                        })
                        .to_string(),
                    ))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res.status(), StatusCode::OK);

        let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
        let job_id = Uuid::parse_str(v.get("job_id").unwrap().as_str().unwrap()).unwrap();

        let status = wait_for_terminal_status(app, job_id).await;
        assert_eq!(status, JobStatus::Failed);
    }

    // Cancelling an in-flight migrate drives the job to Cancelled.
    #[tokio::test]
    async fn cancel_marks_job_cancelled() {
        let token = make_token(&["control:write", "control:read"]);
        let app = test_app();
        let tenant_id = Uuid::new_v4();

        let res = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/tenant/migrate")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "k4")
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(
                        serde_json::json!({
                            "tenant_id": tenant_id,
                            "runner_target": "node-2",
                            "reason": "r"
                        })
                        .to_string(),
                    ))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res.status(), StatusCode::OK);

        let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
        let job_id = Uuid::parse_str(v.get("job_id").unwrap().as_str().unwrap()).unwrap();

        let res = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri(format!("/admin/v1/jobs/{job_id}/cancel"))
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res.status(), StatusCode::OK);

        let status = wait_for_terminal_status(app, job_id).await;
        assert_eq!(status, JobStatus::Cancelled);
    }

    // The plan endpoint always returns the same fixed step sequence.
    #[tokio::test]
    async fn migration_plan_is_deterministic() {
        let token = make_token(&["control:write"]);
        let app = test_app();
        let tenant_id = Uuid::new_v4();

        let res = app
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/plan/tenant/migrate")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(
                        serde_json::json!({
                            "tenant_id": tenant_id,
                            "runner_target": "node-2",
                            "reason": "r"
                        })
                        .to_string(),
                    ))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res.status(), StatusCode::OK);

        let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
        assert_eq!(
            v.get("steps").unwrap(),
            &serde_json::json!(["preflight", "drain", "update_placement", "reload", "verify"])
        );
    }
}
|
||||
109
control/api/src/main.rs
Normal file
109
control/api/src/main.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
use clap::Parser;
|
||||
use metrics_exporter_prometheus::PrometheusBuilder;
|
||||
use std::net::SocketAddr;
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
// Command-line / environment configuration for the control-api binary.
// NOTE: plain `//` comments on purpose — `///` doc comments on clap
// fields would change the generated --help text.
#[derive(Parser, Debug)]
#[command(name = "control-api")]
struct Args {
    // Socket address the HTTP server binds to; settable via the
    // CONTROL_API_ADDR env var, defaulting to localhost:8080.
    #[arg(long, env = "CONTROL_API_ADDR", default_value = "127.0.0.1:8080")]
    addr: SocketAddr,
}
|
||||
|
||||
#[tokio::main]
async fn main() {
    let args = Args::parse();

    // Structured logging: honor RUST_LOG if set, otherwise default to "info".
    tracing_subscriber::fmt()
        .with_env_filter(
            EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
        )
        .init();

    // Install the global Prometheus recorder with millisecond histogram
    // buckets (1ms .. 5s). Must happen before any metrics are emitted.
    let recorder = PrometheusBuilder::new()
        .set_buckets(&[
            1.0, 2.5, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, 2500.0, 5000.0,
        ])
        .expect("invalid prometheus buckets")
        .install_recorder()
        .expect("failed to install prometheus recorder");

    // Shared outbound HTTP client (fleet probing etc.).
    let http = reqwest::Client::builder()
        .user_agent("cloudlysis-control-api")
        .build()
        .expect("failed to build http client");

    // File-backed state paths, overridable via env; relative defaults
    // resolve against the process working directory.
    let placement_path = std::env::var("CONTROL_PLACEMENT_PATH")
        .ok()
        .unwrap_or_else(|| "placement/dev.json".to_string())
        .into();

    let swarm_path = std::env::var("CONTROL_SWARM_STATE_PATH")
        .ok()
        .unwrap_or_else(|| "swarm/dev.json".to_string())
        .into();

    // The API always lists itself as a fleet member so fleet checks
    // cover this service too.
    let self_url = std::env::var("CONTROL_SELF_URL")
        .ok()
        .unwrap_or_else(|| "http://127.0.0.1:8080".to_string());

    let mut fleet_services = vec![api::FleetService {
        name: "control-api".to_string(),
        base_url: self_url,
    }];
    // Extra services come from CONTROL_FLEET_SERVICES ("name=url,..."),
    // parsed best-effort.
    if let Ok(spec) = std::env::var("CONTROL_FLEET_SERVICES") {
        fleet_services.extend(parse_fleet_services(&spec));
    }

    let app = api::build_app(api::AppState {
        prometheus: recorder,
        auth: api::AuthConfig {
            // Auth is effectively disabled when the secret env var is
            // absent (hs256_secret = None).
            hs256_secret: std::env::var("CONTROL_GATEWAY_JWT_HS256_SECRET")
                .ok()
                .map(|s| s.into_bytes()),
        },
        jobs: api::JobStore::default(),
        audit: api::AuditStore::default(),
        tenant_locks: api::TenantLocks::default(),
        http,
        placement: api::PlacementStore::new(placement_path),
        fleet_services,
        swarm: api::SwarmStore::new(swarm_path),
    });

    let listener = tokio::net::TcpListener::bind(args.addr)
        .await
        .expect("failed to bind");

    tracing::info!(addr = %args.addr, "control api listening");

    // Serve until Ctrl-C, then shut down gracefully.
    axum::serve(listener, app)
        .with_graceful_shutdown(shutdown_signal())
        .await
        .expect("server failed");
}
|
||||
|
||||
async fn shutdown_signal() {
|
||||
let _ = tokio::signal::ctrl_c().await;
|
||||
}
|
||||
|
||||
/// Parse a `name=url,name=url,...` spec into fleet service entries.
///
/// Whitespace around entries, names, and URLs is trimmed. Entries that are
/// empty, lack a `=` separator, or have an empty name or URL are silently
/// dropped (best-effort env-var parsing).
fn parse_fleet_services(spec: &str) -> Vec<api::FleetService> {
    let mut services = Vec::new();
    for entry in spec.split(',') {
        let entry = entry.trim();
        if entry.is_empty() {
            continue;
        }
        let Some((name, url)) = entry.split_once('=') else {
            continue;
        };
        let (name, url) = (name.trim(), url.trim());
        if !name.is_empty() && !url.is_empty() {
            services.push(api::FleetService {
                name: name.to_string(),
                base_url: url.to_string(),
            });
        }
    }
    services
}
|
||||
227
control/api/src/placement.rs
Normal file
227
control/api/src/placement.rs
Normal file
@@ -0,0 +1,227 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
collections::BTreeMap,
|
||||
fs,
|
||||
path::{Path, PathBuf},
|
||||
sync::{Arc, RwLock},
|
||||
time::SystemTime,
|
||||
};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Which section of the placement file a caller is asking about; serialized
/// in snake_case ("aggregate" / "projection" / "runner").
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ServiceKind {
    /// Selects `aggregate_placement`.
    Aggregate,
    /// Selects `projection_placement`.
    Projection,
    /// Selects `runner_placement`.
    Runner,
}
|
||||
|
||||
/// On-disk JSON shape of a placement file. All fields are optional: a
/// missing revision defaults to "dev" and a missing section is treated as
/// having no placements (see `for_kind`).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PlacementFile {
    pub revision: Option<String>,
    pub aggregate_placement: Option<PlacementKind>,
    pub projection_placement: Option<PlacementKind>,
    pub runner_placement: Option<PlacementKind>,
}
|
||||
|
||||
/// One section of the placement file: the per-tenant placements for a
/// single service kind.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PlacementKind {
    pub placements: Vec<TenantPlacement>,
}
|
||||
|
||||
/// Mapping from one tenant to the node/target names it is placed on.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TenantPlacement {
    pub tenant_id: Uuid,
    pub targets: Vec<String>,
}
|
||||
|
||||
/// API response for a single service kind: the file revision plus that
/// kind's tenant placements.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PlacementResponse {
    pub kind: ServiceKind,
    pub revision: String,
    pub placements: Vec<TenantPlacement>,
}
|
||||
|
||||
impl PlacementFile {
|
||||
pub fn load(path: &Path) -> Option<Self> {
|
||||
let raw = fs::read_to_string(path).ok()?;
|
||||
serde_json::from_str(&raw).ok()
|
||||
}
|
||||
|
||||
pub fn for_kind(&self, kind: ServiceKind) -> PlacementResponse {
|
||||
let revision = self.revision.clone().unwrap_or_else(|| "dev".to_string());
|
||||
let placements = match kind {
|
||||
ServiceKind::Aggregate => self
|
||||
.aggregate_placement
|
||||
.as_ref()
|
||||
.map(|p| p.placements.clone())
|
||||
.unwrap_or_default(),
|
||||
ServiceKind::Projection => self
|
||||
.projection_placement
|
||||
.as_ref()
|
||||
.map(|p| p.placements.clone())
|
||||
.unwrap_or_default(),
|
||||
ServiceKind::Runner => self
|
||||
.runner_placement
|
||||
.as_ref()
|
||||
.map(|p| p.placements.clone())
|
||||
.unwrap_or_default(),
|
||||
};
|
||||
|
||||
PlacementResponse {
|
||||
kind,
|
||||
revision,
|
||||
placements,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Cheaply cloneable handle to the shared, file-backed placement state;
/// all clones point at the same `Inner` behind one `RwLock`.
#[derive(Clone)]
pub struct PlacementStore {
    inner: Arc<RwLock<Inner>>,
}
|
||||
|
||||
/// Mutable placement state guarded by the store's lock.
struct Inner {
    // Path of the JSON placement file on disk.
    path: PathBuf,
    // mtime observed at the last (attempted) load; used to skip re-parsing.
    last_modified: Option<SystemTime>,
    // Last successfully parsed file, if any.
    cached: Option<PlacementFile>,
}
|
||||
|
||||
impl PlacementStore {
    /// Create a store backed by the JSON file at `path`. Nothing is read
    /// until the first accessor call.
    pub fn new(path: PathBuf) -> Self {
        Self {
            inner: Arc::new(RwLock::new(Inner {
                path,
                last_modified: None,
                cached: None,
            })),
        }
    }

    /// Placements for one service kind, reloading the file first if it
    /// changed on disk. Falls back to an empty "dev" response when the file
    /// is missing or unparsable.
    pub fn get_for_kind(&self, kind: ServiceKind) -> PlacementResponse {
        // Write lock even on the read path: reload_if_changed may refresh
        // the cache.
        let mut inner = self.inner.write().expect("placement lock poisoned");
        inner.reload_if_changed();
        match inner.cached.as_ref() {
            Some(p) => p.for_kind(kind),
            None => PlacementResponse {
                kind,
                revision: "dev".to_string(),
                placements: vec![],
            },
        }
    }

    /// Per-tenant view across all three placement sections. Output is
    /// ordered by tenant id (BTreeMap iteration), so it is deterministic.
    pub fn tenant_summaries(&self) -> Vec<TenantSummary> {
        let mut inner = self.inner.write().expect("placement lock poisoned");
        inner.reload_if_changed();

        let Some(p) = inner.cached.as_ref() else {
            return vec![];
        };

        let mut map: BTreeMap<Uuid, TenantSummary> = BTreeMap::new();

        // Walk each section in turn; the first sighting of a tenant creates
        // an empty summary and each kind then fills in its own target list.
        for (kind, placements) in [
            (
                ServiceKind::Aggregate,
                p.for_kind(ServiceKind::Aggregate).placements,
            ),
            (
                ServiceKind::Projection,
                p.for_kind(ServiceKind::Projection).placements,
            ),
            (
                ServiceKind::Runner,
                p.for_kind(ServiceKind::Runner).placements,
            ),
        ] {
            for tp in placements {
                let entry = map.entry(tp.tenant_id).or_insert_with(|| TenantSummary {
                    tenant_id: tp.tenant_id,
                    aggregate_targets: vec![],
                    projection_targets: vec![],
                    runner_targets: vec![],
                });
                match kind {
                    ServiceKind::Aggregate => entry.aggregate_targets = tp.targets,
                    ServiceKind::Projection => entry.projection_targets = tp.targets,
                    ServiceKind::Runner => entry.runner_targets = tp.targets,
                }
            }
        }

        map.into_values().collect()
    }

    /// Point a tenant's runner placement at a single target, bump the file
    /// revision, and persist via write-temp-then-rename so readers never
    /// see a partially written file.
    ///
    /// Returns the new revision string, or an error message on
    /// serialization / I/O failure.
    pub fn update_runner_target(
        &self,
        tenant_id: Uuid,
        runner_target: String,
    ) -> Result<String, String> {
        let mut inner = self.inner.write().expect("placement lock poisoned");
        inner.reload_if_changed();

        // Start from the current file, or a fresh empty layout when none
        // could be loaded.
        let mut file = inner.cached.clone().unwrap_or(PlacementFile {
            revision: Some("dev".to_string()),
            aggregate_placement: Some(PlacementKind { placements: vec![] }),
            projection_placement: Some(PlacementKind { placements: vec![] }),
            runner_placement: Some(PlacementKind { placements: vec![] }),
        });

        let mut runner = file
            .runner_placement
            .take()
            .unwrap_or(PlacementKind { placements: vec![] });

        // Replace the tenant's targets in place, or append a new entry.
        if let Some(existing) = runner
            .placements
            .iter_mut()
            .find(|p| p.tenant_id == tenant_id)
        {
            existing.targets = vec![runner_target];
        } else {
            runner.placements.push(TenantPlacement {
                tenant_id,
                targets: vec![runner_target],
            });
        }

        // Keep file contents deterministic regardless of update order.
        runner.placements.sort_by_key(|p| p.tenant_id);
        file.runner_placement = Some(runner);

        let revision = format!("rev-{}", Uuid::new_v4());
        file.revision = Some(revision.clone());

        let raw = serde_json::to_string_pretty(&file).map_err(|e| e.to_string())?;
        let tmp = inner.path.with_extension("json.tmp");
        fs::write(&tmp, raw).map_err(|e| e.to_string())?;
        fs::rename(&tmp, &inner.path).map_err(|e| e.to_string())?;

        // Clearing last_modified forces the next reload_if_changed to
        // re-stat the file; the in-memory cache is refreshed immediately.
        inner.last_modified = None;
        inner.cached = Some(file);

        Ok(revision)
    }
}
|
||||
|
||||
impl Inner {
|
||||
fn reload_if_changed(&mut self) {
|
||||
let meta = fs::metadata(&self.path).ok();
|
||||
let modified = meta.and_then(|m| m.modified().ok());
|
||||
|
||||
if self.cached.is_some() && modified.is_some() && modified == self.last_modified {
|
||||
return;
|
||||
}
|
||||
|
||||
self.last_modified = modified;
|
||||
self.cached = PlacementFile::load(&self.path);
|
||||
}
|
||||
}
|
||||
|
||||
/// Combined per-tenant placement view: one target list per service kind,
/// as produced by `PlacementStore::tenant_summaries`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TenantSummary {
    pub tenant_id: Uuid,
    pub aggregate_targets: Vec<String>,
    pub projection_targets: Vec<String>,
    pub runner_targets: Vec<String>,
}
|
||||
62
control/api/src/swarm.rs
Normal file
62
control/api/src/swarm.rs
Normal file
@@ -0,0 +1,62 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{fs, path::Path};
|
||||
|
||||
/// One service entry from the swarm state file. All fields except the name
/// are optional; values are kept as plain strings as recorded in the file.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmService {
    pub name: String,
    pub image: Option<String>,
    pub mode: Option<String>,
    pub replicas: Option<String>,
    pub updated_at: Option<String>,
}
|
||||
|
||||
/// One task entry from the swarm state file; `service` links it to a
/// `SwarmService` by name (see `SwarmStore::list_tasks`).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmTask {
    pub id: String,
    pub service: String,
    pub node: Option<String>,
    pub desired_state: Option<String>,
    pub current_state: Option<String>,
    pub error: Option<String>,
}
|
||||
|
||||
/// Top-level JSON shape of a swarm state file: flat lists of services and
/// tasks.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmStateFile {
    pub services: Vec<SwarmService>,
    pub tasks: Vec<SwarmTask>,
}
|
||||
|
||||
/// Read-only accessor for a swarm state file; holds only the path and
/// re-reads the file on each query (no caching).
#[derive(Clone)]
pub struct SwarmStore {
    path: std::path::PathBuf,
}
|
||||
|
||||
impl SwarmStore {
|
||||
pub fn new(path: std::path::PathBuf) -> Self {
|
||||
Self { path }
|
||||
}
|
||||
|
||||
pub fn list_services(&self) -> Vec<SwarmService> {
|
||||
self.load().map(|s| s.services).unwrap_or_default()
|
||||
}
|
||||
|
||||
pub fn list_tasks(&self, service_name: &str) -> Vec<SwarmTask> {
|
||||
self.load()
|
||||
.map(|s| {
|
||||
s.tasks
|
||||
.into_iter()
|
||||
.filter(|t| t.service == service_name)
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
fn load(&self) -> Option<SwarmStateFile> {
|
||||
load_state(&self.path)
|
||||
}
|
||||
}
|
||||
|
||||
fn load_state(path: &Path) -> Option<SwarmStateFile> {
|
||||
let raw = fs::read_to_string(path).ok()?;
|
||||
serde_json::from_str(&raw).ok()
|
||||
}
|
||||
16
control/api/tests/annotations.rs
Normal file
16
control/api/tests/annotations.rs
Normal file
@@ -0,0 +1,16 @@
|
||||
#[test]
fn annotation_writer_produces_expected_grafana_payload() {
    // Build a deploy annotation and verify the Grafana payload: fixed
    // timestamp, one tag per metadata field, human-readable text.
    let annotation = api::build_grafana_deploy_annotation(api::DeployAnnotationArgs {
        service: "gateway",
        version: Some("1.2.3"),
        git_sha: Some("abc123"),
        time_ms: 1234567890,
    });

    assert_eq!(annotation.time, 1234567890);
    for expected in ["deploy", "service:gateway", "version:1.2.3", "git_sha:abc123"] {
        assert!(annotation.tags.iter().any(|t| t == expected));
    }
    assert!(annotation.text.contains("deploy gateway"));
}
|
||||
39
control/api/tests/build_info.rs
Normal file
39
control/api/tests/build_info.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
#[test]
fn build_info_parser_extracts_expected_labels() {
    // Two *_build_info gauges plus HELP/TYPE noise and an unrelated
    // metric; the parser must pick up exactly the build-info samples.
    let metrics = r#"
# HELP gateway_build_info build info
# TYPE gateway_build_info gauge
gateway_build_info{service="gateway",version="1.2.3",git_sha="abc"} 1
runner_build_info{service="runner",version="2.0.0",git_sha="def"} 1
unrelated_metric 5
"#;

    let info = api::extract_build_info(metrics);
    assert_eq!(info.len(), 2);

    let has = |service: &str, version: &str, git_sha: &str| {
        info.iter()
            .any(|i| i.service == service && i.version == version && i.git_sha == git_sha)
    };
    assert!(has("gateway", "1.2.3", "abc"));
    assert!(has("runner", "2.0.0", "def"));
}
|
||||
|
||||
#[test]
fn build_info_snapshot_has_required_services() {
    // A snapshot with all four core services present.
    let metrics = r#"
gateway_build_info{service="gateway",version="1.2.3",git_sha="abc"} 1
aggregate_build_info{service="aggregate",version="1.0.0",git_sha="aaa"} 1
projection_build_info{service="projection",version="1.0.0",git_sha="bbb"} 1
runner_build_info{service="runner",version="2.0.0",git_sha="ccc"} 1
"#;

    let info = api::extract_build_info(metrics);
    ["gateway", "aggregate", "projection", "runner"]
        .into_iter()
        .for_each(|required| {
            assert!(
                info.iter().any(|i| i.service == required),
                "missing build_info for service={required}"
            );
        });
}
|
||||
55
control/api/tests/docker_config_validation.rs
Normal file
55
control/api/tests/docker_config_validation.rs
Normal file
@@ -0,0 +1,55 @@
|
||||
use std::{fs, path::PathBuf, time::Duration};
|
||||
|
||||
fn repo_root() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.and_then(|p| p.parent())
|
||||
.expect("api crate should live under repo root")
|
||||
.to_path_buf()
|
||||
}
|
||||
|
||||
#[test]
fn docker_compose_files_parse_and_include_required_services() {
    // Parse the observability compose file as YAML and make sure every
    // core telemetry service is declared.
    let root = repo_root();
    let compose = fs::read_to_string(root.join("observability/docker-compose.yml")).unwrap();
    let v: serde_yaml::Value = serde_yaml::from_str(&compose).unwrap();

    let services = v
        .get("services")
        .and_then(|x| x.as_mapping())
        .expect("missing services");

    let required = ["grafana", "victoria-metrics", "vmagent", "loki", "tempo"];
    for name in required {
        assert!(
            services.contains_key(serde_yaml::Value::String(name.to_string())),
            "missing service {name}"
        );
    }
}
|
||||
|
||||
// Runs `docker compose config` against the observability stack to validate
// it. #[ignore]d by default; requires CONTROL_TEST_DOCKER=1 so CI without
// docker never runs it, and bounded to 10s so a hung docker CLI cannot
// stall the suite.
#[tokio::test]
#[ignore]
async fn docker_compose_config_validation_is_gated_and_fast() {
    // Fail loudly (rather than skip) when run explicitly without the gate.
    let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
    assert_eq!(enabled.as_deref(), Some("1"));

    let root = repo_root();
    let compose = root.join("observability/docker-compose.yml");

    let cmd = tokio::process::Command::new("docker")
        .args(["compose", "-f"])
        .arg(compose)
        .args(["config"])
        .output();

    // Hard 10s ceiling on the subprocess.
    let out = tokio::time::timeout(Duration::from_secs(10), cmd)
        .await
        .expect("docker compose config timed out")
        .expect("failed to run docker compose config");

    assert!(
        out.status.success(),
        "docker compose config failed: {}",
        String::from_utf8_lossy(&out.stderr)
    );
}
|
||||
6
control/api/tests/docker_gated.rs
Normal file
6
control/api/tests/docker_gated.rs
Normal file
@@ -0,0 +1,6 @@
|
||||
/// Gate marker: #[ignore]d by default, and when run explicitly it fails
/// unless CONTROL_TEST_DOCKER=1, so a misconfigured gate is visible
/// rather than silently green.
#[test]
#[ignore]
fn docker_integration_tests_are_gated() {
    let gate = std::env::var("CONTROL_TEST_DOCKER");
    assert_eq!(gate.ok().as_deref(), Some("1"));
}
|
||||
183
control/api/tests/e2e_control_plane_fleet_docker.rs
Normal file
183
control/api/tests/e2e_control_plane_fleet_docker.rs
Normal file
@@ -0,0 +1,183 @@
|
||||
use jsonwebtoken::{EncodingKey, Header, encode};
|
||||
use serde::Serialize;
|
||||
use std::{fs, net::TcpListener, time::Duration};
|
||||
|
||||
// JWT claim set sent to the control API in the e2e test; mirrors the
// TestClaims shape used by the in-process tests.
#[derive(Serialize)]
struct Claims {
    sub: String,
    session_id: String,
    // Permission strings such as "control:read" / "control:write".
    permissions: Vec<String>,
    // Expiry as unix seconds.
    exp: usize,
}
|
||||
|
||||
/// Ask the OS for an ephemeral port by binding to port 0, then drop the
/// listener so the process under test can bind it. Inherently racy (another
/// process could take the port first) but fine for local test setup.
fn free_port() -> u16 {
    let listener = TcpListener::bind("127.0.0.1:0").unwrap();
    let addr = listener.local_addr().unwrap();
    addr.port()
}
|
||||
|
||||
// Mint an HS256 JWT with the given permissions, valid for 60 seconds from
// now, signed with `secret`.
fn token(secret: &[u8], perms: &[&str]) -> String {
    // exp = now + 60s, in unix seconds as required by the claims struct.
    let exp = (std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_secs()
        + 60) as usize;

    encode(
        &Header::default(),
        &Claims {
            sub: "op_1".to_string(),
            session_id: "sess_1".to_string(),
            permissions: perms.iter().map(|p| (*p).to_string()).collect(),
            exp,
        },
        &EncodingKey::from_secret(secret),
    )
    .unwrap()
}
|
||||
|
||||
// Poll GET {url}/ready every 100ms until it returns a success status,
// panicking after 10 seconds so a hung service fails the test quickly.
async fn wait_ready(url: &str) {
    let client = reqwest::Client::new();
    let start = tokio::time::Instant::now();
    loop {
        // Any transport error counts as "not ready yet".
        let ok = client
            .get(format!("{url}/ready"))
            .send()
            .await
            .map(|r| r.status().is_success())
            .unwrap_or(false);
        if ok {
            return;
        }
        if start.elapsed() > Duration::from_secs(10) {
            panic!("control-api did not become ready");
        }
        tokio::time::sleep(Duration::from_millis(100)).await;
    }
}
|
||||
|
||||
/// End-to-end smoke test: launch four nginx containers that stub the fleet
/// services (gateway/runner/aggregate/projection), boot the real control-api
/// binary pointed at them, and verify the authenticated admin endpoints for
/// the fleet snapshot and tenant listing respond successfully.
///
/// Gated behind CONTROL_TEST_DOCKER=1 and #[ignore], so it only runs when a
/// Docker daemon is explicitly available.
#[tokio::test]
#[ignore]
async fn control_plane_can_see_the_fleet_via_docker_stubs() {
    // Hard-fail (not skip) when the gate variable is absent, so a
    // misconfigured CI invocation is visible instead of silently green.
    let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
    assert_eq!(enabled.as_deref(), Some("1"));

    // Minimal nginx vhost mimicking each node's /health, /ready, and
    // /metrics surface; all four stubs share the same config file.
    let nginx_conf = r#"
server {
    listen 80;
    server_name _;

    location = /health { return 200 "ok\n"; }
    location = /ready { return 200 "ready\n"; }
    location = /metrics { return 200 "stub_build_info{service=\"stub\",version=\"dev\",git_sha=\"000\"} 1\n"; }
}
"#;

    // Uniquely-named temp file so concurrent runs don't clobber each other.
    let mut conf_path = std::env::temp_dir();
    conf_path.push(format!(
        "cloudlysis-control-nginx-{}.conf",
        uuid::Uuid::new_v4()
    ));
    fs::write(&conf_path, nginx_conf).unwrap();

    // One ephemeral host port per stubbed service.
    let gateway_port = free_port();
    let runner_port = free_port();
    let aggregate_port = free_port();
    let projection_port = free_port();

    // Start a detached nginx container serving the stub config on `port`,
    // returning its container id for teardown. `--rm` makes Docker remove
    // the container once it is stopped.
    async fn run_stub(name: &str, port: u16, conf: &std::path::Path) -> String {
        let out = tokio::process::Command::new("docker")
            .args(["run", "-d", "--rm"])
            .args(["-p", &format!("{port}:80")])
            .args([
                "-v",
                &format!("{}:/etc/nginx/conf.d/default.conf:ro", conf.display()),
            ])
            .arg("nginx:1.29-alpine")
            .output()
            .await
            .expect("failed to run docker");
        assert!(
            out.status.success(),
            "{name} stub failed: {}",
            String::from_utf8_lossy(&out.stderr)
        );
        // `docker run -d` prints the container id on stdout.
        String::from_utf8_lossy(&out.stdout).trim().to_string()
    }

    let gateway_id = run_stub("gateway", gateway_port, &conf_path).await;
    let runner_id = run_stub("runner", runner_port, &conf_path).await;
    let aggregate_id = run_stub("aggregate", aggregate_port, &conf_path).await;
    let projection_id = run_stub("projection", projection_port, &conf_path).await;

    // HS256 secret shared between the spawned control-api process (via env)
    // and the JWTs minted locally with `token` — the two must stay in sync.
    let secret = b"e2e_secret";
    let api_port = free_port();
    let api_url = format!("http://127.0.0.1:{api_port}");

    // Empty placement document: the API only needs a parseable file here.
    let mut placement_path = std::env::temp_dir();
    placement_path.push(format!(
        "cloudlysis-control-placement-{}.json",
        uuid::Uuid::new_v4()
    ));
    fs::write(
        &placement_path,
        r#"{"revision":"e2e","aggregate_placement":{"placements":[]},"projection_placement":{"placements":[]},"runner_placement":{"placements":[]}}"#,
    )
    .unwrap();

    // Boot the real control-api binary, pointing its fleet discovery at the
    // four nginx stubs started above.
    let mut child = tokio::process::Command::new(env!("CARGO_BIN_EXE_api"))
        .env("CONTROL_API_ADDR", format!("127.0.0.1:{api_port}"))
        .env("CONTROL_GATEWAY_JWT_HS256_SECRET", "e2e_secret")
        .env("CONTROL_PLACEMENT_PATH", placement_path.to_string_lossy().to_string())
        .env(
            "CONTROL_FLEET_SERVICES",
            format!(
                "gateway=http://127.0.0.1:{gateway_port},aggregate=http://127.0.0.1:{aggregate_port},projection=http://127.0.0.1:{projection_port},runner=http://127.0.0.1:{runner_port}"
            ),
        )
        .spawn()
        .expect("failed to spawn control-api");

    wait_ready(&api_url).await;

    let client = reqwest::Client::new();
    let t = token(secret, &["control:read"]);

    // The fleet snapshot should cover the four stubs plus, per the assertion
    // message below, the control-api itself.
    let res = client
        .get(format!("{api_url}/admin/v1/fleet/snapshot"))
        .header(reqwest::header::AUTHORIZATION, format!("Bearer {t}"))
        .send()
        .await
        .unwrap();
    assert!(res.status().is_success());

    let v: serde_json::Value = res.json().await.unwrap();
    let services = v.get("services").and_then(|x| x.as_array()).unwrap();
    assert!(
        services.len() >= 5,
        "expected at least 5 services (including control-api), got {}",
        services.len()
    );

    // The tenants listing must also be reachable with the same read token.
    let res = client
        .get(format!("{api_url}/admin/v1/tenants"))
        .header(reqwest::header::AUTHORIZATION, format!("Bearer {t}"))
        .send()
        .await
        .unwrap();
    assert!(res.status().is_success());

    // Best-effort teardown: kill the API process, stop the stub containers
    // (which --rm then removes), and delete the temp files. Failures here
    // are ignored so cleanup never masks the test result.
    let _ = child.kill().await;

    for id in [gateway_id, runner_id, aggregate_id, projection_id] {
        let _ = tokio::process::Command::new("docker")
            .args(["stop", &id])
            .output()
            .await;
    }

    let _ = fs::remove_file(&conf_path);
    let _ = fs::remove_file(&placement_path);
}
|
||||
30
control/api/tests/fleet_services_env.rs
Normal file
30
control/api/tests/fleet_services_env.rs
Normal file
@@ -0,0 +1,30 @@
|
||||
/// The CONTROL_FLEET_SERVICES-style `name=url,name=url` parser should skip
/// blank entries, entries without `=`, and entries with an empty name or
/// url, keeping only well-formed pairs.
#[test]
fn fleet_services_env_parser_is_lenient() {
    // Mirrors the lenient env-var parsing: split on commas, trim, and drop
    // anything that is not a non-empty `name=url` pair.
    fn parse(spec: &str) -> Vec<api::FleetService> {
        let mut parsed = Vec::new();
        for entry in spec.split(',') {
            let entry = entry.trim();
            if entry.is_empty() {
                continue;
            }
            let (name, url) = match entry.split_once('=') {
                Some(pair) => pair,
                None => continue,
            };
            let name = name.trim();
            let url = url.trim();
            if name.is_empty() || url.is_empty() {
                continue;
            }
            parsed.push(api::FleetService {
                name: name.to_string(),
                base_url: url.to_string(),
            });
        }
        parsed
    }

    let services = parse(" gateway=http://x , ,runner=http://y,broken, =http://z ");

    assert_eq!(services.len(), 2);
    assert_eq!(services[0].name, "gateway");
    assert_eq!(services[1].name, "runner");
}
|
||||
23
control/api/tests/nats_gated.rs
Normal file
23
control/api/tests/nats_gated.rs
Normal file
@@ -0,0 +1,23 @@
|
||||
use std::time::Duration;
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore]
|
||||
async fn nats_integration_tests_are_gated_and_fast_fail() {
|
||||
let url = std::env::var("CONTROL_TEST_NATS_URL").expect("CONTROL_TEST_NATS_URL is required");
|
||||
|
||||
let without_scheme = url.strip_prefix("nats://").unwrap_or(url.as_str());
|
||||
let hostport = without_scheme.split('/').next().unwrap_or(without_scheme);
|
||||
let mut parts = hostport.split(':');
|
||||
let host = parts.next().unwrap_or("127.0.0.1");
|
||||
let port: u16 = parts
|
||||
.next()
|
||||
.unwrap_or("4222")
|
||||
.parse()
|
||||
.expect("invalid port in CONTROL_TEST_NATS_URL");
|
||||
|
||||
let connect = tokio::net::TcpStream::connect((host, port));
|
||||
tokio::time::timeout(Duration::from_secs(2), connect)
|
||||
.await
|
||||
.expect("tcp connect to NATS timed out")
|
||||
.expect("failed to connect to NATS");
|
||||
}
|
||||
75
control/api/tests/observability_configs.rs
Normal file
75
control/api/tests/observability_configs.rs
Normal file
@@ -0,0 +1,75 @@
|
||||
use std::{collections::BTreeSet, fs, path::PathBuf};
|
||||
|
||||
fn repo_root() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.and_then(|p| p.parent())
|
||||
.expect("api crate should live under repo root")
|
||||
.to_path_buf()
|
||||
}
|
||||
|
||||
/// Both Grafana provisioning files must exist in the repo and parse as YAML.
#[test]
fn grafana_provisioning_files_are_syntactically_valid() {
    let root = repo_root();

    let datasources_raw = fs::read_to_string(
        root.join("observability/grafana/provisioning/datasources/datasources.yml"),
    )
    .expect("missing grafana datasources provisioning file");
    // Only syntactic validity is checked; the parsed value is discarded.
    serde_yaml::from_str::<serde_yaml::Value>(&datasources_raw)
        .expect("invalid grafana datasources yaml");

    let dashboards_raw = fs::read_to_string(
        root.join("observability/grafana/provisioning/dashboards/dashboards.yml"),
    )
    .expect("missing grafana dashboards provisioning file");
    serde_yaml::from_str::<serde_yaml::Value>(&dashboards_raw)
        .expect("invalid grafana dashboards yaml");
}
|
||||
|
||||
/// Every *.json file in the Grafana dashboards directory must parse as
/// JSON, and at least one dashboard must be present.
#[test]
fn grafana_dashboards_are_syntactically_valid_json() {
    let dashboards_dir = repo_root().join("observability/grafana/dashboards");

    let mut json_count = 0usize;
    for entry in fs::read_dir(&dashboards_dir).expect("missing dashboards dir") {
        let path = entry.expect("failed to read dashboards dir entry").path();
        // Skip non-JSON files (README, provisioning yaml, etc.).
        if path.extension().and_then(|e| e.to_str()) != Some("json") {
            continue;
        }
        json_count += 1;
        let raw = fs::read_to_string(&path).expect("failed to read dashboard json");
        // Panic with the offending path so a broken dashboard is easy to find.
        serde_json::from_str::<serde_json::Value>(&raw)
            .unwrap_or_else(|e| panic!("{path:?}: {e}"));
    }

    assert!(json_count > 0, "expected at least one dashboard json file");
}
|
||||
|
||||
/// The vmagent scrape config must parse as YAML and declare scrape jobs for
/// victoria-metrics, vmagent itself, and control-api.
#[test]
fn vmagent_config_parses_and_includes_required_jobs() {
    let scrape_raw = fs::read_to_string(repo_root().join("observability/vmagent/scrape.yml"))
        .expect("missing vmagent scrape config");

    let doc: serde_yaml::Value =
        serde_yaml::from_str(&scrape_raw).expect("invalid vmagent scrape yaml");

    // Collect every scrape_configs[*].job_name into a set; a missing or
    // non-sequence scrape_configs yields an empty set (and failed asserts).
    let job_names: BTreeSet<String> = doc
        .get("scrape_configs")
        .and_then(|v| v.as_sequence())
        .map(|cfgs| {
            cfgs.iter()
                .filter_map(|cfg| cfg.get("job_name").and_then(|v| v.as_str()))
                .map(str::to_string)
                .collect()
        })
        .unwrap_or_default();

    for required in ["victoria-metrics", "vmagent", "control-api"] {
        assert!(
            job_names.contains(required),
            "vmagent scrape config missing required job_name={required}"
        );
    }
}
|
||||
61
control/api/tests/observability_smoke_docker.rs
Normal file
61
control/api/tests/observability_smoke_docker.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
use std::{
|
||||
net::TcpStream,
|
||||
path::PathBuf,
|
||||
process::Command,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
fn repo_root() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.and_then(|p| p.parent())
|
||||
.expect("api crate should live under repo root")
|
||||
.to_path_buf()
|
||||
}
|
||||
|
||||
/// Repeatedly attempt a TCP connection to `addr` (a "host:port" socket
/// address) until it succeeds or `timeout` elapses.
///
/// Each attempt uses a 1-second connect timeout and attempts are spaced
/// 250ms apart. Returns `true` on the first successful connect, `false`
/// if the overall deadline passes first.
///
/// Panics if `addr` is not a parseable socket address — the parse is
/// hoisted out of the loop (it is loop-invariant), which also surfaces a
/// malformed address immediately instead of on the first retry iteration.
fn wait_for_tcp(addr: &str, timeout: Duration) -> bool {
    let socket_addr = addr.parse().expect("invalid socket addr");
    let start = Instant::now();
    while start.elapsed() < timeout {
        if TcpStream::connect_timeout(&socket_addr, Duration::from_secs(1)).is_ok() {
            return true;
        }
        std::thread::sleep(Duration::from_millis(250));
    }
    false
}
|
||||
|
||||
/// Docker-gated smoke test: bring the observability compose stack up, wait
/// for four local ports to start accepting TCP connections (presumably
/// Grafana 3000, VictoriaMetrics 8428, Loki 3100, Tempo 3200 — confirm
/// against observability/docker-compose.yml), then tear the stack down.
#[test]
#[ignore]
fn observability_stack_reaches_healthy_state_fast() {
    // Hard-fail when the Docker gate is unset so CI misconfiguration is
    // visible instead of silently passing.
    let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
    assert_eq!(enabled.as_deref(), Some("1"));

    let root = repo_root();
    let compose = root.join("observability/docker-compose.yml");

    let up = Command::new("docker")
        .args(["compose", "-f"])
        .arg(&compose)
        .args(["up", "-d"])
        .status()
        .expect("failed to run docker compose up");
    assert!(up.success(), "docker compose up failed");

    // Check the ports sequentially (short-circuiting on the first failure)
    // but defer the assertion until after teardown, so the stack is always
    // cleaned up even when a port never came up.
    let ok = wait_for_tcp("127.0.0.1:3000", Duration::from_secs(30))
        && wait_for_tcp("127.0.0.1:8428", Duration::from_secs(30))
        && wait_for_tcp("127.0.0.1:3100", Duration::from_secs(30))
        && wait_for_tcp("127.0.0.1:3200", Duration::from_secs(30));

    // Best-effort teardown; its exit status is deliberately ignored.
    let _ = Command::new("docker")
        .args(["compose", "-f"])
        .arg(&compose)
        .args(["down", "-v"])
        .status();

    assert!(ok, "observability stack did not become reachable in time");
}
|
||||
43
control/api/tests/placement_hot_reload.rs
Normal file
43
control/api/tests/placement_hot_reload.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
use std::{fs, path::PathBuf, thread, time::Duration};
|
||||
|
||||
use api::PlacementStore;
|
||||
|
||||
/// Build a unique path in the system temp directory of the form
/// `cloudlysis-control-{name}-{pid}-{nanos}.json`, so parallel test runs
/// never collide on the same file.
fn tmp_file(name: &str) -> PathBuf {
    let nanos = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_nanos();
    let file_name = format!(
        "cloudlysis-control-{name}-{}-{nanos}.json",
        std::process::id()
    );
    std::env::temp_dir().join(file_name)
}
|
||||
|
||||
/// PlacementStore should pick up an on-disk rewrite of its backing file:
/// reading after the file changes from revision r1 to r2 must observe r2.
#[test]
fn placement_store_hot_reload_swaps_atomically() {
    const REV1: &str = r#"{"revision":"r1","aggregate_placement":{"placements":[]},"projection_placement":{"placements":[]},"runner_placement":{"placements":[]}}"#;
    const REV2: &str = r#"{"revision":"r2","aggregate_placement":{"placements":[]},"projection_placement":{"placements":[]},"runner_placement":{"placements":[]}}"#;

    let path = tmp_file("placement");
    fs::write(&path, REV1).unwrap();

    let store = PlacementStore::new(path.clone());
    let first = store.get_for_kind(api::ServiceKind::Aggregate);
    assert_eq!(first.revision, "r1");

    // Small pause so the rewrite gets a distinguishable timestamp before the
    // store re-checks the file.
    thread::sleep(Duration::from_millis(5));

    fs::write(&path, REV2).unwrap();

    let second = store.get_for_kind(api::ServiceKind::Aggregate);
    assert_eq!(second.revision, "r2");

    let _ = fs::remove_file(&path);
}
|
||||
31
control/api/tests/swarm_client.rs
Normal file
31
control/api/tests/swarm_client.rs
Normal file
@@ -0,0 +1,31 @@
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
/// SwarmStore should deterministically surface the services and tasks
/// described by its JSON snapshot file, and return an empty task list for
/// unknown services.
#[test]
fn swarm_store_is_deterministic_from_file() {
    let snapshot = r#"{"services":[{"name":"gateway","image":"x","mode":"replicated","replicas":"1/1","updated_at":null}],"tasks":[{"id":"t1","service":"gateway","node":"n1","desired_state":"running","current_state":"running","error":null}]}"#;

    // Unique temp path so parallel runs cannot race on the same file.
    let path = std::env::temp_dir().join(format!(
        "cloudlysis-control-swarm-{}-{}.json",
        std::process::id(),
        uuid::Uuid::new_v4()
    ));
    fs::write(&path, snapshot).unwrap();

    let store = api::SwarmStore::new(PathBuf::from(&path));

    let services = store.list_services();
    assert_eq!(services.len(), 1);
    assert_eq!(services[0].name, "gateway");

    let gateway_tasks = store.list_tasks("gateway");
    assert_eq!(gateway_tasks.len(), 1);
    assert_eq!(gateway_tasks[0].id, "t1");

    // A service absent from the snapshot yields no tasks rather than an error.
    let missing_tasks = store.list_tasks("missing");
    assert_eq!(missing_tasks.len(), 0);

    let _ = fs::remove_file(&path);
}
|
||||
42
control/api/tests/swarm_smoke_docker.rs
Normal file
42
control/api/tests/swarm_smoke_docker.rs
Normal file
@@ -0,0 +1,42 @@
|
||||
use std::time::Duration;
|
||||
|
||||
/// Docker-gated Swarm smoke test: deploy the control-plane stack file,
/// optionally list services, and remove the stack — every docker call is
/// wrapped in a timeout so a hung daemon cannot stall the suite.
#[tokio::test]
#[ignore]
async fn docker_swarm_smoke_test_is_gated_and_times_out() {
    // Hard-fail when the Docker gate is unset so CI misconfiguration is
    // noticed rather than silently skipped.
    let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
    assert_eq!(enabled.as_deref(), Some("1"));

    let stack = "cloudlysis_control_test";
    // Resolve ../../swarm/stacks/control-plane.yml relative to this crate.
    let compose = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .and_then(|p| p.parent())
        .unwrap()
        .join("swarm/stacks/control-plane.yml");

    let deploy = tokio::process::Command::new("docker")
        .args(["stack", "deploy", "-c"])
        .arg(&compose)
        .arg(stack)
        .output();

    // The deploy must both finish within 30s and exit successfully.
    let out = tokio::time::timeout(Duration::from_secs(30), deploy)
        .await
        .expect("docker stack deploy timed out")
        .expect("failed to run docker stack deploy");
    assert!(
        out.status.success(),
        "docker stack deploy failed: {}",
        String::from_utf8_lossy(&out.stderr)
    );

    // Informational only — result and timeout are both ignored.
    let ls = tokio::process::Command::new("docker")
        .args(["service", "ls"])
        .output();
    let _ = tokio::time::timeout(Duration::from_secs(10), ls).await;

    // Best-effort teardown of the deployed stack, also time-boxed.
    let rm = tokio::process::Command::new("docker")
        .args(["stack", "rm"])
        .arg(stack)
        .output();
    let _ = tokio::time::timeout(Duration::from_secs(10), rm).await;
}
|
||||
40
control/api/tests/swarm_stack_yaml.rs
Normal file
40
control/api/tests/swarm_stack_yaml.rs
Normal file
@@ -0,0 +1,40 @@
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
fn repo_root() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.and_then(|p| p.parent())
|
||||
.expect("api crate should live under repo root")
|
||||
.to_path_buf()
|
||||
}
|
||||
|
||||
/// Both Swarm stack files must exist and parse as YAML.
#[test]
fn stack_files_parse_as_yaml() {
    let root = repo_root();
    let stack_files = [
        "swarm/stacks/control-plane.yml",
        "swarm/stacks/observability.yml",
    ];
    for relative in stack_files {
        let raw = fs::read_to_string(root.join(relative)).unwrap();
        // Syntactic check only; the parsed document is discarded.
        serde_yaml::from_str::<serde_yaml::Value>(&raw).unwrap();
    }
}
|
||||
|
||||
/// The control-plane stack file must declare both the control-api and
/// control-ui services under its `services` mapping.
#[test]
fn control_plane_stack_has_required_services() {
    let raw = fs::read_to_string(repo_root().join("swarm/stacks/control-plane.yml")).unwrap();
    let doc: serde_yaml::Value = serde_yaml::from_str(&raw).unwrap();

    let services = doc
        .get("services")
        .and_then(|node| node.as_mapping())
        .expect("missing services");

    for required in ["control-api", "control-ui"] {
        let key = serde_yaml::Value::String(required.to_string());
        assert!(services.contains_key(key), "missing service {required}");
    }
}
|
||||
601
control/prd.md
Normal file
601
control/prd.md
Normal file
@@ -0,0 +1,601 @@
|
||||
### 🧱 Component: Control Plane (Admin UI + Monitoring + Production Ops)
|
||||
|
||||
**Definition:**
|
||||
This repository hosts the **platform control plane**:
|
||||
1) the **Admin UI** used by platform operators and admins to manage users/roles/sessions, tenants, configuration, definitions, and production scaling; and
|
||||
2) the **observability stack** and **production dashboards** (VictoriaMetrics + Loki + Grafana, plus alerting/scrape config) required to operate the platform in production.
|
||||
|
||||
The control plane is the “single pane of glass” and the “safe hands” layer: it does not replace node runtime logic; it coordinates existing node capabilities and exposes them with strict RBAC, auditability, and operational guardrails.
|
||||
|
||||
---
|
||||
|
||||
## **Context: Existing Node Repositories (../)**
|
||||
|
||||
This PRD is derived from the currently implemented node repos in `../`:
|
||||
- **Aggregate**: expects a control node to manage tenant placement and scaling operations, including tenant migrations ([aggregate/prd.md](file:///Users/vlad/Developer/cloudlysis/aggregate/prd.md#L82-L151)). Tenant placement primitives and KV helper exist ([swarm.rs](file:///Users/vlad/Developer/cloudlysis/aggregate/src/swarm.rs#L5-L227)).
|
||||
- **Gateway**: provides the platform ingress, authn/authz, and tenant-aware routing; it explicitly expects NATS KV-based tenant placement and hot reload in production ([gateway/prd.md](file:///Users/vlad/Developer/cloudlysis/gateway/prd.md#L13-L175)).
|
||||
- **Projection**: consumes events, stores read models, and expects tenant-scoped query isolation and operational monitoring (consumer lag, checkpoints) ([projection/prd.md](file:///Users/vlad/Developer/cloudlysis/projection/prd.md#L7-L96)).
|
||||
- **Runner**: executes sagas + effects, includes tenant assignment watching via NATS KV and tenant draining semantics ([tenant_placement.rs](file:///Users/vlad/Developer/cloudlysis/runner/src/tenant_placement.rs#L11-L104)) and exposes admin endpoints for drain/reload in its PRD ([runner/prd.md](file:///Users/vlad/Developer/cloudlysis/runner/prd.md#L199-L210)).
|
||||
|
||||
The control plane also adopts the proven **Admin UI UX + component library** from UltraBase’s control-plane admin UI, adapting screens and information architecture to Cloudlysis needs:
|
||||
- Reusable UI components live under [ui/control-plane-admin/src/components/ui](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/components/ui).
|
||||
- Example pages include [TenantsPage](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/pages/TenantsPage.tsx), [AdminUsersPage](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/pages/AdminUsersPage.tsx), [AdminSessionsPage](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/pages/AdminSessionsPage.tsx), [FleetPage](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/pages/FleetPage.tsx), [TopologyPage](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/pages/TopologyPage.tsx), and [ObservabilityPage](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/pages/ObservabilityPage.tsx).
|
||||
|
||||
---
|
||||
|
||||
## **Problem Statement**
|
||||
|
||||
Operating the platform without a unified control plane forces operators to:
|
||||
- Use ad-hoc scripts, direct cluster access, or service-local admin endpoints
|
||||
- Manage tenants, placements, and deployments without a consistent audit trail
|
||||
- Correlate production incidents across services with incomplete dashboards and unsafe levels of access
|
||||
|
||||
The platform needs a control plane that:
|
||||
- Centralizes **admin workflows** and **production operability**
|
||||
- Enforces **least-privilege RBAC**, **step-up**, and **auditing**
|
||||
- Provides a consistent, safe abstraction over **tenant placement**, **scale**, and **production operations**
|
||||
|
||||
---
|
||||
|
||||
## **Goals**
|
||||
|
||||
- Deliver an Admin UI with full admin management over:
|
||||
- users, sessions, roles/permissions
|
||||
- configuration (global + per-tenant)
|
||||
- definitions (aggregates, projections, sagas, effects, manifests)
|
||||
- scaling and production management (tenant placement, drains, migrations, deployments)
|
||||
- Package production-grade monitoring:
|
||||
- metrics via VictoriaMetrics
|
||||
- logs via Loki
|
||||
- dashboards and alerting via Grafana (+ vmalert where used)
|
||||
- Make production operations observable, auditable, and safe by default:
|
||||
- strong change logging + approvals where needed
|
||||
- idempotent operations + dry runs + rollback paths
|
||||
|
||||
---
|
||||
|
||||
## **Non-Goals**
|
||||
|
||||
- Re-implement node business logic (Aggregate / Projection / Runner) or platform ingress (Gateway).
|
||||
- Replace NATS JetStream, libmdbx storage responsibilities, or per-service runtime concerns.
|
||||
- Provide an arbitrary “general API gateway” for third-party upstreams.
|
||||
|
||||
---
|
||||
|
||||
## **Primary Users**
|
||||
|
||||
- **Platform Owner / SRE**: fleet operations, incident response, production change management.
|
||||
- **Platform Admin**: tenant provisioning, RBAC, config/definition promotion.
|
||||
- **Security Admin**: access reviews, session revocation, audit trails.
|
||||
- **Support / On-call**: triage dashboards, logs/metrics correlation, safe mitigations (drain, disable, rollback).
|
||||
|
||||
---
|
||||
|
||||
## **Key Concepts**
|
||||
|
||||
### Control Plane Scope
|
||||
|
||||
- The control plane is the authoritative interface for production operations and admin management.
|
||||
- The control plane uses node APIs, the Gateway, and NATS KV as its operational substrate rather than bypassing them.
|
||||
|
||||
### Tenant-Aware Operations
|
||||
|
||||
- All tenant-scoped operations are keyed by `tenant_id` (consistent with `x-tenant-id` usage across nodes and Gateway).
|
||||
- Tenant placement is treated as a first-class “control plane state” (NATS KV-backed in production; file/static in development), consistent with existing code patterns ([swarm.rs](file:///Users/vlad/Developer/cloudlysis/aggregate/src/swarm.rs#L188-L226), [tenant_placement.rs](file:///Users/vlad/Developer/cloudlysis/runner/src/tenant_placement.rs#L41-L104)).
|
||||
|
||||
### Safe Change Management
|
||||
|
||||
- Mutating actions require explicit intent, are recorded in audit logs, and should be reversible where possible.
|
||||
- All high-impact operations support:
|
||||
- validation and preflight checks
|
||||
- dry-run planning
|
||||
- idempotency keys
|
||||
- explicit rollback guidance
|
||||
|
||||
### Control Plane Components (In This Repo)
|
||||
|
||||
- **Admin UI (React)**:
|
||||
- Reuse UltraBase’s control-plane admin UI component system and interaction patterns, adapting routes and pages to Cloudlysis requirements ([components/ui](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/components/ui)).
|
||||
- The UI should prefer “table + detail pages + action dropdown + modals” patterns to keep ops workflows fast and consistent.
|
||||
- **Control Plane API (BFF / Admin API)**:
|
||||
- A thin API layer that enforces RBAC, writes audit logs, and orchestrates multi-step operations (drain/migrate/rollout) as idempotent jobs.
|
||||
- Integrates with the Gateway for platform authn/authz and with node admin endpoints for operational actions.
|
||||
- **Observability Stack**:
|
||||
- Version-controlled provisioning for Grafana dashboards/datasources, scrape configs for vmagent, and alert rules (vmalert or Grafana Alerting), modeled after UltraBase’s baseline ([observability/README.md](file:///Users/vlad/Developer/madapes/ultrabase/observability/README.md#L1-L47)).
|
||||
|
||||
---
|
||||
|
||||
## **Functional Requirements**
|
||||
|
||||
### 1) Admin IAM (Users, Sessions, Roles)
|
||||
|
||||
#### 1.1 Users
|
||||
|
||||
- CRUD users with lifecycle states:
|
||||
- invited (pending acceptance), active, suspended, disabled, deleted (tombstoned)
|
||||
- Identity attributes:
|
||||
- email (primary), optional secondary identities
|
||||
- display name, avatar, metadata tags
|
||||
- auth methods enabled (password, OIDC providers), MFA state
|
||||
- Administrative actions:
|
||||
- invite/resend invite
|
||||
- reset password flow initiation
|
||||
- force MFA reset / revoke recovery codes
|
||||
- disable login / suspend user
|
||||
- impersonation (break-glass, audited, time-boxed)
|
||||
- Security constraints:
|
||||
- privileged actions require step-up / recent auth
|
||||
- sensitive events must be audit logged (who, what, when, why, from where)
|
||||
|
||||
#### 1.2 Sessions
|
||||
|
||||
- View active sessions and refresh token families:
|
||||
- by user, by tenant, by IP / geo, by device, by time range
|
||||
- Revoke capabilities:
|
||||
- revoke a single session
|
||||
- revoke all sessions for a user
|
||||
- revoke all sessions for a tenant (incident response)
|
||||
- Detection surfaces:
|
||||
- unusual session fanout (many sessions per user)
|
||||
- repeated failed logins / MFA failures
|
||||
- suspicious IP changes
|
||||
|
||||
#### 1.3 Roles & Permissions (RBAC)
|
||||
|
||||
- Roles are sets of permissions; assignments bind principals to roles in a scope.
|
||||
- Scopes:
|
||||
- global (platform-level)
|
||||
- tenant-scoped
|
||||
- environment-scoped (dev/staging/prod) when applicable
|
||||
- Required permission domains (minimum):
|
||||
- iam.users.* (create/update/suspend/delete)
|
||||
- iam.sessions.* (list/revoke)
|
||||
- iam.roles.* (create/update/assign)
|
||||
- tenants.* (create/update/archive)
|
||||
- configs.* (read/write/approve/apply)
|
||||
- definitions.* (read/write/validate/promote/rollback)
|
||||
- scale.* (view/apply/migrate/drain)
|
||||
- ops.* (deploy/rollback/restart/drain)
|
||||
- observability.* (view dashboards, manage alert rules)
|
||||
- audit.* (view/export)
|
||||
- Role templates:
|
||||
- owner, admin, operator, support, read-only, security-admin, break-glass
|
||||
|
||||
---
|
||||
|
||||
### 2) Tenant Management
|
||||
|
||||
- Create, list, and archive tenants.
|
||||
- Tenant status model:
|
||||
- provisioning, active, draining, migrating, degraded, suspended, archived
|
||||
- Tenant metadata:
|
||||
- plan/tier, quotas, feature flags, contact + billing metadata, environment(s)
|
||||
- Tenant operational actions:
|
||||
- trigger provisioning workflows (create streams/buckets, seed configs, create placement)
|
||||
- rotate tenant secrets (as definitions/config allow)
|
||||
- pause/resume workload (soft kill switch via config flags)
|
||||
|
||||
Tenant pages should mirror UltraBase’s “Tenant Overview + subpages” navigation patterns (example: [TenantsPage](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/pages/TenantsPage.tsx) and [TenantOverviewPage](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/pages/TenantOverviewPage.tsx)).
|
||||
|
||||
---
|
||||
|
||||
### 3) Configuration Management (Global + Per-Tenant)
|
||||
|
||||
#### 3.1 Config Model
|
||||
|
||||
- Config items are versioned, typed documents with:
|
||||
- scope (global / tenant / environment)
|
||||
- schema version
|
||||
- provenance (who/what wrote it)
|
||||
- effective date and rollout strategy
|
||||
- Config must support:
|
||||
- validation against a schema
|
||||
- diff view (previous vs next)
|
||||
- staged rollout (preview → apply)
|
||||
- rollback to a prior version
|
||||
|
||||
#### 3.2 Node-Related Configuration
|
||||
|
||||
Required config surfaces (minimum):
|
||||
- **Gateway**: routing/placement sources, auth policies, rate limits (see routing expectations in [gateway/prd.md](file:///Users/vlad/Developer/cloudlysis/gateway/prd.md#L154-L175)).
|
||||
- **Aggregate / Projection / Runner**:
|
||||
- shard identifiers and tenant allowlists/placement settings
|
||||
- drain/reload toggles and safety thresholds
|
||||
- resource limits / concurrency caps
|
||||
|
||||
---
|
||||
|
||||
### 4) Definition Management (System “Blueprints”)
|
||||
|
||||
Definitions are the declarative “what the platform is” and “what runs” layer: aggregates, projections, sagas, effect providers, and any manifests that tie runtime-function programs to entity types.
|
||||
|
||||
Required capabilities:
|
||||
- Upload/edit versioned definitions with:
|
||||
- validation (schema + semantic checks)
|
||||
- “impact analysis” (which tenants/services are affected)
|
||||
- promotion workflow (dev → staging → prod)
|
||||
- Change controls:
|
||||
- approvals (role-based) for production promotion
|
||||
- emergency rollback path (one-click revert to last-known-good definition bundle)
|
||||
- Tenant overrides:
|
||||
- allow per-tenant definition overrides only when explicitly permitted by policy
|
||||
|
||||
The control plane must present definitions in a way that maps to the node runtime responsibilities:
|
||||
- Aggregates and deterministic decide/apply programs ([aggregate/prd.md](file:///Users/vlad/Developer/cloudlysis/aggregate/prd.md#L155-L160))
|
||||
- Projections and deterministic project programs ([projection/prd.md](file:///Users/vlad/Developer/cloudlysis/projection/prd.md#L36-L55))
|
||||
- Runner sagas and effect provider manifests ([runner/prd.md](file:///Users/vlad/Developer/cloudlysis/runner/prd.md#L41-L57))
|
||||
|
||||
---
|
||||
|
||||
### 5) Scale Management (Tenant Placement, Shards, Fleet)
|
||||
|
||||
#### 5.1 Placement Model
|
||||
|
||||
- Placement is modeled as:
|
||||
- a set of nodes/shards and their attributes (labels, capacity, region)
|
||||
- tenant → shard assignments per service kind (Aggregate, Projection, Runner, optionally Gateway when relevant)
|
||||
- Control plane supports both:
|
||||
- static placement (development)
|
||||
- dynamic placement (production) backed by NATS KV (consistent with existing client patterns in [swarm.rs](file:///Users/vlad/Developer/cloudlysis/aggregate/src/swarm.rs#L79-L227))
|
||||
|
||||
#### 5.2 Tenant Migration
|
||||
|
||||
- Provide guided migration planning and execution:
|
||||
- show current assignment, target assignment, and a sequenced action plan
|
||||
- execute “graceful drain → update placement → reload” style plans (see [plan_graceful_tenant_migration](file:///Users/vlad/Developer/cloudlysis/aggregate/src/swarm.rs#L41-L65))
|
||||
- Migration safety:
|
||||
- require explicit confirmation and reason
|
||||
- block if draining is unsafe (inflight work too high, storage unhealthy, consumer lag too high)
|
||||
- time-box and alert if drains do not converge
|
||||
|
||||
#### 5.3 Fleet View
|
||||
|
||||
- Fleet inventory:
|
||||
- nodes (labels, region, capacity, version)
|
||||
- services (replicas, image version, health)
|
||||
- per-node and per-service load indicators (CPU/mem, request rate, consumer lag)
|
||||
- Operator actions:
|
||||
- scale replicas, restart services, cordon/drain nodes (when supported by orchestrator)
|
||||
|
||||
UX should align with the UltraBase “Fleet” and “Topology” navigation patterns ([FleetPage](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/pages/FleetPage.tsx), [TopologyPage](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/pages/TopologyPage.tsx)).
|
||||
|
||||
---
|
||||
|
||||
### 6) Production Operations (Deployments, Maintenance, Safety)
|
||||
|
||||
#### 6.1 Deployments
|
||||
|
||||
- Manage deployable artifacts per service (Aggregate/Gateway/Projection/Runner) with:
|
||||
- environment-specific rollout policies
|
||||
- canary/rolling deploy support (when orchestrator supports it)
|
||||
- automatic health check gates and rollback triggers
|
||||
- Track releases:
|
||||
- “what is running where” (service version matrix)
|
||||
- change log links and approvals
|
||||
|
||||
#### 6.2 Maintenance Operations
|
||||
|
||||
- Drain operations:
|
||||
- tenant drain (stop acquiring new work, finish inflight; required by Runner semantics in [TenantGate](file:///Users/vlad/Developer/cloudlysis/runner/src/tenant_placement.rs#L106-L200))
|
||||
- node drain (aggregate tenant ranges, projection consumers, runner workers)
|
||||
- Replay / rebuild operations:
|
||||
- projection rebuild triggers (dangerous, must be guarded and audited)
|
||||
- workflow replay controls (reset checkpoints only with explicit intent)
|
||||
|
||||
#### 6.3 Incident Response Toolkit
|
||||
|
||||
- “Safe switches”:
|
||||
- per-tenant kill switch (disable commands/effects via config)
|
||||
- global degrade modes (rate limit reductions, disable expensive features)
|
||||
- Run actions:
|
||||
- revoke sessions at scale
|
||||
- freeze deployments
|
||||
- trigger drain/migrate with guided plan
|
||||
|
||||
---
|
||||
|
||||
### 7) Observability (VictoriaMetrics + Loki + Grafana) and Dashboards
|
||||
|
||||
#### 7.1 Stack Requirements
|
||||
|
||||
Adopt a production-ready stack consistent with UltraBase’s operational baseline:
|
||||
- **VictoriaMetrics** for metrics storage and Prometheus-compatible query
|
||||
- **vmagent** for scraping and remote_write
|
||||
- **Grafana** for dashboards and alert routing
|
||||
- **Loki** (+ optional **Promtail**) for logs
|
||||
- Optional **vmalert** for rule evaluation against VictoriaMetrics
|
||||
|
||||
UltraBase’s observability design is a direct reference implementation to mirror and adapt:
|
||||
- Stack overview and conventions: [observability/README.md](file:///Users/vlad/Developer/madapes/ultrabase/observability/README.md#L1-L47)
|
||||
- Provisioned dashboards and datasources: [grafana provisioning](file:///Users/vlad/Developer/madapes/ultrabase/observability/grafana/provisioning)
|
||||
|
||||
#### 7.2 Metrics Conventions
|
||||
|
||||
- Every service exports `/metrics` in Prometheus format.
|
||||
- Required labels:
|
||||
- `service` (stable, low cardinality)
|
||||
- `env` (dev/staging/prod)
|
||||
- `tenant_id` only where safe and bounded; avoid `tenant_id` on high-frequency per-request series unless cardinality is controlled.
|
||||
- HTTP metrics must avoid unbounded `path` cardinality; prefer route templates (pattern-based paths).
|
||||
|
||||
Tenant-aware metrics guidelines:
|
||||
- Prefer **tenant-only aggregates** for “who is hurting us?” views:
|
||||
- `..._requests_total{tenant_id,service,status_class}` (no `path`)
|
||||
- `..._request_duration_seconds{tenant_id,service}` (no `path`, limited bucket count)
|
||||
- Prefer **route-only aggregates** for “what endpoint is hurting us?” views:
|
||||
- `..._requests_total{service,path,status}` (no `tenant_id`)
|
||||
- Where per-tenant and per-route both matter, implement a **top-k sampling** policy:
|
||||
- emit `(tenant_id,path)` series only for top N tenants, or only for a fixed allowlist of routes.
|
||||
|
||||
#### 7.3 Required Dashboards (Production)
|
||||
|
||||
Minimum set of dashboards (provisioned on startup):
|
||||
- **Platform — Operations overview**
|
||||
- `up` for core services and observability stack
|
||||
- RPS, 4xx/5xx ratio, p95/p99 latency per service
|
||||
- saturation indicators (CPU/mem, inflight, queue depth)
|
||||
- **Platform — HTTP detail**
|
||||
- per-service request breakdown by route template, method, status
|
||||
- top failing paths and latency outliers
|
||||
- **Platform — Logs**
|
||||
- Loki stream filtering by `service`, `tenant_id` (where present), and correlation identifiers
|
||||
- **Platform — Event bus / JetStream**
|
||||
- consumer lag, redeliveries, ack latency, stream storage pressure
|
||||
- **Platform — Workers (Runner)**
|
||||
- outbox depth, effect latency, poison message counts, schedules backlog
|
||||
- **Platform — Storage (libmdbx)**
|
||||
- DB size growth, write stalls, fsync latency (where exported), disk usage
|
||||
- **Platform — Cluster / Orchestrator**
|
||||
- node health, container restarts, placement distribution by tenant range
|
||||
|
||||
Dashboards should be modeled after UltraBase’s default set (for structure, not content), e.g. [ultrabase-operations.json](file:///Users/vlad/Developer/madapes/ultrabase/observability/grafana/provisioning/dashboards/default/ultrabase-operations.json) and [ultrabase-http-detail.json](file:///Users/vlad/Developer/madapes/ultrabase/observability/grafana/provisioning/dashboards/default/ultrabase-http-detail.json).
|
||||
|
||||
Additional production-operability dashboards (chosen and adapted):
|
||||
- **Platform — Noisy Neighbor & Tenant Health**
|
||||
- Purpose: identify a tenant causing cluster instability (attack, runaway job, bad config) and quickly pivot all panels to that tenant.
|
||||
- Panels (minimum):
|
||||
- Top tenants by Gateway RPS (topk of tenant-only request counters).
|
||||
- Tenant latency distribution (p95/p99 per tenant) from tenant-only latency histograms.
|
||||
- Tenant error ratio (5xx and 429) per tenant.
|
||||
- Aggregate in-flight commands by tenant (already exported: `aggregate_in_flight_commands{tenant_id}`).
|
||||
- Projection processing error rate by tenant (from `projection_processing_errors_total{tenant_id,view_type}` aggregated per tenant).
|
||||
- Loki logs panel with a `tenant_id` variable selector; selecting a tenant syncs RPS/latency/errors + logs.
|
||||
- Required instrumentation:
|
||||
- Gateway must expose **tenant-level** HTTP counters/histograms (tenant + status class + service, without `path`) in addition to existing route-level metrics.
|
||||
|
||||
- **Platform — API Regression & Deployment**
|
||||
- Purpose: determine whether a newly rolled out image caused regressions, and correlate changes with deployment events.
|
||||
- Panels (minimum):
|
||||
- Error rate comparison “old vs new” by `service` and `version` (or `image_tag`) labels.
|
||||
- Latency comparison “old vs new” (p95/p99) per service.
|
||||
- Restart / flapping rate per service (container restarts, crash loops).
|
||||
- Dependency latency correlation:
|
||||
- Gateway request duration vs Aggregate command duration vs Projection processing duration vs Runner effect latency.
|
||||
- Loki “new errors” panel:
|
||||
- errors seen in the last 10m that were not present in the prior 60m window, grouped by `service`.
|
||||
- Deployment annotations:
|
||||
- vertical markers when Swarm service updates started/finished (via annotations or a deploy event metric).
|
||||
- Required instrumentation:
|
||||
- Every service exports a `*_build_info{service,version,git_sha}` gauge (value=1) or equivalent, and scrape relabeling adds `image_tag` where possible.
|
||||
- Control plane emits deployment annotations/events (or pulls them from the orchestrator and writes to Grafana annotations).
|
||||
|
||||
- **Platform — Storage & Event Bus Bottlenecks**
|
||||
- Purpose: debug timeouts when the API is “up” but underlying storage/eventing is saturated (the Cloudlysis equivalent of DB firefighting).
|
||||
- Panels (minimum):
|
||||
- NATS/JetStream health:
|
||||
- stream storage pressure, publish/ack latency, consumer lag, redeliveries.
|
||||
- Projection lag and throughput:
|
||||
- events processed rate, processing duration, error rate.
|
||||
- Aggregate write-path pressure:
|
||||
- command duration, version conflicts, in-flight commands, tenant errors.
|
||||
- Runner pressure:
|
||||
- outbox dispatch failure rate, effect timeout rate, deadletter writes.
|
||||
- Disk saturation on nodes hosting libmdbx:
|
||||
- disk usage, read/write latency, IOPS; correlate with spikes in command/query latency.
|
||||
- Optional Postgres/Autobase panels only when a managed DB backs any control-plane metadata:
|
||||
- pool saturation, replica lag, slow queries, long transactions.
|
||||
- Required instrumentation:
|
||||
- Ensure JetStream metrics are scraped (NATS server `/varz` exporter or native Prometheus endpoint depending on deployment).
|
||||
- Ensure node-level disk/IO metrics are scraped (node exporter / cadvisor / equivalent).
|
||||
|
||||
- **Platform — Infrastructure Exhaustion**
|
||||
- Purpose: detect node/resource pressure earlier than raw CPU% and catch observability blind spots.
|
||||
- Panels (minimum):
|
||||
- CPU/memory pressure (PSI) per node (when available), plus load average and CPU saturation.
|
||||
- OOM kill tracker across the cluster.
|
||||
- Disk usage + IO wait/latency on data volumes (libmdbx, Loki, VictoriaMetrics).
|
||||
- vmagent health:
|
||||
- scrape error rate, remote_write errors, queue backlog.
|
||||
- Loki ingestion health:
|
||||
- dropped log lines (promtail) and ingestion errors (loki).
|
||||
- Swarm task hygiene:
|
||||
- desired_state vs current_state mismatches, orphaned tasks, restart loops.
|
||||
- Required instrumentation:
|
||||
- node exporter / cadvisor (or equivalent) must be part of the production scrape plan.
|
||||
- promtail (or alternative) must expose drop/error metrics when logs are enabled.
|
||||
|
||||
#### 7.4 Alerting Requirements
|
||||
|
||||
Minimum alert classes:
|
||||
- Availability:
|
||||
- service down (`up == 0`)
|
||||
- scrape failures, vmagent remote_write errors
|
||||
- Reliability:
|
||||
- sustained elevated 5xx ratio
|
||||
- sustained elevated p95 latency per service
|
||||
- Backlogs:
|
||||
- JetStream consumer lag above threshold
|
||||
- Runner outbox depth above threshold
|
||||
- Data safety:
|
||||
- disk usage near full (nodes hosting libmdbx)
|
||||
- abnormal restart loops
|
||||
- Security:
|
||||
- login anomaly detection signals (where instrumented)
|
||||
- suspicious spike in session revocations / failed MFA
|
||||
|
||||
Alert rules can follow UltraBase’s approach of version-controlled rules in YAML (reference: [alerts/](file:///Users/vlad/Developer/madapes/ultrabase/observability/alerts)).
|
||||
|
||||
#### 7.5 Control Plane → Observability Linking
|
||||
|
||||
The Admin UI must embed or deep-link into observability tools:
|
||||
- per-tenant and per-service quick links to Grafana dashboards and Loki queries
|
||||
- incident triage shortcuts (operations overview → HTTP detail → logs)
|
||||
|
||||
This mirrors UltraBase’s “observability links JSON” concept ([observability/README.md](file:///Users/vlad/Developer/madapes/ultrabase/observability/README.md#L65-L75)), but adapted to Cloudlysis services and dashboards.
|
||||
|
||||
---
|
||||
|
||||
### 8) Audit, Compliance, and Change History
|
||||
|
||||
- Audit log is an append-only stream of security and operations events:
|
||||
- authentication and session events
|
||||
- RBAC changes and permission grants
|
||||
- config/definition changes and promotions
|
||||
- scaling, drain, and migration operations
|
||||
- deployments and rollbacks
|
||||
- Audit log must support:
|
||||
- search and export (bounded and access controlled)
|
||||
- correlation to production incidents (request ids, trace ids)
|
||||
- retention policy controls
|
||||
|
||||
---
|
||||
|
||||
### 9) Control Plane API Surface (Admin API)
|
||||
|
||||
The control plane requires a stable API surface for the Admin UI and automation.
|
||||
|
||||
Minimum API capabilities:
|
||||
- **Idempotent jobs for multi-step operations**:
|
||||
- every mutating operation returns a `job_id`, supports polling and cancellation, and records a full execution trace in the audit log.
|
||||
- **Preflight endpoints**:
|
||||
- validate an intended change and return a plan (and “would-change” diff) without applying it.
|
||||
- **RBAC-first access model**:
|
||||
- all endpoints enforce permission checks at the API boundary (UI is not trusted).
|
||||
|
||||
Minimum endpoint groups:
|
||||
- `/admin/v1/iam/*` (users, roles, assignments, sessions)
|
||||
- `/admin/v1/tenants/*` (tenants lifecycle, status, metadata)
|
||||
- `/admin/v1/config/*` (versioned config, diff, apply, rollback)
|
||||
- `/admin/v1/definitions/*` (bundles, validate, promote, rollback)
|
||||
- `/admin/v1/scale/*` (placement, migrations, drain status)
|
||||
- `/admin/v1/ops/*` (deployments, rollbacks, service actions)
|
||||
- `/admin/v1/observability/*` (links, saved queries, dashboard registry)
|
||||
- `/admin/v1/audit/*` (search, export)
|
||||
|
||||
Authentication/authorization integration:
|
||||
- Prefer using the **Gateway** as the system of record for admin identities and sessions, with the control plane API validating requests using Gateway-issued tokens and enforcing platform-specific permissions.
|
||||
|
||||
---
|
||||
|
||||
### 10) Secrets and Credentials Management
|
||||
|
||||
The control plane must treat secrets as first-class operational data with strict handling.
|
||||
|
||||
Requirements:
|
||||
- Secret values must never be logged and must be redacted in UI/API responses.
|
||||
- Secrets must support:
|
||||
- creation and rotation workflows
|
||||
- scoped access (global/tenant/environment)
|
||||
- staged rollout (write new → verify → promote → retire old)
|
||||
- Rendering rules:
|
||||
- after creation, secret plaintext must not be retrievable unless explicitly enabled by policy (default: write-only).
|
||||
- Integrations:
|
||||
- support referencing secrets from config/definitions without embedding values (secret refs).
|
||||
|
||||
---
|
||||
|
||||
### 11) Backups, Restore, and Disaster Recovery (Production Operability)
|
||||
|
||||
The control plane must provide explicit visibility and guardrails for data safety operations.
|
||||
|
||||
Minimum requirements:
|
||||
- **Backup status**:
|
||||
- show last successful backup timestamps per critical store (metadata DB, NATS state if applicable, Grafana provisioning state as code, tenant placement/config stores).
|
||||
- **Restore readiness**:
|
||||
- preflight checks that validate a restore plan (target environment, versions, dependencies).
|
||||
- **Operational playbooks**:
|
||||
- link to the exact restore procedure and post-restore verification checklist.
|
||||
- **Key rotation**:
|
||||
- explicit workflows and audit logs for rotating signing keys, service credentials, and secret backends.
|
||||
|
||||
This should align with the platform’s existing operational patterns (e.g., the explicit “restore / post-restore checks” concept used in UltraBase observability docs).
|
||||
|
||||
---
|
||||
|
||||
## **Admin UI Requirements (Information Architecture + UX)**
|
||||
|
||||
### Navigation (Minimum)
|
||||
|
||||
Left navigation sections:
|
||||
- Overview
|
||||
- Tenants
|
||||
- Users
|
||||
- Sessions
|
||||
- Roles & Permissions
|
||||
- Config
|
||||
- Definitions
|
||||
- Scale & Placement
|
||||
- Deployments
|
||||
- Observability
|
||||
- Audit Log
|
||||
- Settings
|
||||
|
||||
### Page Patterns (Reuse UltraBase UI)
|
||||
|
||||
Adopt the UltraBase component system and page layout patterns:
|
||||
- Layout, styling tokens, UI primitives: [components/ui](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/components/ui)
|
||||
- Table + search + action dropdown pattern: [TenantsPage](file:///Users/vlad/Developer/madapes/ultrabase/ui/control-plane-admin/src/pages/TenantsPage.tsx#L94-L203)
|
||||
|
||||
Required page types:
|
||||
- List pages:
|
||||
- searchable table, bulk actions, row actions menu, status pills, empty states
|
||||
- Detail pages:
|
||||
- header with primary actions (drain, migrate, rollback)
|
||||
- sub-nav tabs for domain-specific views
|
||||
- Mutation flows:
|
||||
- modal confirmation + explicit reason entry for high-impact changes
|
||||
- toast notifications and “busy” state handling consistent with UltraBase patterns
|
||||
|
||||
### Tenant Detail Subpages (Minimum)
|
||||
|
||||
- Overview (status, assignments, SLO highlights)
|
||||
- Placement (per service: Aggregate/Projection/Runner)
|
||||
- Health (node readiness and dependency checks)
|
||||
- Config (effective config + diffs)
|
||||
- Definitions (applied definition bundle + version)
|
||||
- Activity (audit trail filtered to tenant)
|
||||
- Observability (embedded links / panels)
|
||||
|
||||
---
|
||||
|
||||
## **Non-Functional Requirements**
|
||||
|
||||
- **Security**:
|
||||
- strict RBAC everywhere; deny-by-default
|
||||
- audit every privileged operation
|
||||
- step-up for sensitive actions
|
||||
- CSRF protection for browser sessions
|
||||
- safe secret handling (no secret values rendered after creation unless explicitly permitted)
|
||||
- allowlist outbound integrations (Grafana/Loki/VM URLs, orchestration API endpoints) to prevent SSRF-style abuse
|
||||
- **Reliability**:
|
||||
- control plane operations are idempotent and resilient to partial failures
|
||||
- operations have clear “current state” and do not rely on UI assumptions
|
||||
- **Performance**:
|
||||
- list pages paginate and filter server-side for large fleets
|
||||
- dashboards load with bounded query costs and controlled label cardinality
|
||||
- **Operability**:
|
||||
- control plane itself must be observable (metrics/logs, dashboards, alerts)
|
||||
- every operation must surface preflight checks and post-conditions
|
||||
|
||||
---
|
||||
|
||||
## **Open Questions / Design Constraints (To Resolve During Implementation)**
|
||||
|
||||
- Where does the source of truth live for:
|
||||
- users/sessions/roles (Gateway vs control-plane backing store)?
|
||||
- configs/definitions (NATS KV vs database vs GitOps)?
|
||||
- How should production promotions be modeled:
|
||||
- environment branches, approval workflow, and rollback semantics?
|
||||
- What orchestrator is the production baseline (Docker Swarm per existing PRDs, or will Kubernetes be introduced)?
|
||||
- Where should the job/execution state for long-running operations live:
|
||||
- embedded in the control plane API process, durable store, or NATS workflows?
|
||||
24
control/ui/.gitignore
vendored
Normal file
24
control/ui/.gitignore
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
# Logs
|
||||
logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
pnpm-debug.log*
|
||||
lerna-debug.log*
|
||||
|
||||
node_modules
|
||||
dist
|
||||
dist-ssr
|
||||
*.local
|
||||
|
||||
# Editor directories and files
|
||||
.vscode/*
|
||||
!.vscode/extensions.json
|
||||
.idea
|
||||
.DS_Store
|
||||
*.suo
|
||||
*.ntvs*
|
||||
*.njsproj
|
||||
*.sln
|
||||
*.sw?
|
||||
73
control/ui/README.md
Normal file
73
control/ui/README.md
Normal file
@@ -0,0 +1,73 @@
|
||||
# React + TypeScript + Vite
|
||||
|
||||
This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
|
||||
|
||||
Currently, two official plugins are available:
|
||||
|
||||
- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Oxc](https://oxc.rs)
|
||||
- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/)
|
||||
|
||||
## React Compiler
|
||||
|
||||
The React Compiler is not enabled on this template because of its impact on dev & build performance. To add it, see [this documentation](https://react.dev/learn/react-compiler/installation).
|
||||
|
||||
## Expanding the ESLint configuration
|
||||
|
||||
If you are developing a production application, we recommend updating the configuration to enable type-aware lint rules:
|
||||
|
||||
```js
|
||||
export default defineConfig([
|
||||
globalIgnores(['dist']),
|
||||
{
|
||||
files: ['**/*.{ts,tsx}'],
|
||||
extends: [
|
||||
// Other configs...
|
||||
|
||||
// Remove tseslint.configs.recommended and replace with this
|
||||
tseslint.configs.recommendedTypeChecked,
|
||||
// Alternatively, use this for stricter rules
|
||||
tseslint.configs.strictTypeChecked,
|
||||
// Optionally, add this for stylistic rules
|
||||
tseslint.configs.stylisticTypeChecked,
|
||||
|
||||
// Other configs...
|
||||
],
|
||||
languageOptions: {
|
||||
parserOptions: {
|
||||
project: ['./tsconfig.node.json', './tsconfig.app.json'],
|
||||
tsconfigRootDir: import.meta.dirname,
|
||||
},
|
||||
// other options...
|
||||
},
|
||||
},
|
||||
])
|
||||
```
|
||||
|
||||
You can also install [eslint-plugin-react-x](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-x) and [eslint-plugin-react-dom](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-dom) for React-specific lint rules:
|
||||
|
||||
```js
|
||||
// eslint.config.js
|
||||
import reactX from 'eslint-plugin-react-x'
|
||||
import reactDom from 'eslint-plugin-react-dom'
|
||||
|
||||
export default defineConfig([
|
||||
globalIgnores(['dist']),
|
||||
{
|
||||
files: ['**/*.{ts,tsx}'],
|
||||
extends: [
|
||||
// Other configs...
|
||||
// Enable lint rules for React
|
||||
reactX.configs['recommended-typescript'],
|
||||
// Enable lint rules for React DOM
|
||||
reactDom.configs.recommended,
|
||||
],
|
||||
languageOptions: {
|
||||
parserOptions: {
|
||||
project: ['./tsconfig.node.json', './tsconfig.app.json'],
|
||||
tsconfigRootDir: import.meta.dirname,
|
||||
},
|
||||
// other options...
|
||||
},
|
||||
},
|
||||
])
|
||||
```
|
||||
23
control/ui/eslint.config.js
Normal file
23
control/ui/eslint.config.js
Normal file
@@ -0,0 +1,23 @@
|
||||
import js from '@eslint/js'
|
||||
import globals from 'globals'
|
||||
import reactHooks from 'eslint-plugin-react-hooks'
|
||||
import reactRefresh from 'eslint-plugin-react-refresh'
|
||||
import tseslint from 'typescript-eslint'
|
||||
import { defineConfig, globalIgnores } from 'eslint/config'
|
||||
|
||||
export default defineConfig([
|
||||
globalIgnores(['dist']),
|
||||
{
|
||||
files: ['**/*.{ts,tsx}'],
|
||||
extends: [
|
||||
js.configs.recommended,
|
||||
tseslint.configs.recommended,
|
||||
reactHooks.configs.flat.recommended,
|
||||
reactRefresh.configs.vite,
|
||||
],
|
||||
languageOptions: {
|
||||
ecmaVersion: 2020,
|
||||
globals: globals.browser,
|
||||
},
|
||||
},
|
||||
])
|
||||
13
control/ui/index.html
Normal file
13
control/ui/index.html
Normal file
@@ -0,0 +1,13 @@
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<link rel="icon" type="image/svg+xml" href="/favicon.svg" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>ui</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="root"></div>
|
||||
<script type="module" src="/src/main.tsx"></script>
|
||||
</body>
|
||||
</html>
|
||||
11
control/ui/nginx.conf
Normal file
11
control/ui/nginx.conf
Normal file
@@ -0,0 +1,11 @@
|
||||
server {
|
||||
listen 80;
|
||||
server_name _;
|
||||
|
||||
root /usr/share/nginx/html;
|
||||
index index.html;
|
||||
|
||||
location / {
|
||||
try_files $uri $uri/ /index.html;
|
||||
}
|
||||
}
|
||||
5333
control/ui/package-lock.json
generated
Normal file
5333
control/ui/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
37
control/ui/package.json
Normal file
37
control/ui/package.json
Normal file
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"name": "ui",
|
||||
"private": true,
|
||||
"version": "0.0.0",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
"build": "tsc -b && vite build",
|
||||
"lint": "eslint .",
|
||||
"typecheck": "tsc -b --pretty false",
|
||||
"test": "vitest run",
|
||||
"preview": "vite preview"
|
||||
},
|
||||
"dependencies": {
|
||||
"react": "^19.2.4",
|
||||
"react-dom": "^19.2.4",
|
||||
"react-router-dom": "^7.9.3"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@eslint/js": "^9.39.4",
|
||||
"@testing-library/jest-dom": "^6.9.0",
|
||||
"@testing-library/react": "^16.3.0",
|
||||
"@types/node": "^24.12.0",
|
||||
"@types/react": "^19.2.14",
|
||||
"@types/react-dom": "^19.2.3",
|
||||
"@vitejs/plugin-react": "^6.0.1",
|
||||
"eslint": "^9.39.4",
|
||||
"eslint-plugin-react-hooks": "^7.0.1",
|
||||
"eslint-plugin-react-refresh": "^0.5.2",
|
||||
"globals": "^17.4.0",
|
||||
"jsdom": "^27.0.0",
|
||||
"typescript": "~5.9.3",
|
||||
"typescript-eslint": "^8.57.0",
|
||||
"vite": "^8.0.1",
|
||||
"vitest": "^3.2.4"
|
||||
}
|
||||
}
|
||||
1
control/ui/public/favicon.svg
Normal file
1
control/ui/public/favicon.svg
Normal file
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 9.3 KiB |
24
control/ui/public/icons.svg
Normal file
24
control/ui/public/icons.svg
Normal file
@@ -0,0 +1,24 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg">
|
||||
<symbol id="bluesky-icon" viewBox="0 0 16 17">
|
||||
<g clip-path="url(#bluesky-clip)"><path fill="#08060d" d="M7.75 7.735c-.693-1.348-2.58-3.86-4.334-5.097-1.68-1.187-2.32-.981-2.74-.79C.188 2.065.1 2.812.1 3.251s.241 3.602.398 4.13c.52 1.744 2.367 2.333 4.07 2.145-2.495.37-4.71 1.278-1.805 4.512 3.196 3.309 4.38-.71 4.987-2.746.608 2.036 1.307 5.91 4.93 2.746 2.72-2.746.747-4.143-1.747-4.512 1.702.189 3.55-.4 4.07-2.145.156-.528.397-3.691.397-4.13s-.088-1.186-.575-1.406c-.42-.19-1.06-.395-2.741.79-1.755 1.24-3.64 3.752-4.334 5.099"/></g>
|
||||
<defs><clipPath id="bluesky-clip"><path fill="#fff" d="M.1.85h15.3v15.3H.1z"/></clipPath></defs>
|
||||
</symbol>
|
||||
<symbol id="discord-icon" viewBox="0 0 20 19">
|
||||
<path fill="#08060d" d="M16.224 3.768a14.5 14.5 0 0 0-3.67-1.153c-.158.286-.343.67-.47.976a13.5 13.5 0 0 0-4.067 0c-.128-.306-.317-.69-.476-.976A14.4 14.4 0 0 0 3.868 3.77C1.546 7.28.916 10.703 1.231 14.077a14.7 14.7 0 0 0 4.5 2.306q.545-.748.965-1.587a9.5 9.5 0 0 1-1.518-.74q.191-.14.372-.293c2.927 1.369 6.107 1.369 8.999 0q.183.152.372.294-.723.437-1.52.74.418.838.963 1.588a14.6 14.6 0 0 0 4.504-2.308c.37-3.911-.63-7.302-2.644-10.309m-9.13 8.234c-.878 0-1.599-.82-1.599-1.82 0-.998.705-1.82 1.6-1.82.894 0 1.614.82 1.599 1.82.001 1-.705 1.82-1.6 1.82m5.91 0c-.878 0-1.599-.82-1.599-1.82 0-.998.705-1.82 1.6-1.82.893 0 1.614.82 1.599 1.82 0 1-.706 1.82-1.6 1.82"/>
|
||||
</symbol>
|
||||
<symbol id="documentation-icon" viewBox="0 0 21 20">
|
||||
<path fill="none" stroke="#aa3bff" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.35" d="m15.5 13.333 1.533 1.322c.645.555.967.833.967 1.178s-.322.623-.967 1.179L15.5 18.333m-3.333-5-1.534 1.322c-.644.555-.966.833-.966 1.178s.322.623.966 1.179l1.534 1.321"/>
|
||||
<path fill="none" stroke="#aa3bff" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.35" d="M17.167 10.836v-4.32c0-1.41 0-2.117-.224-2.68-.359-.906-1.118-1.621-2.08-1.96-.599-.21-1.349-.21-2.848-.21-2.623 0-3.935 0-4.983.369-1.684.591-3.013 1.842-3.641 3.428C3 6.449 3 7.684 3 10.154v2.122c0 2.558 0 3.838.706 4.726q.306.383.713.671c.76.536 1.79.64 3.581.66"/>
|
||||
<path fill="none" stroke="#aa3bff" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.35" d="M3 10a2.78 2.78 0 0 1 2.778-2.778c.555 0 1.209.097 1.748-.047.48-.129.854-.503.982-.982.145-.54.048-1.194.048-1.749a2.78 2.78 0 0 1 2.777-2.777"/>
|
||||
</symbol>
|
||||
<symbol id="github-icon" viewBox="0 0 19 19">
|
||||
<path fill="#08060d" fill-rule="evenodd" d="M9.356 1.85C5.05 1.85 1.57 5.356 1.57 9.694a7.84 7.84 0 0 0 5.324 7.44c.387.079.528-.168.528-.376 0-.182-.013-.805-.013-1.454-2.165.467-2.616-.935-2.616-.935-.349-.91-.864-1.143-.864-1.143-.71-.48.051-.48.051-.48.787.051 1.2.805 1.2.805.695 1.194 1.817.857 2.268.649.064-.507.27-.857.49-1.052-1.728-.182-3.545-.857-3.545-3.87 0-.857.31-1.558.8-2.104-.078-.195-.349-1 .077-2.078 0 0 .657-.208 2.14.805a7.5 7.5 0 0 1 1.946-.26c.657 0 1.328.092 1.946.26 1.483-1.013 2.14-.805 2.14-.805.426 1.078.155 1.883.078 2.078.502.546.799 1.247.799 2.104 0 3.013-1.818 3.675-3.558 3.87.284.247.528.714.528 1.454 0 1.052-.012 1.896-.012 2.156 0 .208.142.455.528.377a7.84 7.84 0 0 0 5.324-7.441c.013-4.338-3.48-7.844-7.773-7.844" clip-rule="evenodd"/>
|
||||
</symbol>
|
||||
<symbol id="social-icon" viewBox="0 0 20 20">
|
||||
<path fill="none" stroke="#aa3bff" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.35" d="M12.5 6.667a4.167 4.167 0 1 0-8.334 0 4.167 4.167 0 0 0 8.334 0"/>
|
||||
<path fill="none" stroke="#aa3bff" stroke-linecap="round" stroke-linejoin="round" stroke-width="1.35" d="M2.5 16.667a5.833 5.833 0 0 1 8.75-5.053m3.837.474.513 1.035c.07.144.257.282.414.309l.93.155c.596.1.736.536.307.965l-.723.73a.64.64 0 0 0-.152.531l.207.903c.164.715-.213.991-.84.618l-.872-.52a.63.63 0 0 0-.577 0l-.872.52c-.624.373-1.003.094-.84-.618l.207-.903a.64.64 0 0 0-.152-.532l-.723-.729c-.426-.43-.289-.864.306-.964l.93-.156a.64.64 0 0 0 .412-.31l.513-1.034c.28-.562.735-.562 1.012 0"/>
|
||||
</symbol>
|
||||
<symbol id="x-icon" viewBox="0 0 19 19">
|
||||
<path fill="#08060d" fill-rule="evenodd" d="M1.893 1.98c.052.072 1.245 1.769 2.653 3.77l2.892 4.114c.183.261.333.48.333.486s-.068.089-.152.183l-.522.593-.765.867-3.597 4.087c-.375.426-.734.834-.798.905a1 1 0 0 0-.118.148c0 .01.236.017.664.017h.663l.729-.83c.4-.457.796-.906.879-.999a692 692 0 0 0 1.794-2.038c.034-.037.301-.34.594-.675l.551-.624.345-.392a7 7 0 0 1 .34-.374c.006 0 .93 1.306 2.052 2.903l2.084 2.965.045.063h2.275c1.87 0 2.273-.003 2.266-.021-.008-.02-1.098-1.572-3.894-5.547-2.013-2.862-2.28-3.246-2.273-3.266.008-.019.282-.332 2.085-2.38l2-2.274 1.567-1.782c.022-.028-.016-.03-.65-.03h-.674l-.3.342a871 871 0 0 1-1.782 2.025c-.067.075-.405.458-.75.852a100 100 0 0 1-.803.91c-.148.172-.299.344-.99 1.127-.304.343-.32.358-.345.327-.015-.019-.904-1.282-1.976-2.808L6.365 1.85H1.8zm1.782.91 8.078 11.294c.772 1.08 1.413 1.973 1.425 1.984.016.017.241.02 1.05.017l1.03-.004-2.694-3.766L7.796 5.75 5.722 2.852l-1.039-.004-1.039-.004z" clip-rule="evenodd"/>
|
||||
</symbol>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 4.9 KiB |
184
control/ui/src/App.css
Normal file
184
control/ui/src/App.css
Normal file
@@ -0,0 +1,184 @@
|
||||
/* Counter button: accent-colored pill with a hover border and visible focus ring. */
.counter {
  font-size: 16px;
  padding: 5px 10px;
  border-radius: 5px;
  color: var(--accent);
  background: var(--accent-bg);
  border: 2px solid transparent;
  transition: border-color 0.3s;
  margin-bottom: 24px;

  &:hover {
    border-color: var(--accent-border);
  }
  &:focus-visible {
    /* Keyboard focus indicator offset outside the border. */
    outline: 2px solid var(--accent);
    outline-offset: 2px;
  }
}

/* Hero artwork: three stacked logo layers centered over a relative container. */
.hero {
  position: relative;

  .base,
  .framework,
  .vite {
    /* Center each layer horizontally within the hero. */
    inset-inline: 0;
    margin: 0 auto;
  }

  .base {
    width: 170px;
    position: relative;
    z-index: 0;
  }

  .framework,
  .vite {
    /* Overlay layers are absolutely positioned over .base. */
    position: absolute;
  }

  .framework {
    z-index: 1;
    top: 34px;
    height: 28px;
    /* 3D tilt to sit the framework logo on the base artwork. */
    transform: perspective(2000px) rotateZ(300deg) rotateX(44deg) rotateY(39deg)
      scale(1.4);
  }

  .vite {
    z-index: 0;
    top: 107px;
    height: 26px;
    width: auto;
    /* Same tilt family as .framework, smaller scale, behind it (z-index 0). */
    transform: perspective(2000px) rotateZ(300deg) rotateX(40deg) rotateY(39deg)
      scale(0.8);
  }
}

/* Main content area: vertically stacked, centered, takes remaining height. */
#center {
  display: flex;
  flex-direction: column;
  gap: 25px;
  place-content: center;
  place-items: center;
  flex-grow: 1;

  @media (max-width: 1024px) {
    padding: 32px 20px 24px;
    gap: 18px;
  }
}

/* Footer panels ("docs" / "next steps"): side-by-side, stacked on narrow screens. */
#next-steps {
  display: flex;
  border-top: 1px solid var(--border);
  text-align: left;

  & > div {
    /* Equal-width columns. */
    flex: 1 1 0;
    padding: 32px;
    @media (max-width: 1024px) {
      padding: 24px 20px;
    }
  }

  .icon {
    margin-bottom: 16px;
    width: 22px;
    height: 22px;
  }

  @media (max-width: 1024px) {
    flex-direction: column;
    text-align: center;
  }
}

/* Divider between the docs panel and its sibling; rotates with the layout. */
#docs {
  border-right: 1px solid var(--border);

  @media (max-width: 1024px) {
    border-right: none;
    border-bottom: 1px solid var(--border);
  }
}

/* Link list inside the next-steps panel: pill-style social/docs buttons. */
#next-steps ul {
  list-style: none;
  padding: 0;
  display: flex;
  gap: 8px;
  margin: 32px 0 0;

  .logo {
    height: 18px;
  }

  a {
    color: var(--text-h);
    font-size: 16px;
    border-radius: 6px;
    background: var(--social-bg);
    display: flex;
    padding: 6px 12px;
    align-items: center;
    gap: 8px;
    text-decoration: none;
    transition: box-shadow 0.3s;

    &:hover {
      box-shadow: var(--shadow);
    }
    .button-icon {
      height: 18px;
      width: 18px;
    }
  }

  @media (max-width: 1024px) {
    /* Two buttons per row on narrow screens, each filling its half. */
    margin-top: 20px;
    flex-wrap: wrap;
    justify-content: center;

    li {
      flex: 1 1 calc(50% - 8px);
    }

    a {
      width: 100%;
      justify-content: center;
      box-sizing: border-box;
    }
  }
}

/* Fixed-height spacer above the footer rule. */
#spacer {
  height: 88px;
  border-top: 1px solid var(--border);
  @media (max-width: 1024px) {
    height: 48px;
  }
}

/* Horizontal rule with small triangular end-caps drawn via border tricks. */
.ticks {
  position: relative;
  width: 100%;

  &::before,
  &::after {
    content: '';
    position: absolute;
    top: -4.5px;
    /* Transparent borders form the triangle; one side is colored below. */
    border: 5px solid transparent;
  }

  &::before {
    left: 0;
    border-left-color: var(--border);
  }
  &::after {
    right: 0;
    border-right-color: var(--border);
  }
}
|
||||
8
control/ui/src/App.tsx
Normal file
8
control/ui/src/App.tsx
Normal file
@@ -0,0 +1,8 @@
|
||||
import { RouterProvider } from 'react-router-dom'
import { createBrowserAppRouter } from './app/router'

// Create the router once at module load so it is not rebuilt on re-render.
const router = createBrowserAppRouter()

// Root component: all rendering is delegated to the route tree from app/router.
export default function App() {
  return <RouterProvider router={router} />
}
|
||||
122
control/ui/src/api/client.ts
Normal file
122
control/ui/src/api/client.ts
Normal file
@@ -0,0 +1,122 @@
|
||||
// Identifiers attached to (and echoed back from) each API request so UI
// actions can be correlated with server-side logs/traces.
type RequestIds = {
  requestId: string
  correlationId?: string
  traceparent?: string
}

// localStorage key holding the ids of the most recent request.
const LAST_IDS_STORAGE_KEY = 'control:last_request_ids'
|
||||
|
||||
export class ApiError extends Error {
|
||||
status: number
|
||||
requestId: string
|
||||
correlationId?: string
|
||||
traceparent?: string
|
||||
|
||||
constructor(args: {
|
||||
status: number
|
||||
message: string
|
||||
requestId: string
|
||||
correlationId?: string
|
||||
traceparent?: string
|
||||
}) {
|
||||
super(args.message)
|
||||
this.name = 'ApiError'
|
||||
this.status = args.status
|
||||
this.requestId = args.requestId
|
||||
this.correlationId = args.correlationId
|
||||
this.traceparent = args.traceparent
|
||||
}
|
||||
}
|
||||
|
||||
// In-memory cache of the most recently issued request ids; mirrors the
// localStorage copy so repeat lookups avoid re-parsing JSON.
const state: {
  last?: RequestIds
} = {}
|
||||
|
||||
function isRecord(value: unknown): value is Record<string, unknown> {
|
||||
return typeof value === 'object' && value !== null
|
||||
}
|
||||
|
||||
function loadLastIds(): RequestIds | undefined {
|
||||
try {
|
||||
const raw = localStorage.getItem(LAST_IDS_STORAGE_KEY)
|
||||
if (!raw) return undefined
|
||||
const parsed = JSON.parse(raw) as unknown
|
||||
if (isRecord(parsed) && typeof parsed.requestId === 'string') {
|
||||
const correlationId =
|
||||
typeof parsed.correlationId === 'string' ? parsed.correlationId : undefined
|
||||
const traceparent =
|
||||
typeof parsed.traceparent === 'string' ? parsed.traceparent : undefined
|
||||
return { requestId: parsed.requestId, correlationId, traceparent }
|
||||
}
|
||||
} catch {
|
||||
return undefined
|
||||
}
|
||||
return undefined
|
||||
}
|
||||
|
||||
function persistLastIds(ids: RequestIds) {
|
||||
try {
|
||||
localStorage.setItem(LAST_IDS_STORAGE_KEY, JSON.stringify(ids))
|
||||
} catch {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
function newRequestId(): string {
|
||||
if (typeof crypto !== 'undefined' && 'randomUUID' in crypto) {
|
||||
return crypto.randomUUID()
|
||||
}
|
||||
return `${Date.now()}-${Math.random().toString(16).slice(2)}`
|
||||
}
|
||||
|
||||
export function getLastRequestIds(): RequestIds | undefined {
|
||||
return state.last ?? loadLastIds()
|
||||
}
|
||||
|
||||
// fetch() options extended with tracing controls:
// - correlationId / traceparent: explicit header values to send
// - useLastCorrelationId / useLastTraceparent: when no explicit value is
//   given, reuse the corresponding id from the most recent response
type ApiRequestInit = RequestInit & {
  correlationId?: string
  traceparent?: string
  useLastCorrelationId?: boolean
  useLastTraceparent?: boolean
}
|
||||
|
||||
export async function apiFetch(
|
||||
input: RequestInfo | URL,
|
||||
init?: ApiRequestInit,
|
||||
) {
|
||||
const requestId = newRequestId()
|
||||
|
||||
const headers = new Headers(init?.headers)
|
||||
headers.set('x-request-id', requestId)
|
||||
const last = getLastRequestIds()
|
||||
const correlationId =
|
||||
init?.correlationId ?? (init?.useLastCorrelationId ? last?.correlationId : undefined)
|
||||
const traceparent =
|
||||
init?.traceparent ?? (init?.useLastTraceparent ? last?.traceparent : undefined)
|
||||
|
||||
if (correlationId) headers.set('x-correlation-id', correlationId)
|
||||
if (traceparent) headers.set('traceparent', traceparent)
|
||||
|
||||
const res = await fetch(input, { ...init, headers })
|
||||
const resCorrelationId = res.headers.get('x-correlation-id') ?? correlationId ?? undefined
|
||||
const resTraceparent = res.headers.get('traceparent') ?? traceparent ?? undefined
|
||||
const ids = { requestId, correlationId: resCorrelationId, traceparent: resTraceparent }
|
||||
state.last = ids
|
||||
persistLastIds(ids)
|
||||
|
||||
if (!res.ok) {
|
||||
const text = await res.text().catch(() => '')
|
||||
const err = new ApiError({
|
||||
status: res.status,
|
||||
requestId,
|
||||
correlationId: resCorrelationId,
|
||||
traceparent: resTraceparent,
|
||||
message: `API error ${res.status}${text ? `: ${text}` : ''} (request_id=${requestId}${
|
||||
resCorrelationId ? ` correlation_id=${resCorrelationId}` : ''
|
||||
})`,
|
||||
})
|
||||
throw err
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
179
control/ui/src/api/control.ts
Normal file
179
control/ui/src/api/control.ts
Normal file
@@ -0,0 +1,179 @@
|
||||
import { apiFetch } from './client'
|
||||
import { getAccessToken } from '../auth/token'
|
||||
|
||||
function baseUrl() {
|
||||
const v = import.meta.env.VITE_CONTROL_API_URL as string | undefined
|
||||
return (v ?? 'http://127.0.0.1:8080').replace(/\/$/, '')
|
||||
}
|
||||
|
||||
async function apiJson<T>(path: string): Promise<T> {
|
||||
const controller = new AbortController()
|
||||
const t = window.setTimeout(() => controller.abort(), 2000)
|
||||
|
||||
const token = getAccessToken()
|
||||
const headers: HeadersInit = token ? { Authorization: `Bearer ${token}` } : {}
|
||||
|
||||
try {
|
||||
const res = await apiFetch(`${baseUrl()}${path}`, {
|
||||
headers,
|
||||
signal: controller.signal,
|
||||
useLastCorrelationId: true,
|
||||
useLastTraceparent: true,
|
||||
})
|
||||
return (await res.json()) as T
|
||||
} finally {
|
||||
window.clearTimeout(t)
|
||||
}
|
||||
}
|
||||
|
||||
async function apiPostJson<T>(path: string, body: unknown, idempotencyKey?: string): Promise<T> {
|
||||
const controller = new AbortController()
|
||||
const t = window.setTimeout(() => controller.abort(), 2000)
|
||||
|
||||
const token = getAccessToken()
|
||||
const headers: HeadersInit = {
|
||||
'content-type': 'application/json',
|
||||
...(token ? { Authorization: `Bearer ${token}` } : {}),
|
||||
...(idempotencyKey ? { 'Idempotency-Key': idempotencyKey } : {}),
|
||||
}
|
||||
|
||||
try {
|
||||
const res = await apiFetch(`${baseUrl()}${path}`, {
|
||||
method: 'POST',
|
||||
headers,
|
||||
body: JSON.stringify(body),
|
||||
signal: controller.signal,
|
||||
useLastCorrelationId: true,
|
||||
useLastTraceparent: true,
|
||||
})
|
||||
return (await res.json()) as T
|
||||
} finally {
|
||||
window.clearTimeout(t)
|
||||
}
|
||||
}
|
||||
|
||||
// Fleet health overview: one entry per service with probe results.
export type FleetSnapshot = {
  services: Array<{
    name: string
    base_url: string
    health_ok: boolean
    ready_ok: boolean
    metrics_ok: boolean
  }>
}

// Placement table for one placement kind, versioned by `revision`.
export type PlacementResponse = {
  kind: 'aggregate' | 'projection' | 'runner'
  revision: string
  placements: Array<{ tenant_id: string; targets: string[] }>
}

// Tenant directory with per-kind target assignments.
export type TenantsResponse = {
  tenants: Array<{
    tenant_id: string
    aggregate_targets: string[]
    projection_targets: string[]
    runner_targets: string[]
  }>
}

// Control-plane job with per-step progress; steps reuse the job status union.
// Timestamps are epoch milliseconds; started/finished are null until reached.
export type Job = {
  job_id: string
  status: 'pending' | 'running' | 'succeeded' | 'failed' | 'cancelled'
  steps: Array<{ name: string; status: Job['status']; attempts: number; error?: string | null }>
  error?: string | null
  created_at_ms: number
  started_at_ms?: number | null
  finished_at_ms?: number | null
}

// One admin-action audit record (who did what, to which tenant, and why).
export type AuditEvent = {
  ts_ms: number
  principal_sub: string
  action: string
  tenant_id?: string | null
  reason: string
  job_id?: string | null
}
|
||||
|
||||
/** Fetch per-service health/readiness/metrics status for the whole fleet. */
export function getFleetSnapshot(): Promise<FleetSnapshot> {
  return apiJson('/admin/v1/fleet/snapshot')
}

/** Fetch the current placement table for one placement kind. */
export function getPlacement(kind: 'aggregate' | 'projection' | 'runner'): Promise<PlacementResponse> {
  return apiJson(`/admin/v1/placement/${kind}`)
}

/** List all tenants with their per-kind target assignments. */
export function getTenants(): Promise<TenantsResponse> {
  return apiJson('/admin/v1/tenants')
}

/** Fetch one job (status, steps, timestamps) by id. */
export function getJob(jobId: string): Promise<Job> {
  return apiJson(`/admin/v1/jobs/${jobId}`)
}

/**
 * Request cancellation of a job; resolves once the server responds.
 * NOTE(review): apiPostJson parses the response as JSON before the result is
 * discarded — confirm the cancel endpoint always returns a JSON body (an
 * empty body would make the parse reject).
 */
export function cancelJob(jobId: string): Promise<void> {
  return apiPostJson(`/admin/v1/jobs/${jobId}/cancel`, {}, undefined).then(() => undefined)
}

/** Start an idempotent tenant-drain job; returns the new job id. */
export function startTenantDrainJob(args: {
  tenantId: string
  reason: string
  idempotencyKey: string
}): Promise<{ job_id: string }> {
  return apiPostJson(
    '/admin/v1/jobs/tenant/drain',
    { tenant_id: args.tenantId, reason: args.reason },
    args.idempotencyKey,
  )
}

/** Start an idempotent tenant-migrate job toward a runner target; returns the job id. */
export function startTenantMigrateJob(args: {
  tenantId: string
  runnerTarget: string
  reason: string
  idempotencyKey: string
}): Promise<{ job_id: string }> {
  return apiPostJson(
    '/admin/v1/jobs/tenant/migrate',
    { tenant_id: args.tenantId, runner_target: args.runnerTarget, reason: args.reason },
    args.idempotencyKey,
  )
}

/** Dry-run a tenant migration: returns the step names without executing them. */
export function planTenantMigrate(args: { tenantId: string; runnerTarget: string; reason: string }): Promise<{ steps: string[] }> {
  return apiPostJson('/admin/v1/plan/tenant/migrate', {
    tenant_id: args.tenantId,
    runner_target: args.runnerTarget,
    reason: args.reason,
  })
}

/** Fetch the admin-action audit log. */
export function listAudit(): Promise<{ events: AuditEvent[] }> {
  return apiJson('/admin/v1/audit')
}
|
||||
|
||||
// Summary row for a Docker Swarm service as reported by the control API.
// All fields other than `name` may be absent or null.
export type SwarmService = {
  name: string
  image?: string | null
  mode?: string | null
  replicas?: string | null
  updated_at?: string | null
}

// One task (container instance) belonging to a swarm service.
export type SwarmTask = {
  id: string
  service: string
  node?: string | null
  desired_state?: string | null
  current_state?: string | null
  error?: string | null
}

/** List the swarm services known to the control plane. */
export function getSwarmServices(): Promise<{ services: SwarmService[] }> {
  return apiJson('/admin/v1/swarm/services')
}

/** List tasks for one swarm service; the name is URL-escaped for the path segment. */
export function getSwarmTasks(serviceName: string): Promise<{ service: string; tasks: SwarmTask[] }> {
  return apiJson(`/admin/v1/swarm/services/${encodeURIComponent(serviceName)}/tasks`)
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user