Monorepo consolidation: workspace, shared types, transport plans, docker/swarm assets
Some checks failed
ci / rust (push) Failing after 2m34s
ci / ui (push) Failing after 30s

2026-03-30 11:40:42 +03:00
parent 7e7041cf8b
commit 1298d9a3df
246 changed files with 55434 additions and 0 deletions

27
gateway/.gitignore vendored Normal file

@@ -0,0 +1,27 @@
/target/
/target-*/
**/target/
*.rs.bk
*.pdb
*.dSYM/
*.orig
*.rej
*.log
.DS_Store
.idea/
.vscode/
*.swp
*.swo
.env
.env.*
.envrc
.direnv/
*.mdbx
*.mdbx-*
/data/
/tmp/
/coverage/
lcov.info
*.profraw
*.profdata
docker-compose.override.yml

48
gateway/Cargo.toml Normal file

@@ -0,0 +1,48 @@
[package]
name = "gateway"
version = "0.1.0"
edition = "2021"
[dependencies]
shared = { path = "../shared" }
anyhow = "1"
argon2 = "0.5"
async-nats = "0.39"
async-trait = "0.1"
axum = { version = "0.7", features = ["json"] }
base32 = "0.5"
chrono = { version = "0.4", features = ["serde"] }
edge_storage = { version = "0.1", registry = "madapes" }
edge-logger-client = { version = "0.1", registry = "madapes" }
futures = "0.3"
hex = "0.4"
hmac = "0.12"
http = "1"
jsonwebtoken = "9"
libmdbx = "0.6"
metrics = "0.23"
metrics-exporter-prometheus = "0.15"
rand_core = "0.6"
prost = "0.13"
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
serde = { version = "1", features = ["derive", "rc"] }
serde_json = "1"
serde_yaml = "0.9"
sha2 = "0.10"
sha1 = "0.10"
subtle = "2"
thiserror = "2"
tokio = { version = "1", features = ["macros", "rt-multi-thread", "signal"] }
tonic = { version = "0.12", default-features = false, features = ["codegen", "prost", "transport", "tls"] }
tower = { version = "0.5", features = ["timeout", "util"] }
tower-http = { version = "0.6", features = ["limit", "request-id", "trace"] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json"] }
urlencoding = "2"
uuid = { version = "1", features = ["v4"] }
[dev-dependencies]
[build-dependencies]
tonic-build = { version = "0.12", default-features = false, features = ["prost"] }
protoc-bin-vendored = "3"

62
gateway/DEPLOYMENT.md Normal file

@@ -0,0 +1,62 @@
# Deployment Notes
## Swarm Ingress and TLS
Gateway exposes:
- HTTP: `:8080`
- gRPC: `:8081`
Recommended pattern in Swarm:
- Terminate TLS at an ingress proxy (Traefik / Nginx / Envoy) on the overlay network.
- Route HTTP and gRPC to the Gateway service by port.
- Prefer L7 routing (Host + Path for HTTP, SNI for gRPC) and keep the Swarm routing mesh disabled unless you explicitly want it.
## Secrets
The Swarm stack expects these secrets to exist:
- `gateway_jwt_secrets` (comma-separated or newline-separated signing secrets)
- `google_oidc_client_id`
- `google_oidc_client_secret`
The container reads them via:
- `GATEWAY_JWT_SECRETS_FILE`
- `GOOGLE_OIDC_CLIENT_ID_FILE`
- `GOOGLE_OIDC_CLIENT_SECRET_FILE`
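A minimal sketch of the container-side loading (assuming `anyhow`; the helper names are illustrative, not the Gateway's actual API):
```rust
use std::{env, fs};

/// Read a secret whose location arrives via a `*_FILE` env var, trimming the
/// trailing newline Docker Swarm appends to secret files.
fn read_secret_file(var: &str) -> anyhow::Result<String> {
    let path = env::var(var)?;
    Ok(fs::read_to_string(path)?.trim_end().to_string())
}

fn load_jwt_secrets() -> anyhow::Result<Vec<String>> {
    // Accept comma- or newline-separated signing secrets, per the list above.
    Ok(read_secret_file("GATEWAY_JWT_SECRETS_FILE")?
        .split(|c| c == ',' || c == '\n')
        .map(str::trim)
        .filter(|s| !s.is_empty())
        .map(str::to_string)
        .collect())
}
```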
## Internal mTLS (Optional)
Gateway can be configured to use mTLS when calling internal upstreams.
HTTP upstream (reqwest):
- `GATEWAY_INTERNAL_CA_CERT_PEM_FILE`
- `GATEWAY_INTERNAL_IDENTITY_PEM_FILE` (combined cert + key PEM)
gRPC upstream (tonic):
- `GATEWAY_INTERNAL_GRPC_TLS` = `true|false` (or use `https://` upstream URLs)
- `GATEWAY_INTERNAL_GRPC_CA_CERT_PEM_FILE`
- `GATEWAY_INTERNAL_GRPC_CLIENT_CERT_PEM_FILE`
- `GATEWAY_INTERNAL_GRPC_CLIENT_KEY_PEM_FILE`
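A hedged sketch of the HTTP side with `reqwest` and rustls, driven by the variables above (error handling via `anyhow` is an assumption):
```rust
use std::{env, fs};

/// Build the internal upstream HTTP client, enabling mTLS only when the
/// corresponding files are configured.
fn internal_http_client() -> anyhow::Result<reqwest::Client> {
    let mut builder = reqwest::Client::builder().use_rustls_tls();
    if let Ok(path) = env::var("GATEWAY_INTERNAL_CA_CERT_PEM_FILE") {
        builder = builder.add_root_certificate(reqwest::Certificate::from_pem(&fs::read(path)?)?);
    }
    if let Ok(path) = env::var("GATEWAY_INTERNAL_IDENTITY_PEM_FILE") {
        // Combined cert + key PEM, as noted above.
        builder = builder.identity(reqwest::Identity::from_pem(&fs::read(path)?)?);
    }
    Ok(builder.build()?)
}
```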
## HA Validation (Manual)
With `gateway` running at `replicas: 2`:
- Verify `/ready` stays healthy during rolling updates.
- Verify refresh rotation works across replicas (no sticky sessions):
- Sign in → refresh from one replica → refresh again against the other replica and confirm the old token is rejected.
- Verify admin IAM changes are visible across replicas:
- Create a role/assignment via `/admin/iam` on one replica → call an authorized endpoint via the other replica.
### Suggested Procedure
1. Deploy the stack:
- `docker stack deploy -c swarm/stacks/platform.yml cloudlysis`
2. Confirm `gateway` has 2 running tasks:
- `docker service ps cloudlysis_gateway`
3. Smoke readiness (at least one replica reachable through ingress):
- `curl -fsS http://localhost:8080/ready`
4. Refresh across replicas:
- Run the refresh flow twice while forcing traffic to different replicas (hostnames or per-task published ports if you use an ingress proxy).
5. Rolling update guard:
- Update the image tag and redeploy, then watch:
- `docker service ps cloudlysis_gateway`
- `curl -fsS http://localhost:8080/ready` in a loop

339
gateway/DEVELOPMENT_PLAN.md Normal file

@@ -0,0 +1,339 @@
# Development Plan: Gateway
## Overview
This plan breaks down the Gateway implementation into milestones ordered by dependency. Each milestone includes:
- **Tasks** with clear deliverables
- **Test Requirements** (unit tests + tautological tests + integration tests where applicable)
- **Dependencies** on previous milestones
**Development Approach:**
1. Complete one milestone at a time
2. Write tests before implementation (TDD where applicable)
3. Do not start the next milestone until the current milestone's tests are passing (green)
4. Mark tasks complete with `[x]` as you progress
---
## Milestone 1: Project Foundation
**Goal:** Create the Gateway service as a Rust project aligned with existing node conventions (Axum + Tokio + tracing + Prometheus metrics).
### Tasks
- [x] **1.1** Initialize Cargo project
- Create `src/lib.rs` and `src/main.rs`
- Establish module layout for: http, grpc, authn, authz, routing, upstream, observability, config, storage
- [x] **1.2** Choose and wire core dependencies (aligned with existing services)
- HTTP: `axum`
- gRPC: `tonic`
- Runtime: `tokio`
- Serialization: `serde`, `serde_json`
- Errors: `thiserror`, `anyhow`
- Telemetry: `tracing`, `metrics-exporter-prometheus` or existing metrics pattern in the codebase
- [x] **1.3** Add baseline runtime endpoints
- `GET /health`, `GET /ready`, `GET /metrics`
- Structured logs with request id propagation
### Tests
- [x] **T1.1** Project compiles
- [x] **T1.2** `GET /health` returns 200
- [x] **T1.3** Tautological test: core state types are Send + Sync
---
## Milestone 2: Persistent State (Auth + RBAC + Sessions) for HA
**Goal:** Define where Gateway state lives so the service can run as **HA (max 2 replicas)** without sticky sessions and without losing auth/admin consistency.
### Dependencies
- Milestone 1 (project foundation)
### Tasks
- [x] **2.1** Choose and implement the backing store for identity + authorization state
- Recommended default for platform alignment: NATS JetStream KV buckets for:
- users
- identities (OIDC links)
- password credential records (hash only)
- refresh token/session records (hash only, revocable, rotating)
- MFA enrollments + recovery codes (hash only)
- rights/roles/assignments
- audit log index (append-only model)
- [x] **2.2** Define storage schema + versioning
- Key naming conventions
- JSON shapes (forward compatible)
- Migration strategy for schema changes
- [x] **2.3** Implement storage client abstraction
- CRUD primitives with compare-and-set semantics where needed (e.g., refresh rotation; see the sketch after this task list)
- Pagination/scan strategy for admin listing endpoints
- Consistent error mapping for storage failures
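A sketch of the refresh-rotation compare-and-set over NATS JetStream KV (the recommended store above) with the `async-nats` dependency; the key layout and helper name are illustrative:
```rust
use async_nats::jetstream::kv::Store;

/// Rotate a refresh token exactly once: the KV update succeeds only if the
/// entry revision is unchanged, so two concurrent refreshes cannot both
/// consume the same token.
async fn rotate_refresh_token(kv: &Store, key: &str, new_hash: Vec<u8>) -> anyhow::Result<()> {
    let entry = kv
        .entry(key) // e.g. "v1/sessions/<session_id>" (illustrative layout)
        .await?
        .ok_or_else(|| anyhow::anyhow!("unknown session"))?;
    // Compare-and-set on the observed revision; a concurrent rotation bumps
    // the revision and makes this update fail instead of double-spending.
    kv.update(key, new_hash.into(), entry.revision).await?;
    Ok(())
}
```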
### Tests
- [x] **T2.1** Sensitive values are stored only as hashes (reset tokens, refresh tokens, recovery codes)
- [x] **T2.2** Refresh token rotation is atomic (cannot be used twice under concurrency)
- [x] **T2.3** Tautological test: storage client is Send + Sync
---
## Milestone 3: Routing Config + Service Discovery
**Goal:** Implement the routing layer described in [prd.md](file:///Users/vlad/Developer/cloudlysis/gateway/prd.md), supporting independent placement per service kind and hot reload.
### Dependencies
- Milestone 1 (project foundation)
### Exit Criteria
- All Milestone 3 tests pass
### Tasks
- [x] **3.1** Define routing config model (in-memory)
- Placement maps per service kind: `aggregate_placement`, `projection_placement`, `runner_placement`
- Shard directory per service kind: `*_shards[shard_id] -> endpoint(s)`
- Revision tracking and last-known-good semantics
- [x] **3.2** Implement config sources
- Static file config for local development
- NATS JetStream KV watcher for production
- [x] **3.3** Implement routing decision API
- `(tenant_id, service_kind) -> selected endpoint` (sketched after this task list)
- Admin introspection: `GET /admin/routing`
- [x] **3.4** Implement config reload semantics
- `POST /admin/routing/reload` to force refresh
- Watcher-based reload that updates atomically
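A sketch of the two-layer model and decision API; field names are illustrative, though the error variants match what the admin module later matches on:
```rust
use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub enum ServiceKind { Aggregate, Projection, Runner }

pub enum RoutingError { UnknownTenant, MissingShard, EmptyShard }

/// Immutable routing snapshot; hot reload builds a new table and swaps it
/// atomically (e.g., behind an `Arc`), so readers never see partial state.
pub struct RoutingTable {
    pub revision: u64,
    placement: HashMap<(ServiceKind, String), String>, // (kind, tenant) -> shard
    shards: HashMap<(ServiceKind, String), Vec<String>>, // (kind, shard) -> endpoints
}

impl RoutingTable {
    pub fn resolve(&self, tenant_id: &str, kind: ServiceKind) -> Result<String, RoutingError> {
        let shard = self
            .placement
            .get(&(kind, tenant_id.to_string()))
            .ok_or(RoutingError::UnknownTenant)?;
        let endpoints = self
            .shards
            .get(&(kind, shard.clone()))
            .ok_or(RoutingError::MissingShard)?;
        // First endpoint for now; health-aware selection is a later phase.
        endpoints.first().cloned().ok_or(RoutingError::EmptyShard)
    }
}
```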
### Tests
- [x] **T3.1** Routing resolves endpoints for `(tenant_id, service_kind)` correctly
- [x] **T3.2** Hot reload swaps routing tables atomically (no partial reads)
- [x] **T3.3** Unknown tenant returns a consistent, typed routing error
---
## Milestone 4: AuthN Core (Tokens, Passwords, OIDC, MFA)
**Goal:** Implement the authentication layer and the public AuthN HTTP APIs described in the PRD: signup/signin/signout/refresh/forgot/reset and MFA primitives.
### Dependencies
- Milestone 1 (project foundation)
- Milestone 2 (persistent state)
### Exit Criteria
- All Milestone 4 tests pass
### Tasks
- [x] **4.1** Implement token model
- Access token (short-lived)
- Refresh token (rotating, revocable)
- Key rotation for signing keys
- [x] **4.2** Implement password flows (Argon2id sketch after this task list)
- `POST /v1/auth/signup`, `POST /v1/auth/signin`, `POST /v1/auth/signout`, `POST /v1/auth/refresh`
- Forgot/reset: `POST /v1/auth/forgot`, `POST /v1/auth/reset`
- [x] **4.3** Implement Google OIDC integration points
- `POST /v1/auth/oidc/google/start`
- `GET /v1/auth/oidc/google/callback`
- Account linking rules
- [x] **4.4** Implement MFA (TOTP) primitives (TOTP sketch after this task list)
- Enrollment start/confirm
- Challenge and verification
- Recovery codes
- [x] **4.5** Abuse protections
- Rate limits for signin/forgot/reset
- Generic “account not found” responses where appropriate
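Two hedged sketches using only crates already in `Cargo.toml`. First, Argon2id hashing and verification for task 4.2 (the `argon2` crate defaults to Argon2id):
```rust
use argon2::password_hash::{rand_core::OsRng, PasswordHash, PasswordHasher, PasswordVerifier, SaltString};
use argon2::Argon2;

/// Argon2id with a per-user random salt; parameters come from
/// `Argon2::default()` and would be tuned for production.
fn hash_password(password: &str) -> argon2::password_hash::Result<String> {
    let salt = SaltString::generate(&mut OsRng);
    Ok(Argon2::default()
        .hash_password(password.as_bytes(), &salt)?
        .to_string())
}

fn verify_password(password: &str, stored_phc: &str) -> bool {
    PasswordHash::new(stored_phc)
        .and_then(|hash| Argon2::default().verify_password(password.as_bytes(), &hash))
        .is_ok()
}
```
Second, a TOTP check per RFC 6238 for task 4.4, over the `hmac`/`sha1` dependencies; production code would add constant-time comparison (`subtle`) and replay protection:
```rust
use hmac::{Hmac, Mac};
use sha1::Sha1;

/// RFC 6238 TOTP built on the `hmac`/`sha1` crates already in Cargo.toml.
fn totp_code(secret: &[u8], unix_secs: u64, step: u64) -> u32 {
    let counter = (unix_secs / step).to_be_bytes();
    let mut mac = Hmac::<Sha1>::new_from_slice(secret).expect("HMAC accepts any key length");
    mac.update(&counter);
    let digest = mac.finalize().into_bytes();
    let offset = (digest[19] & 0x0f) as usize; // dynamic truncation (RFC 4226)
    let bin = u32::from_be_bytes([
        digest[offset] & 0x7f,
        digest[offset + 1],
        digest[offset + 2],
        digest[offset + 3],
    ]);
    bin % 1_000_000 // six-digit code
}

fn verify_totp(secret: &[u8], code: u32, unix_secs: u64) -> bool {
    // Accept the previous, current, and next 30-second window for clock skew.
    (-1i64..=1).any(|w| {
        let t = (unix_secs as i64 + w * 30).max(0) as u64;
        totp_code(secret, t, 30) == code
    })
}
```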
### Tests
- [x] **T4.1** Password hashing/verification works (Argon2id)
- [x] **T4.2** Refresh token rotation: old refresh token is invalid after use
- [x] **T4.3** Forgot/reset tokens are one-time and expire
- [x] **T4.4** MFA TOTP enrollment and challenge succeed for valid codes and fail for invalid
---
## Milestone 5: AuthZ (RBAC) + Tenant Enforcement
**Goal:** Enforce authorization decisions at the Gateway boundary, including tenant selection rules for `x-tenant-id`.
### Dependencies
- Milestone 4 (authn)
### Exit Criteria
- All Milestone 5 tests pass
### Tasks
- [x] **5.1** Define RBAC model
- Rights (permissions), roles, assignments (principal ↔ tenant ↔ role)
- Platform admin vs tenant admin vs tenant member scoping rules
- [x] **5.2** Implement authorization engine (decision shape sketched after this task list)
- Inputs: principal, tenant_id, action, resource attributes (aggregate_type, view_type)
- Outputs: allow/deny with reason
- [x] **5.3** Enforce `x-tenant-id` rules
- Required on tenant-scoped endpoints
- Validated format and tenant membership checks
- [x] **5.4** Add consistent error envelope mapping (401/403/400)
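A minimal sketch of the decision shape; the wildcard convention follows the `"*"` platform scope used elsewhere in the Gateway, while the right-string format is an assumption:
```rust
/// Allow/deny decision with a reason, per task 5.2. Assignments are the
/// (tenant_id, right) pairs resolved for the principal; "*" is the
/// platform-wide tenant scope.
pub struct Decision {
    pub allow: bool,
    pub reason: &'static str,
}

pub fn authorize(assignments: &[(String, String)], tenant_id: &str, action: &str) -> Decision {
    let allowed = assignments
        .iter()
        .any(|(t, r)| (t.as_str() == tenant_id || t.as_str() == "*") && r.as_str() == action);
    if allowed {
        Decision { allow: true, reason: "matching role assignment" }
    } else {
        Decision { allow: false, reason: "no role grants this action for this tenant" }
    }
}
```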
### Tests
- [x] **T5.1** Tenant spoofing is rejected (principal lacks membership)
- [x] **T5.2** Role assignment enables expected actions and denies others
- [x] **T5.3** Missing `x-tenant-id` on tenant routes returns 400
---
## Milestone 6: Upstream Proxying (Aggregate / Projection / Runner)
**Goal:** Route authenticated and authorized requests to the node services.
### Dependencies
- Milestone 3 (routing)
- Milestone 5 (authz)
### Exit Criteria
- All Milestone 6 tests pass
### Tasks
- [x] **6.1** Aggregate submit command proxy
- gRPC server implementing `aggregate.gateway.v1.CommandService/SubmitCommand`
- HTTP wrapper `POST /v1/commands/{aggregate_type}/{aggregate_id}`
- Propagate `x-tenant-id` and correlation metadata (see the metadata sketch after this task list)
- Ensure safe retry semantics using `command_id` idempotency
- [x] **6.2** Projection query proxy
- `POST /v1/query/{view_type}` forwarding to Projection query endpoint once available
- [x] **6.3** Runner admin passthrough (admin-only)
- `/admin/runner/*` forwarding with strict authorization
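A sketch of the metadata propagation from 6.1 using `tonic`; the helper is illustrative, the header names come from the PRD:
```rust
use tonic::metadata::MetadataValue;
use tonic::{Request, Status};

/// Attach tenant and correlation metadata before forwarding to the upstream
/// Aggregate gRPC service.
fn with_forward_metadata<T>(
    mut req: Request<T>,
    tenant_id: &str,
    correlation_id: &str,
) -> Result<Request<T>, Status> {
    let tenant = MetadataValue::try_from(tenant_id)
        .map_err(|_| Status::invalid_argument("invalid tenant id"))?;
    let correlation = MetadataValue::try_from(correlation_id)
        .map_err(|_| Status::invalid_argument("invalid correlation id"))?;
    req.metadata_mut().insert("x-tenant-id", tenant);
    req.metadata_mut().insert("x-correlation-id", correlation);
    Ok(req)
}
```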
### Tests
- [x] **T6.1** gRPC SubmitCommand forwards tenant metadata and returns upstream events
- [x] **T6.2** HTTP command endpoint returns the same shape as gRPC response
- [x] **T6.3** Query endpoint enforces tenant scoping and denies unauthorized callers
---
## Milestone 7: Admin IAM APIs (Users, Roles, Rights)
**Goal:** Expose the admin IAM endpoints for the Admin UI node to manage authn/authz data.
### Dependencies
- Milestone 4 (authn)
- Milestone 5 (authz)
### Exit Criteria
- All Milestone 7 tests pass
### Tasks
- [x] **7.1** Implement admin IAM endpoints
- Users CRUD and disable/delete
- Identities link/unlink (OIDC), manage password credentials
- Rights CRUD, roles CRUD, role↔rights management
- Assignments CRUD (principal ↔ tenant ↔ role)
- Service accounts credential create/rotate and tenant role assignment
- MFA admin actions (reset MFA, revoke recovery codes)
- Session revocation for user (global signout)
- [x] **7.2** Implement audit trail for admin IAM actions
- Immutable record of actor, action, target, tenant scope, timestamp, request metadata
### Tests
- [x] **T7.1** Only platform/tenant admins can access relevant endpoints
- [x] **T7.2** All admin mutations emit an audit record
- [x] **T7.3** Assignment changes immediately affect authorization decisions
---
## Milestone 8: Rebalancing Operations (Control Plane Hooks)
**Goal:** Provide the pieces needed to support tenant rebalancing as described in the PRD: visibility, readiness gates, and safe cutover support.
### Dependencies
- Milestone 3 (routing config)
- Milestone 6 (upstream proxying)
### Exit Criteria
- All Milestone 8 tests pass
### Tasks
- [x] **8.1** Expose placement introspection and status
- Current placement revision per service kind
- Effective routing decisions for a given tenant (admin-only)
- [x] **8.2** Define and implement readiness gates used by rebalancer
- Projection: warmup/catchup signal (lag)
- Runner: tenant drained / checkpoint stable signal
- Aggregate: tenant drain and state availability signal (as defined by upstream changes)
- [x] **8.3** Add operator-facing rebalancing endpoints (optional if a separate rebalancer service exists)
- Plan/apply/rollback APIs with strong authorization
### Tests
- [x] **T8.1** Placement revision changes are visible immediately and atomically
- [x] **T8.2** Rebalancing guardrails prevent cutover when target shard is not ready
---
## Milestone 9: Docker Swarm Deployment + HA (Max 2 Replicas)
**Goal:** Define and validate the Docker Swarm architecture for Gateway, including HA behavior with at most **2 Gateway replicas**.
### Dependencies
- Milestone 1 (health/ready/metrics)
- Milestone 2 (persistent state suitable for HA)
- Milestone 6 (proxying) for end-to-end smoke tests
### Exit Criteria
- All Milestone 9 tests pass
- The platform stack (`swarm/stacks/platform.yml`) can deploy the Gateway with `replicas: 2` and serve traffic during rolling updates
### Tasks
- [x] **9.1** Build container image
- Dockerfile, multi-stage build, minimal runtime image
- Embed build metadata (version, git sha)
- [x] **9.2** Define Swarm service topology (2 nodes max)
- `gateway` service with `deploy.replicas: 2`
- Healthcheck based on `/ready`
- Rolling update strategy (start-first), rollback policy on failure
- Network: overlay network for internal traffic to NATS and nodes
- [x] **9.3** Define ingress and TLS termination strategy
- Swarm routing mesh or an ingress proxy (document choice in stack)
- Ensure HTTP and gRPC can be routed correctly
- [x] **9.4** Secrets and config distribution
- OIDC client secrets, JWT signing keys (rotation-ready), NATS credentials
- Use Swarm secrets/configs instead of environment variables for secrets where possible
- [x] **9.5** HA behavior validation
- Run two replicas and ensure:
- refresh token rotation works across replicas (no stickiness)
- admin IAM updates are visible from both replicas
- in-flight requests survive a single replica restart
### Tests
- [x] **T9.1** `swarm/stacks/platform.yml` parses as valid YAML
- [x] **T9.2** Smoke: deploy 2 replicas and confirm `/ready` is healthy on both
- [x] **T9.3** Rolling update does not drop readiness below 1 available replica
- [x] **T9.4** Auth session/refresh works across replicas (no sticky sessions required)
---
## Milestone 10: Observability + Hardening
**Goal:** Make the Gateway production-ready with robust telemetry and safety defaults.
### Dependencies
- Milestone 6 (proxying)
### Exit Criteria
- All Milestone 10 tests pass
### Tasks
- [x] **10.1** Structured logs with correlation
- `request_id`, `trace_id`, principal id, tenant id (when present), upstream target
- [x] **10.2** Metrics
- Request counts/latency, auth failures, upstream errors, routing misses, rate limit blocks
- [x] **10.3** Security hardening
- CSRF protections for cookie-based flows
- JWT key rotation strategy and config
- mTLS/service auth boundary for internal upstreams
- [x] **10.4** Load and failure testing strategy
- Soak tests for routing reload + auth endpoints
- Backpressure/timeouts/circuit breaker verification
- [ ] **10.5** Correlation and trace context propagation (Gateway as source of truth; middleware sketch after this task list)
- Accept inbound `x-correlation-id` and `traceparent` on HTTP and gRPC requests
- If missing, generate `x-correlation-id` at the start of request handling and start a new trace
- Echo `x-correlation-id` (and `traceparent` when applicable) on responses
- Propagate `x-correlation-id` and `traceparent` to upstream nodes (Aggregate/Projection/Runner) and record them in request spans/log fields
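A sketch of 10.5 as axum 0.7 middleware (registered with `axum::middleware::from_fn`); `traceparent` would follow the same pattern:
```rust
use axum::{extract::Request, http::HeaderValue, middleware::Next, response::Response};

/// Reuse an inbound `x-correlation-id` or mint one, make it visible to
/// handlers and upstream clients, and echo it on the response.
async fn correlation_id(mut req: Request, next: Next) -> Response {
    let id = req
        .headers()
        .get("x-correlation-id")
        .and_then(|v| v.to_str().ok())
        .map(str::to_owned)
        .unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
    if let Ok(value) = HeaderValue::from_str(&id) {
        req.headers_mut().insert("x-correlation-id", value.clone());
        let mut resp = next.run(req).await;
        resp.headers_mut().insert("x-correlation-id", value);
        return resp;
    }
    next.run(req).await
}
```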
### Tests
- [x] **T10.1** Metrics include expected labels and counters increment correctly
- [x] **T10.2** Secrets never appear in logs in representative error cases
- [x] **T10.3** Rate limits trigger under abusive patterns
- [ ] **T10.4** Gateway generates `x-correlation-id` when missing and echoes it on responses
- [ ] **T10.5** Gateway propagates `x-correlation-id` and `traceparent` to upstream calls and includes them in logs/spans

44
gateway/LOAD_TESTING.md Normal file

@@ -0,0 +1,44 @@
# Load and Failure Testing Strategy
## Goals
- Verify the Gateway stays responsive under sustained traffic.
- Verify auth flows behave correctly under concurrency.
- Verify routing reloads are atomic and safe under load.
- Verify upstream failures are bounded (timeouts) and observable (metrics/logs).
## Scenarios
### AuthN
- Sign up once, then:
- Burst sign-in attempts to verify rate limits and correct 401/429 behavior.
- Parallel refresh calls to verify refresh rotation correctness (see the probe sketch below).
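A sketch of that probe (endpoint path and payload shape assumed from the Gateway PRD); exactly one of the racing calls should win the rotation:
```rust
use serde_json::json;

/// Fire N refresh calls with the same refresh token and assert that exactly
/// one succeeds; every other call must lose the compare-and-set.
async fn refresh_race(base: &str, refresh_token: &str) -> anyhow::Result<()> {
    let client = reqwest::Client::new();
    let calls = (0..8).map(|_| {
        let client = client.clone();
        let url = format!("{base}/v1/auth/refresh");
        let body = json!({ "refresh_token": refresh_token });
        async move { client.post(url).json(&body).send().await }
    });
    let results = futures::future::join_all(calls).await;
    let ok = results
        .iter()
        .filter(|r| matches!(r, Ok(resp) if resp.status().is_success()))
        .count();
    anyhow::ensure!(ok == 1, "expected exactly one winner, got {ok}");
    Ok(())
}
```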
### Routing Reload
- Run steady traffic to:
- `POST /v1/query/{view_type}`
- `POST /v1/commands/{aggregate_type}/{aggregate_id}`
- Trigger `POST /admin/routing/reload` repeatedly and verify:
- No 500s from partial routing table reads.
- Routing decisions switch only at revision boundaries.
### Upstream Failure Modes
- Configure routing to a shard endpoint that:
- Refuses connections (ECONNREFUSED)
- Hangs (no response)
- Returns 5xx
- Verify:
- Gateway timeouts are enforced.
- Errors are surfaced as 5xx to callers.
- `gateway_http_requests_total` and duration histograms capture the failures.
### HA Behavior (Swarm)
- Run `gateway` with 2 replicas and no sticky sessions.
- Verify:
- Refresh works across replicas.
- IAM updates become effective immediately on both replicas.
- Rolling update keeps at least 1 replica ready.

12
gateway/build.rs Normal file

@@ -0,0 +1,12 @@
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let proto_path = "../aggregate/proto/aggregate.proto";
    let proto_dir = "../aggregate/proto";
    // Use the vendored protoc (declared in [build-dependencies]) so builds
    // do not depend on a system protoc install.
    std::env::set_var("PROTOC", protoc_bin_vendored::protoc_bin_path()?);
    tonic_build::configure()
        .build_server(true)
        .build_client(true)
        .compile_protos(&[proto_path], &[proto_dir])?;
    println!("cargo:rerun-if-changed={}", proto_path);
    Ok(())
}

192
gateway/external_prd.md Normal file
View File

@@ -0,0 +1,192 @@
### External PRD: Changes Required in Aggregate, Projection, Runner
This document captures the work needed outside the Gateway to support:
- Tenant-aware routing via `x-tenant-id`
- Independent horizontal scalability of Aggregate, Projection, Runner
- A safe mechanism for tenant rebalancing per service kind
---
## **Target State**
### Independent Placements
Each service kind has its own placement map:
- `aggregate_placement[tenant_id] -> aggregate_shard_id`
- `projection_placement[tenant_id] -> projection_shard_id`
- `runner_placement[tenant_id] -> runner_shard_id`
Each shard is a replica set that can scale independently.
### Rebalancing Contract (Per Service Kind)
All nodes MUST support:
- Dynamic placement updates (watch NATS KV or reload config)
- A drain mechanism that can target a specific tenant (stop acquiring new work for that tenant, finish in-flight, report status)
- Clear readiness semantics that reflect whether the node will accept work for a tenant
Additionally, all nodes SHOULD converge on the same operational contract:
- A per-tenant “accepting” gate (can this shard accept new work/queries/commands for tenant X?)
- A per-tenant “drained” signal (no in-flight work remains for tenant X)
- A per-tenant warmup/catchup signal where relevant (projection lag, aggregate snapshot availability)
---
## **Aggregate: Required Changes**
### 1) Expose a Real Command API (Gateway Upstream)
Today, Aggregate has internal command handling types (e.g., `CommandServer`) but its running HTTP server only exposes health/metrics/admin endpoints ([aggregate/http_server.rs](file:///Users/vlad/Developer/cloudlysis/aggregate/src/http_server.rs#L15-L82), [aggregate/server/mod.rs](file:///Users/vlad/Developer/cloudlysis/aggregate/src/server/mod.rs#L81-L213)).
Aggregate MUST expose one of the following upstream APIs for the Gateway to call:
- **Option A (Recommended)**: gRPC server implementing `aggregate.gateway.v1.CommandService/SubmitCommand` compatible with [aggregate.proto](file:///Users/vlad/Developer/cloudlysis/aggregate/proto/aggregate.proto#L1-L31).
- **Option B**: HTTP endpoint for command submission (REST), with a stable request/response shape that the Gateway can proxy.
### 2) Tenant Placement Enforcement
Aggregate MUST enforce “hosted tenants” so independent scaling is safe:
- If an Aggregate shard/node is not assigned a tenant, it MUST reject commands for that tenant (e.g., `403` or `503` with retriable hint depending on whether the issue is authorization vs placement).
- Aggregate SHOULD maintain an in-memory allowlist of hosted tenants that is driven by:
- NATS KV placement watcher (preferred), or
- Hot-reloaded config pushed via `/admin/reload`
Aggregate already has admin hooks for drain/reload, but they are currently generic and/or illustrative ([aggregate/http_server.rs](file:///Users/vlad/Developer/cloudlysis/aggregate/src/http_server.rs#L15-L72), [aggregate/server/mod.rs](file:///Users/vlad/Developer/cloudlysis/aggregate/src/server/mod.rs#L402-L442)). These need to become placement-aware.
### 3) Tenant Drain (Per Tenant)
Aggregate MUST provide a per-tenant drain mechanism to support rebalancing:
- Stop accepting new commands for the tenant.
- Allow in-flight commands to finish (bounded wait), then report drained.
- Expose drain status per tenant (admin endpoint).
### 4) Rebalancing State Strategy
Aggregate persists snapshots locally (MDBX) and uses JetStream for events. To move a tenant:
- **Approach 1 (Snapshot migration)**: copy tenant snapshot DB/state to the target shard, then switch placement.
- **Approach 2 (Cold rehydrate)**: switch placement and let the target shard rebuild state by replaying events from JetStream; expect higher latency during warmup.
The system should support both, with the rebalancer selecting the strategy based on tenant size/SLO.
### 5) Metrics for Placement Decisions
Aggregate SHOULD expose:
- Per-tenant command rate, error rate
- In-flight commands by tenant
- Rehydrate time / snapshot hit ratio
- Storage size per tenant (if feasible)
---
## **Projection: Required Changes**
### 1) Expose Query API Upstream for Gateway
Projection has a working `QueryService` with tenant-scoped prefix scans ([uqf.rs](file:///Users/vlad/Developer/cloudlysis/projection/src/query/uqf.rs#L121-L162)) but it is not exposed via HTTP/gRPC (current HTTP routes are health/ready/metrics/info only: [projection/http/mod.rs](file:///Users/vlad/Developer/cloudlysis/projection/src/http/mod.rs#L102-L109)).
Projection MUST add one upstream API the Gateway can route to:
- `POST /query/{view_type}` (HTTP) accepting `x-tenant-id` and a UQF payload, returning `QueryResponse`.
- Or a gRPC query service (new proto) if gRPC is preferred end-to-end.
### 2) Tenant Placement Filtering (Independent Scaling)
Projection MUST support running in one of these modes:
- **Multi-tenant shard**: consumes all tenants (simple, less isolated).
- **Tenant-filtered shard (required for rebalancing)**:
- only consumes/serves queries for the tenants assigned to that shard
- rejects queries for unassigned tenants (consistent error semantics)
Implementation direction:
- Add a placement watcher similar to Runner's tenant filter ([runner/tenant_placement.rs](file:///Users/vlad/Developer/cloudlysis/runner/src/tenant_placement.rs#L8-L100)).
- Apply tenant filter to:
- event consumption subject filters (preferred), and
- query serving validation (always).
### 3) Drain + Warmup Endpoints
Projection SHOULD add:
- `/admin/drain?tenant_id=...` (stop consuming new events for that tenant, finish in-flight, flush checkpoints)
- `/admin/reload` (apply latest placement/config)
- Optional warmup status: whether the shard has caught up to JetStream tail for that tenant/view_types
### 4) Rebalancing Strategy for Projection
Projection can rebalance safely with “warm then cut over”:
- Assign tenant to the new projection shard while old shard still serves.
- New shard catches up (replay from JetStream, build view KV).
- Switch Gateway placement for query routing to new shard.
- Drain old shard for that tenant and optionally delete old tenant KV keys.
### 5) Metrics for Placement Decisions
Projection SHOULD expose:
- JetStream lag per tenant/view_type (tail minus checkpoint)
- Query latency and scan counts
- Storage size per tenant (if feasible)
---
## **Runner: Required Changes**
Runner already has:
- A tenant placement watcher capable of producing an allowlist ([tenant_placement.rs](file:///Users/vlad/Developer/cloudlysis/runner/src/tenant_placement.rs#L8-L100))
- Admin endpoints including drain/reload/config ([runner/http/mod.rs](file:///Users/vlad/Developer/cloudlysis/runner/src/http/mod.rs#L69-L86))
- Gateway client integration for aggregate command submission ([runner/gateway/mod.rs](file:///Users/vlad/Developer/cloudlysis/runner/src/gateway/mod.rs#L1-L47))
To support independent scalability + rebalancing, Runner needs the following.
### 1) Per-Tenant Drain (Not Only Global)
Runner's current drain is global (`/admin/drain` toggles a single draining flag). Runner MUST support draining a specific tenant:
- Stop acquiring new saga/effect work for the tenant.
- Allow in-flight work for the tenant to finish (bounded).
- Flush outbox for the tenant (or guarantee idempotency on handoff).
- Persist final checkpoints so another shard can continue without duplication beyond at-least-once bounds.
### 2) Placement-Enforced Work Acquisition
Runner MUST validate tenant assignment at the boundary where it:
- consumes JetStream messages (saga triggers, effect commands), and
- dispatches outbox work.
If a tenant is not assigned to the shard, Runner must not process its work.
### 3) Handoff Safety Rules for Rebalancing
Runner rebalancing should follow:
- New shard begins processing only after it is assigned the tenant.
- Old shard stops acquiring new work for that tenant, then drains.
- Idempotency remains correct across handoff using checkpoints and dedupe markers.
### 4) Metrics for Placement Decisions
Runner SHOULD expose:
- Outbox depth by tenant
- Work processing latency and retries by tenant/effect
- Schedule due items by tenant
- Consumer lag by tenant (if the consumption model supports per-tenant lag)
### 5) Auth Delivery Side Effects (Email/SMS/Push)
If the platform's AuthN flows require out-of-band delivery (password reset links, email verification, MFA codes), the Runner SHOULD be the standard place to execute those side effects:
- Define a stable effect interface for sending transactional emails (reset links, verification links, security alerts).
- Optionally add SMS/push providers later under the same effect contract.
This keeps the Gateway free of long-lived provider credentials and aligns with the existing “effects are executed by workers” pattern.
---
## **Gateway Integration Notes**
Once the above changes exist:
- Gateway routes per `(tenant_id, service_kind)` using independent placement maps.
- Gateway can implement “warm then cut over” rebalancing for Projection and Runner by switching only query/workflow routing after readiness conditions are met.
- Gateway can enforce consistent tenant validation, authn/authz, and error semantics at the edge even as placements move.
---
## **Gaps / Opportunities**
- **KV schema + ownership**: define the exact NATS KV bucket layout, key naming, revisioning rules, and who is allowed to write placement updates.
- **Rebalancer API**: define operator workflows (plan/apply/rollback), status reporting, and audit log requirements for placement changes.
- **Shard discovery**: define how shard endpoints are registered (static config vs KV directory entries) and how health is represented.
- **Consistency boundaries**: define rebalancing guarantees per service kind (projection can be warm-cutover; runner requires checkpoint handoff; aggregate requires single-writer and state availability).

425
gateway/prd.md Normal file

@@ -0,0 +1,425 @@
### 🧱 Component: Gateway
**Definition:**
The Gateway is the single ingress for the platform. It provides:
- **Tenant-aware routing** to the node services: **Aggregate** (write/commands), **Projection** (read/queries), and **Runner** (workflow/saga + effects admin).
- Centralized **authn** (password via Argon2 + Google OIDC; extensible to more providers) and **authz** (tenant-scoped RBAC).
- Cross-cutting concerns: request validation, rate limiting, observability, and consistent error semantics.
The Gateway is responsible for enforcing multi-tenancy at the edge: it treats `x-tenant-id` as the tenant selection signal, validates it against the caller identity, and routes requests to the correct tenant shard/node.
---
## **Context: Existing Nodes**
This PRD is based on the currently implemented node repositories:
- **Aggregate**: defines gRPC Command API `aggregate.gateway.v1.CommandService/SubmitCommand` in [aggregate.proto](file:///Users/vlad/Developer/cloudlysis/aggregate/proto/aggregate.proto#L1-L31). Aggregate's PRD explicitly expects the Gateway to route by `x-tenant-id` ([aggregate/prd.md](file:///Users/vlad/Developer/cloudlysis/aggregate/prd.md#L5-L12)).
- **Projection**: provides health/admin HTTP endpoints and implements an in-process UQF query engine as `QueryService` but does not currently expose it over HTTP/gRPC ([uqf.rs](file:///Users/vlad/Developer/cloudlysis/projection/src/query/uqf.rs#L8-L162)).
- **Runner**: uses a gRPC client to submit aggregate commands “through the gateway” (config key `aggregate_gateway_url`), propagating `x-tenant-id` as gRPC metadata ([GatewayClient](file:///Users/vlad/Developer/cloudlysis/runner/src/gateway/mod.rs#L1-L47), [OutboxRelay](file:///Users/vlad/Developer/cloudlysis/runner/src/outbox/relay.rs#L37-L110)).
- **Tenant placement**: there is precedent for **NATS JetStream KV** as a control plane for tenant placement/sharding (Runner tenant filter watcher: [tenant_placement.rs](file:///Users/vlad/Developer/cloudlysis/runner/src/tenant_placement.rs#L8-L100); Aggregate KV client helper: [swarm.rs](file:///Users/vlad/Developer/cloudlysis/aggregate/src/swarm.rs#L79-L227)). There is also a simple static mapping example in [gateway-routing.yaml](file:///Users/vlad/Developer/cloudlysis/aggregate/gateway-routing.yaml#L1-L3).
---
## **Problem Statement**
Clients (and internal workers like Runner) need a stable, secure entrypoint that:
- Authenticates identities (humans and services)
- Authorizes actions per tenant
- Routes requests to the correct node(s) for the selected tenant
- Provides consistent APIs independent of the underlying shard topology and service discovery
Without a Gateway, each node would need to re-implement auth, tenant enforcement, rate limiting, and topology discovery, increasing security risk and operational complexity.
---
## **Goals**
- Provide one entrypoint for **command submission** (Aggregate) and **query execution** (Projection), and an authenticated entrypoint for **workflow/admin actions** (Runner).
- Enforce tenant isolation using `x-tenant-id`:
- Validate tenant selection is allowed for the caller
- Prevent tenant spoofing
- Prioritize **independent scalability** of Aggregate, Projection, and Runner:
- Scale each service horizontally without requiring the others to scale
- Allow tenant assignments for each service to be rebalanced independently
- Support **authn**:
- Username/password with Argon2 password hashing
- Google OIDC login (future providers supported)
- Support **authz**:
- Tenant-scoped RBAC with explicit permissions
- Service identities for internal traffic (Runner → Gateway)
- Provide operational endpoints: `/health`, `/ready`, `/metrics`, config/routing introspection (admin-only).
---
## **Non-Goals**
- Implement the Aggregate/Projection/Runner business logic.
- Replace NATS JetStream as the event bus or the storage responsibilities of nodes.
- Provide a general-purpose API gateway for arbitrary upstreams; this Gateway is purpose-built for platform nodes.
- Provide UI/console; the Gateway only exposes APIs.
---
## **Primary Users**
- **External clients**: applications submitting commands and running queries.
- **Internal services**: Runner submitting commands on behalf of sagas.
- **Operators**: managing tenant placement and observing health/metrics.
---
## **Key Concepts**
### Tenant Selection and Enforcement
- `x-tenant-id` is the canonical tenant selector for all tenant-scoped requests.
- The Gateway MUST reject requests when:
- The endpoint is tenant-scoped and `x-tenant-id` is missing (unless explicitly configured as single-tenant default).
- The caller is not authorized for that tenant.
- The Gateway SHOULD normalize and validate tenant IDs using the same constraints the nodes already use (alphanumeric + `-` + `_`).
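For illustration, that constraint as a predicate (the length bound is an assumption):
```rust
/// Tenant id check matching the constraint above: ASCII alphanumerics plus
/// '-' and '_'; non-empty, with an assumed upper bound on length.
fn valid_tenant_id(id: &str) -> bool {
    !id.is_empty()
        && id.len() <= 64
        && id.bytes().all(|b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_')
}
```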
### Node Types and Traffic Classes
- **Aggregate (write path)**: synchronous command submission; returns events.
- **Projection (read path)**: query execution; returns query results; eventual consistency is expected.
- **Runner (workflow/admin path)**: operational endpoints for runner configuration, drain, reload, and diagnostics; access is admin-only.
### Tenant-Aware Routing
- Routing decision is primarily based on `tenant_id`, and secondarily on request kind (aggregate vs projection vs runner).
- The Gateway abstracts the topology: clients do not need to know which node hosts their tenant.
### Independent Scalability and Rebalancing
- Each service (Aggregate, Projection, Runner) can have its own tenant-to-shard placement. The Gateway resolves routing per `(tenant_id, service_kind)`.
- Rebalancing is defined as moving a tenant's assignment for a specific service from one shard to another with bounded disruption.
---
## **Functional Requirements**
### 1) Authentication (AuthN)
- **AuthN surface area**:
- Signup, signin, signout
- Forgot password, reset password
- MFA enrollment and MFA challenge (step-up)
- Google OIDC login (and future providers)
- Service identities (internal callers)
- **Password-based accounts**:
- Store passwords hashed with **Argon2id** using per-user random salts and parameters suitable for production.
- Signup MUST support email verification before the account becomes active (configurable per environment).
- Signin MUST support MFA when required by policy.
- Signout MUST revoke refresh tokens (and optionally maintain a short-lived access-token denylist only if needed).
- **Sessions and tokens**:
- Issue a short-lived access token and a refresh token with rotation (issuance sketched at the end of this section).
- Refresh tokens MUST be stored server-side (hashed at rest) to support revocation and rotation.
- Support both browser and API clients:
- Browser: refresh token in an HttpOnly cookie with CSRF protections.
- API clients: refresh token in an authorization header or secure client storage (no localStorage guidance in the PRD; implementation chooses).
- **OIDC (Google)**:
- Support Authorization Code flow with PKCE.
- Map OIDC identities to internal users; allow linking multiple providers per user.
- Future providers (e.g., GitHub, Azure AD) should fit the same model.
- **Service auth** (internal):
- Support service identities for Runner → Gateway and other future internal callers.
- Recommended approach: mTLS and/or signed JWTs with a `sub` of `service:<name>` plus explicit RBAC grants.
- **Forgot / reset password**:
- Forgot password MUST create a one-time reset token with an expiry and store only a hash of it.
- Reset password MUST verify the token, enforce password policy, rotate credentials, and revoke all refresh tokens for the user.
- Sending reset links/codes is a side effect; the Gateway SHOULD trigger it via the platform's effect execution path (Runner effect providers) rather than embedding SMTP credentials in the Gateway.
- **MFA**:
- Support TOTP (authenticator apps) as the default MFA method.
- Support recovery codes (one-time use) for account recovery.
- MFA enrollment MUST require a recent primary authentication (step-up).
- MFA challenges MUST be bound to an auth session and have short expiration.
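A sketch of access-token issuance with the `jsonwebtoken` dependency; the claim set (`sub`, `tid`, `exp`) and 15-minute TTL are assumptions, not mandated by this PRD:
```rust
use jsonwebtoken::{encode, EncodingKey, Header};
use serde::Serialize;

#[derive(Serialize)]
struct Claims {
    sub: String,         // principal id (user or service)
    tid: Option<String>, // tenant scope, when present
    exp: usize,          // unix expiry
}

fn issue_access_token(
    signing_secret: &[u8],
    sub: &str,
    tenant: Option<&str>,
    now_unix: usize,
) -> jsonwebtoken::errors::Result<String> {
    let claims = Claims {
        sub: sub.to_string(),
        tid: tenant.map(str::to_string),
        exp: now_unix + 15 * 60, // short-lived, per the requirement above
    };
    encode(&Header::default(), &claims, &EncodingKey::from_secret(signing_secret))
}
```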
### 2) Authorization (AuthZ / RBAC)
- RBAC entities:
- **User** (human identity)
- **Service** (machine identity)
- **Tenant**
- **Role** (set of permissions)
- **Assignment** (principal ↔ tenant ↔ role)
- Authorization checks:
- Command submission permissions: per tenant, optionally scoped by `aggregate_type`.
- Query permissions: per tenant, optionally scoped by `view_type`.
- Admin permissions: routing/config endpoints, runner admin passthrough, tenant placement changes.
### 3) Routing to Nodes
The Gateway MUST route to:
- **Aggregate nodes** for command submission.
- **Projection nodes** for query execution.
- **Runner nodes** for admin/ops passthrough.
Routing inputs:
- `tenant_id` (from `x-tenant-id` or request body for internal gRPC; header is authoritative for external HTTP).
- A routing table defining tenant → shard/node → service endpoint(s), where placement MAY differ per service kind.
Routing behavior:
- The Gateway MUST be able to hot-reload routing configuration without restart.
- The Gateway SHOULD support both:
- **Static config** (file-based mapping for development)
- **Dynamic config** (NATS KV-based control plane for production)
- The Gateway MUST support routing when placements are independent:
- `aggregate_placement[tenant_id] -> aggregate_shard_id`
- `projection_placement[tenant_id] -> projection_shard_id`
- `runner_placement[tenant_id] -> runner_shard_id`
- The Gateway SHOULD expose placement revisions and effective routing decisions for debugging (admin-only).
### 4) Public APIs (Initial)
The Gateway exposes two public surface areas:
#### Command Submission (Write)
- **gRPC**: implement `aggregate.gateway.v1.CommandService/SubmitCommand` for internal callers (Runner) and optional external clients.
- **HTTP**: provide a simple REST wrapper to allow browser and non-gRPC clients.
HTTP sketch:
- `POST /v1/commands/{aggregate_type}/{aggregate_id}`
- Headers: `Authorization`, `x-tenant-id`
- Body: JSON command payload
- Response: JSON containing events (mirrors the gRPC response shape)
#### Query Execution (Read)
Because Projection currently implements UQF query logic but does not expose it, the Gateway defines a stable API and routes to a Projection query endpoint once it exists.
HTTP sketch:
- `POST /v1/query/{view_type}`
- Headers: `Authorization`, `x-tenant-id`
- Body: `{ "uqf": "<json-string>" }`
- Response: `{ "mode": "find" | "count", ... }` compatible with Projection's `QueryResponse` shape.
### 5) Operational APIs
- `GET /health` and `GET /ready` for load balancers.
- `GET /metrics` for Prometheus/Victoria Metrics.
- Admin-only:
- `GET /admin/routing` (current effective routing table and revision)
- `POST /admin/routing/reload` (force reload; should still be safe if watcher exists)
- Runner passthrough under `/admin/runner/*` (authenticated + authorized)
### 6) AuthN Endpoints (HTTP)
The Gateway SHOULD expose a stable HTTP AuthN API (exact payloads may evolve; semantics should not):
- `POST /v1/auth/signup`
- `POST /v1/auth/signin`
- `POST /v1/auth/signout`
- `POST /v1/auth/refresh`
- `POST /v1/auth/forgot`
- `POST /v1/auth/reset`
- `POST /v1/auth/mfa/enroll/start`
- `POST /v1/auth/mfa/enroll/confirm`
- `POST /v1/auth/mfa/challenge`
- `POST /v1/auth/oidc/google/start`
- `GET /v1/auth/oidc/google/callback`
The Gateway MUST enforce rate limits on signin/forgot/reset and MUST apply abuse protections (generic error responses for account existence, IP/device throttling).
### 7) Admin IAM APIs (HTTP)
The Gateway MUST expose an admin-facing API surface for the Admin UI node to manage authentication + authorization:
- **Users**: create, read, update, disable, delete
- **Identities**: link/unlink OIDC identities, manage password credentials, enforce email verification status
- **Roles and Rights**: define permissions (rights), create/update roles, assign rights to roles
- **Assignments**: assign roles to principals (users/services) scoped to a tenant
- **Service Accounts**: create/rotate credentials for internal callers, assign tenant roles
- **MFA Admin Actions**: reset MFA for a user, revoke recovery codes, force re-enrollment
- **Sessions**: revoke refresh tokens for a user (global signout)
Endpoint sketch (admin-only, audited, paginated):
- `GET /v1/admin/iam/users`
- `POST /v1/admin/iam/users`
- `GET /v1/admin/iam/users/{user_id}`
- `PATCH /v1/admin/iam/users/{user_id}`
- `POST /v1/admin/iam/users/{user_id}/disable`
- `POST /v1/admin/iam/users/{user_id}/sessions/revoke`
- `POST /v1/admin/iam/users/{user_id}/mfa/reset`
- `GET /v1/admin/iam/rights`
- `POST /v1/admin/iam/rights`
- `GET /v1/admin/iam/roles`
- `POST /v1/admin/iam/roles`
- `GET /v1/admin/iam/roles/{role_id}`
- `PATCH /v1/admin/iam/roles/{role_id}`
- `POST /v1/admin/iam/roles/{role_id}/rights`
- `GET /v1/admin/iam/assignments`
- `POST /v1/admin/iam/assignments`
- `DELETE /v1/admin/iam/assignments/{assignment_id}`
Tenant scoping rules:
- Tenant-scoped operations MUST require `x-tenant-id` and apply within that tenant (role assignments, tenant membership, tenant admin).
- Platform-scoped operations MUST NOT depend on `x-tenant-id` (right/permission catalog, platform admins, global user search).
All admin IAM endpoints MUST require strong authorization (platform admin or tenant admin depending on the resource) and MUST produce an immutable audit trail (who changed what, from where, and when).
---
## **Non-Functional Requirements**
- **Security**
- Reject requests missing tenant context when required.
- Do not trust `x-tenant-id` unless it is authorized by the caller identity.
- Rate limit authentication endpoints and command submission endpoints.
- Ensure secrets never appear in logs (tokens, OIDC codes, passwords).
- Enforce secure defaults for sessions:
- HttpOnly + Secure cookies where applicable, explicit CSRF protections for browser flows.
- Access token TTLs and refresh token rotation with revocation.
- Account lockout / progressive throttling for credential stuffing.
- Require key management and rotation:
- JWT signing keys MUST support rotation; old keys remain valid only for bounded overlap.
- Password reset tokens, email verification tokens, and refresh tokens MUST be stored as hashes.
- Require transport security:
- mTLS between Gateway and internal nodes (or an equivalent, explicit service-to-service auth boundary).
- Produce auditable, immutable logs for admin IAM actions and tenant placement changes.
- **Reliability**
- Timeouts for upstream calls; bounded retries only when safe (idempotency key present).
- Circuit breaking per upstream endpoint.
- Graceful degradation when routing config control plane is temporarily unavailable (serve last known good config).
- **Observability**
- Correlate requests with `request_id` and `trace_id`.
- Emit structured logs and Prometheus metrics (request counts, latency histograms, auth failures, upstream errors).
- Emit security signals (failed signins, MFA failures, suspicious IP/device patterns) suitable for alerting.
- **Performance**
- Minimize per-request allocations; use connection pools for upstreams.
- Cache routing decisions keyed by `(tenant_id, service_kind)` with small TTL and invalidation on routing config change.
- **Compatibility**
- Support single-tenant mode (empty tenant id) for development and early environments, without changing client code.
- Define API versioning rules and a consistent error envelope for HTTP APIs.
---
## **Proposed Architecture**
### High-Level Flow
```
Client / Runner
|
| (Authorization, x-tenant-id)
v
Gateway
| 1) AuthN (password/OIDC/service)
| 2) AuthZ (RBAC per tenant + permission)
| 3) Tenant routing (tenant_id -> node -> endpoint)
v
Aggregate / Projection / Runner nodes
```
### Components Inside the Gateway
- **API Layer**
- HTTP server for REST endpoints
- gRPC server implementing `aggregate.gateway.v1.CommandService` for Runner compatibility
- **Identity Layer**
- Credential verification (Argon2)
- OIDC provider integration (Google)
- Token issuance and verification (JWT access + refresh token rotation)
- **Authorization Layer**
- RBAC policy evaluation for each request
- Tenant membership validation for `x-tenant-id`
- **Routing Layer**
- Routing config loader: file + NATS KV watcher
- Routing decision: `(tenant_id, service_kind) -> endpoint` with independent placement per service kind
- Health-aware endpoint selection (optional phase): avoid unhealthy endpoints when multiple replicas exist
- **Upstream Clients**
- Aggregate upstream: gRPC client (forward SubmitCommand)
- Projection upstream: HTTP or gRPC client (forward Query)
- Runner upstream: HTTP client for admin passthrough (restricted)
### Routing Config Model (Recommended)
Represent routing as two layers:
- **Placement maps** (tenant → shard), per service kind:
- `aggregate_placement[tenant_id] -> aggregate_shard_id`
- `projection_placement[tenant_id] -> projection_shard_id`
- `runner_placement[tenant_id] -> runner_shard_id`
- **Shard directory** (shard → endpoints), per service kind:
- `aggregate_shards[aggregate_shard_id] -> { grpc_endpoint, http_endpoint, admin_endpoint? }`
- `projection_shards[projection_shard_id] -> { http_endpoint, admin_endpoint? }`
- `runner_shards[runner_shard_id] -> { http_endpoint, admin_endpoint }`
This supports both:
- Static YAML/JSON config files for local runs.
- Dynamic updates via NATS KV:
- Keys like `aggregate/tenants/<tenant_id>`, `projection/tenants/<tenant_id>`, `runner/tenants/<tenant_id>`
- Keys like `aggregate/shards/<shard_id>`, `projection/shards/<shard_id>`, `runner/shards/<shard_id>`
The Gateway keeps:
- **Last known good** routing config
- A **revision** number (KV revision or monotonic local revision) for observability/debugging
### Rebalancing Mechanism (Control Plane)
Rebalancing is driven by a small control plane that updates placement and coordinates safe handoff:
- **Placement Store**: NATS JetStream KV buckets holding placement maps and shard directory entries.
- **Rebalancer** (operator-driven initially, automated later):
- Reads load signals (Gateway/Node metrics) and proposes moves: `(service_kind, tenant_id, from_shard, to_shard)`
- Applies moves by writing to KV and orchestrating drain/warmup as needed
- Provides audit trail: who moved what, when, and why
Rebalance flow (per service kind):
- Update placement (KV) to include the target shard assignment with a revision.
- Ensure the target shard is ready for the tenant (service-specific warmup).
- Drain the tenant on the old shard (stop accepting new work for that tenant, finish in-flight).
- Finalize by removing/overwriting the old assignment and triggering config reload/watchers.
Service-specific notes:
- **Projection**: can rebuild from JetStream; rebalancing can be “cold” (new shard catches up) with minimal coordination beyond tenant filtering.
- **Runner**: must stop acquiring new work for a tenant, flush outbox dispatch, and persist checkpoints before handing off.
- **Aggregate**: must ensure single-writer semantics per aggregate instance; tenant drain should block new commands during handoff, and the target shard must have state (snapshot transfer) or accept a cold rehydrate from JetStream.
---
## **Error Semantics**
- Auth failures: `401` (unauthenticated) or `403` (forbidden)
- Tenant header issues:
- Missing `x-tenant-id` on tenant-scoped routes: `400`
- Invalid tenant format: `400`
- Tenant not permitted for principal: `403`
- Routing failures:
- Unknown tenant assignment: `503` with retriable hint
- No healthy upstream endpoints: `503`
- Upstream errors:
- Preserve upstream error category when safe; normalize into a consistent error envelope.
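A sketch of what a consistent envelope could look like; field names are assumptions, the requirement is only that one envelope is used everywhere:
```rust
use serde::Serialize;

#[derive(Serialize)]
struct ErrorEnvelope {
    code: String,       // stable machine-readable code, e.g. "tenant_not_permitted"
    message: String,    // human-readable summary; never includes secrets
    retriable: bool,    // drives client retry behavior for the 503 cases above
    request_id: String, // correlates the response with logs and traces
}
```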
---
## **Rollout Plan**
Phase 1 (Minimum viable ingress)
- Implement tenant-aware routing for Aggregate command submission.
- Implement gRPC `SubmitCommand` compatible with Runner.
- Add HTTP wrapper for command submission.
- Introduce basic authn/authz (service identity + a minimal RBAC model).
Phase 2 (Read path + OIDC)
- Add query API and route to Projection query endpoint (Projection may need an exposed endpoint).
- Add Google OIDC login and account linking.
- Harden RBAC and permissions by resource type (`aggregate_type`, `view_type`).
Phase 3 (Operations + topology)
- NATS KV routing config watcher (hot reload).
- Admin APIs for routing inspection and controlled updates.
- Health-aware routing and per-tenant rate limits.
- Introduce placement maps per service kind (independent scaling).
- Introduce a rebalancer workflow (manual first) to move tenant placements safely.
---
## **Gaps / Opportunities**
- **Tenant lifecycle APIs**: tenant creation, tenant metadata, domain verification, invite flows, default roles, and bootstrap of the first tenant admin.
- **API conventions**: standard error envelope, pagination/cursors, request IDs, idempotency semantics for command submission retries.
- **Identity hardening**: password policy, breached-password checks, device/session management, step-up authentication rules, and admin break-glass procedures.
- **SSO / enterprise**: SCIM provisioning and additional OIDC/SAML providers as a future track.
- **Audit & compliance**: immutable audit log schema, export/retention policies, and per-tenant data access trails.
- **Rebalancer safety**: explicit two-phase cutover semantics (warmup readiness gates + drain completion signals) with operator-visible status.

1562
gateway/src/admin_iam.rs Normal file


@@ -0,0 +1,790 @@
use axum::extract::Query;
use axum::extract::State;
use axum::http::StatusCode;
use axum::Json;
use serde::Deserialize;
use serde::Serialize;
use std::time::Duration;
use crate::authz;
use crate::authz::AuthzRejection;
use crate::authz::Principal;
use crate::routing::ServiceKind;
use crate::storage::StorageError;
use crate::AppState;

pub fn router() -> axum::Router<AppState> {
    axum::Router::new()
        .route("/status", axum::routing::get(status))
        .route("/gates", axum::routing::get(gates))
        .route("/plans", axum::routing::get(list_plans))
        .route("/plan", axum::routing::post(create_plan))
        .route("/apply", axum::routing::post(apply_plan))
        .route("/rollback", axum::routing::post(rollback_plan))
}

#[derive(Debug, Deserialize)]
pub struct ResolveQuery {
    pub tenant_id: String,
    pub kind: String,
}

#[derive(Debug, Serialize)]
pub struct ResolveResponse {
    pub tenant_id: String,
    pub kind: ServiceKind,
    pub endpoint: String,
    pub revision: u64,
}

#[derive(Debug, Deserialize)]
struct TenantQuery {
    tenant_id: String,
}

#[derive(Debug, Serialize)]
struct StatusResponse {
    tenant_id: String,
    revision: u64,
    aggregate: Option<String>,
    projection: Option<String>,
    runner: Option<String>,
}

#[derive(Debug, Serialize)]
struct GatesResponse {
    tenant_id: String,
    aggregate_ready: bool,
    projection_ready: bool,
    runner_ready: bool,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
struct Stored<T> {
    v: u32,
    data: T,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
struct RebalancePlan {
    plan_id: String,
    tenant_id: String,
    kind: ServiceKind,
    from_endpoint: Option<String>,
    to_endpoint: Option<String>,
    status: String,
    actor_id: String,
    created_at_ms: i64,
    updated_at_ms: i64,
}

#[derive(Debug, Deserialize)]
struct CreatePlanBody {
    tenant_id: String,
    kind: String,
    to_endpoint: Option<String>,
}

#[derive(Debug, Deserialize)]
struct PlanActionBody {
    plan_id: String,
    tenant_id: String,
}

#[derive(Debug, Deserialize)]
struct ListPlansQuery {
    tenant_id: Option<String>,
    limit: Option<usize>,
}

pub async fn resolve(
    State(state): State<AppState>,
    principal: Principal,
    Query(q): Query<ResolveQuery>,
) -> Result<Json<ResolveResponse>, AuthzRejection> {
    require_platform_admin(&state.storage, &principal.user_id).await?;
    let kind = parse_kind(&q.kind).ok_or(AuthzRejection::Internal)?;
    let table = state.routing.snapshot().await;
    let endpoint = table.resolve(&q.tenant_id, kind).map_err(|e| match e {
        crate::routing::RoutingError::UnknownTenant => AuthzRejection::NotFound,
        crate::routing::RoutingError::MissingShard | crate::routing::RoutingError::EmptyShard => {
            AuthzRejection::Internal
        }
    })?;
    Ok(Json(ResolveResponse {
        tenant_id: q.tenant_id,
        kind,
        endpoint,
        revision: table.revision,
    }))
}

async fn status(
    State(state): State<AppState>,
    principal: Principal,
    Query(q): Query<TenantQuery>,
) -> Result<Json<StatusResponse>, AuthzRejection> {
    require_platform_admin(&state.storage, &principal.user_id).await?;
    let table = state.routing.snapshot().await;
    let aggregate = table.resolve(&q.tenant_id, ServiceKind::Aggregate).ok();
    let projection = table.resolve(&q.tenant_id, ServiceKind::Projection).ok();
    let runner = table.resolve(&q.tenant_id, ServiceKind::Runner).ok();
    Ok(Json(StatusResponse {
        tenant_id: q.tenant_id,
        revision: table.revision,
        aggregate,
        projection,
        runner,
    }))
}

/// Readiness gates for rebalancing: resolve each service kind's endpoint for
/// the tenant and probe it, reporting per-kind readiness.
async fn gates(
    State(state): State<AppState>,
    principal: Principal,
    Query(q): Query<TenantQuery>,
) -> Result<Json<GatesResponse>, AuthzRejection> {
    require_platform_admin(&state.storage, &principal.user_id).await?;
    let projection_endpoint = state
        .routing
        .resolve(&q.tenant_id, ServiceKind::Projection)
        .await
        .ok();
    let runner_endpoint = state
        .routing
        .resolve(&q.tenant_id, ServiceKind::Runner)
        .await
        .ok();
    let aggregate_endpoint = state
        .routing
        .resolve(&q.tenant_id, ServiceKind::Aggregate)
        .await
        .ok();
    let projection_ready = if let Some(ep) = projection_endpoint {
        projection_gate_ready(&ep, &q.tenant_id)
            .await
            .unwrap_or(false)
    } else {
        false
    };
    let runner_ready = if let Some(ep) = runner_endpoint {
        http_ready(&ep).await.unwrap_or(false)
    } else {
        false
    };
    let aggregate_ready = if let Some(ep) = aggregate_endpoint {
        aggregate_ready(&ep).await.unwrap_or(false)
    } else {
        false
    };
    Ok(Json(GatesResponse {
        tenant_id: q.tenant_id,
        aggregate_ready,
        projection_ready,
        runner_ready,
    }))
}

async fn http_ready(endpoint: &str) -> Result<bool, AuthzRejection> {
    let url = format!("{}/ready", endpoint.trim_end_matches('/'));
    let client = crate::upstream::http_client();
    let resp = tokio::time::timeout(Duration::from_secs(2), client.get(url).send())
        .await
        .map_err(|_| AuthzRejection::Internal)?
        .map_err(|_| AuthzRejection::Internal)?;
    Ok(resp.status().is_success())
}

async fn aggregate_ready(endpoint: &str) -> Result<bool, AuthzRejection> {
    // Aggregate endpoints are typically gRPC (:50051); probe the conventional
    // HTTP admin port instead.
    if endpoint.contains(":50051") {
        let http_ep = endpoint.replace(":50051", ":8080");
        return http_ready(&http_ep).await;
    }
    http_ready(endpoint).await
}

/// Projection is ready for cutover when its ready gauge is set and the
/// tenant's max lag is at or below `GATEWAY_REBALANCE_PROJECTION_MAX_LAG`.
async fn projection_gate_ready(endpoint: &str, tenant_id: &str) -> Result<bool, AuthzRejection> {
    let url = format!("{}/metrics", endpoint.trim_end_matches('/'));
    let client = crate::upstream::http_client();
    let resp = tokio::time::timeout(Duration::from_secs(2), client.get(url).send())
        .await
        .map_err(|_| AuthzRejection::Internal)?
        .map_err(|_| AuthzRejection::Internal)?;
    if !resp.status().is_success() {
        return Ok(false);
    }
    let text = resp.text().await.map_err(|_| AuthzRejection::Internal)?;
    let ready = parse_prom_gauge(&text, "projection_ready").unwrap_or(0.0) >= 1.0;
    if !ready {
        return Ok(false);
    }
    let max_lag = parse_projection_max_lag(&text, tenant_id).unwrap_or(u64::MAX);
    let threshold = std::env::var("GATEWAY_REBALANCE_PROJECTION_MAX_LAG")
        .ok()
        .and_then(|v| v.parse::<u64>().ok())
        .unwrap_or(0);
    Ok(max_lag <= threshold)
}
fn parse_prom_gauge(metrics: &str, name: &str) -> Option<f64> {
for line in metrics.lines() {
let line = line.trim();
if line.starts_with('#') || line.is_empty() {
continue;
}
        if !line.contains('{') {
            // Match the metric name exactly; a prefix test would also match
            // e.g. `projection_ready_total`.
            let mut it = line.split_whitespace();
            if it.next() == Some(name) {
                return it.next()?.parse::<f64>().ok();
            }
        }
}
None
}
fn parse_projection_max_lag(metrics: &str, tenant_id: &str) -> Option<u64> {
let mut max: Option<u64> = None;
for line in metrics.lines() {
let line = line.trim();
if !line.starts_with("projection_lag{") {
continue;
}
if !line.contains(&format!("tenant_id=\"{}\"", tenant_id)) {
continue;
}
let value = line
.split_whitespace()
.nth(1)
.and_then(|v| v.parse::<u64>().ok())?;
max = Some(max.map(|m| m.max(value)).unwrap_or(value));
}
max
}
fn parse_kind(kind: &str) -> Option<ServiceKind> {
match kind.trim().to_ascii_lowercase().as_str() {
"aggregate" => Some(ServiceKind::Aggregate),
"projection" => Some(ServiceKind::Projection),
"runner" => Some(ServiceKind::Runner),
_ => None,
}
}
async fn require_platform_admin(
storage: &crate::storage::GatewayStorage,
principal_id: &str,
) -> Result<(), AuthzRejection> {
authz::ensure_allowed(storage, principal_id, "*", "iam.platform_admin").await
}
async fn create_plan(
State(state): State<AppState>,
principal: Principal,
Json(body): Json<CreatePlanBody>,
) -> Result<Json<RebalancePlan>, AuthzRejection> {
require_platform_admin(&state.storage, &principal.user_id).await?;
if body.tenant_id.trim().is_empty() {
return Err(AuthzRejection::BadRequest);
}
let kind = parse_kind(&body.kind).ok_or(AuthzRejection::BadRequest)?;
let to_endpoint = body.to_endpoint.filter(|s| !s.trim().is_empty());
if to_endpoint.is_none() {
return Err(AuthzRejection::BadRequest);
}
let from_endpoint = state.routing.resolve(&body.tenant_id, kind).await.ok();
let plan_id = uuid::Uuid::new_v4().to_string();
let now_ms = unix_ms();
let plan = RebalancePlan {
plan_id: plan_id.clone(),
tenant_id: body.tenant_id.clone(),
kind,
from_endpoint,
to_endpoint,
status: "planned".to_string(),
actor_id: principal.user_id,
created_at_ms: now_ms,
updated_at_ms: now_ms,
};
let key = plan_key(&plan.tenant_id, &plan.plan_id);
state
.storage
.audit_index
.create(
&key,
encode_stored(&plan).map_err(|_| AuthzRejection::Internal)?,
)
.await
.map_err(|e| match e {
StorageError::AlreadyExists => AuthzRejection::Conflict,
_ => AuthzRejection::Internal,
})?;
Ok(Json(plan))
}
async fn apply_plan(
State(state): State<AppState>,
principal: Principal,
Json(body): Json<PlanActionBody>,
) -> Result<StatusCode, AuthzRejection> {
require_platform_admin(&state.storage, &principal.user_id).await?;
transition_plan_status(&state, &body.tenant_id, &body.plan_id, "apply_requested").await?;
Ok(StatusCode::NO_CONTENT)
}
async fn rollback_plan(
State(state): State<AppState>,
principal: Principal,
Json(body): Json<PlanActionBody>,
) -> Result<StatusCode, AuthzRejection> {
require_platform_admin(&state.storage, &principal.user_id).await?;
transition_plan_status(&state, &body.tenant_id, &body.plan_id, "rollback_requested").await?;
Ok(StatusCode::NO_CONTENT)
}
async fn list_plans(
State(state): State<AppState>,
principal: Principal,
Query(q): Query<ListPlansQuery>,
) -> Result<Json<Vec<RebalancePlan>>, AuthzRejection> {
require_platform_admin(&state.storage, &principal.user_id).await?;
let prefix = match &q.tenant_id {
Some(t) => format!("v1/rebalance/plans/{}/", t.trim()),
None => "v1/rebalance/plans/".to_string(),
};
let mut keys = state
.storage
.audit_index
.list_keys(&prefix)
.await
.map_err(|_| AuthzRejection::Internal)?;
keys.sort();
keys.reverse();
let limit = q.limit.unwrap_or(50).min(200);
let mut out = Vec::new();
for key in keys.into_iter().take(limit) {
let entry = state
.storage
.audit_index
.get(&key)
.await
.map_err(|_| AuthzRejection::Internal)?;
let Some(entry) = entry else {
continue;
};
let plan: RebalancePlan =
decode_stored(&entry.value).map_err(|_| AuthzRejection::Internal)?;
out.push(plan);
}
Ok(Json(out))
}
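// Optimistic concurrency: re-read the plan and retry the compare-and-swap
// update on revision mismatch, giving up after 10 attempts.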
async fn transition_plan_status(
state: &AppState,
tenant_id: &str,
plan_id: &str,
next_status: &str,
) -> Result<(), AuthzRejection> {
let key = plan_key(tenant_id, plan_id);
for _ in 0..10 {
let entry = state
.storage
.audit_index
.get(&key)
.await
.map_err(|_| AuthzRejection::Internal)?
.ok_or(AuthzRejection::NotFound)?;
let mut plan: Stored<RebalancePlan> =
serde_json::from_slice(&entry.value).map_err(|_| AuthzRejection::Internal)?;
plan.data.status = next_status.to_string();
plan.data.updated_at_ms = unix_ms();
let payload = serde_json::to_vec(&plan).map_err(|_| AuthzRejection::Internal)?;
match state
.storage
.audit_index
.update(&key, entry.revision, payload)
.await
{
Ok(_) => return Ok(()),
Err(StorageError::CasMismatch) => continue,
Err(_) => return Err(AuthzRejection::Internal),
}
}
Err(AuthzRejection::Internal)
}
fn plan_key(tenant_id: &str, plan_id: &str) -> String {
format!("v1/rebalance/plans/{tenant_id}/{plan_id}")
}
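// Stored<T> envelope: records persist as {"v": SCHEMA_VERSION, "data": ...}
// so the on-disk schema can evolve without rewriting keys.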
fn encode_stored<T: Serialize>(data: &T) -> Result<Vec<u8>, StorageError> {
serde_json::to_vec(&Stored {
v: crate::storage::SCHEMA_VERSION,
data,
})
.map_err(|e| StorageError::Serde(e.to_string()))
}
fn decode_stored<T: for<'de> Deserialize<'de>>(bytes: &[u8]) -> Result<T, StorageError> {
let stored: Stored<T> =
serde_json::from_slice(bytes).map_err(|e| StorageError::Serde(e.to_string()))?;
Ok(stored.data)
}
fn unix_ms() -> i64 {
std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_millis() as i64
}
#[cfg(test)]
mod tests {
use super::*;
use crate::authn;
use std::collections::HashMap;
use std::sync::Arc;
use tower::util::ServiceExt;
async fn test_app_with_routing(cfg: crate::routing::RoutingConfig) -> (axum::Router, AppState) {
let metrics = crate::observability::init_metrics_for_tests();
let source: Arc<dyn crate::routing::RoutingSource> =
Arc::new(crate::routing::FixedSource::new(cfg));
let routing = crate::routing::RouterState::new(source).await.unwrap();
let storage = crate::storage::GatewayStorage::new_in_memory();
let authn_cfg = crate::authn::AuthnConfig::for_tests();
let state = crate::AppState {
metrics,
routing,
storage,
authn: authn_cfg,
};
let app = crate::app(state.clone());
(app, state)
}
async fn signup_and_token(app: &axum::Router, cfg: &authn::AuthnConfig) -> (String, String) {
let response = app
.clone()
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/v1/auth/signup")
.header("content-type", "application/json")
.body(axum::body::Body::from(
r#"{"email":"a@b.com","password":"password123"}"#,
))
.unwrap(),
)
.await
.unwrap();
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
.await
.unwrap();
let created: crate::authn::AuthResponse = serde_json::from_slice(&body).unwrap();
let claims = cfg.verify_access_token(&created.access_token).unwrap();
(created.access_token, claims.sub)
}
#[tokio::test]
async fn resolve_requires_platform_admin() {
let cfg = crate::routing::RoutingConfig::empty();
let (app, state) = test_app_with_routing(cfg).await;
let (token, user_id) = signup_and_token(&app, &state.authn).await;
let resp = app
.clone()
.oneshot(
axum::http::Request::builder()
.method("GET")
.uri("/admin/routing/resolve?tenant_id=t1&kind=aggregate")
.header("authorization", format!("Bearer {token}"))
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(resp.status(), axum::http::StatusCode::FORBIDDEN);
crate::authz::put_role(
&state.storage,
"role-platform-admin",
vec!["iam.platform_admin".to_string()],
)
.await
.unwrap();
crate::authz::assign_role(&state.storage, "*", &user_id, "role-platform-admin")
.await
.unwrap();
let resp = app
.oneshot(
axum::http::Request::builder()
.method("GET")
.uri("/admin/routing/resolve?tenant_id=t1&kind=aggregate")
.header("authorization", format!("Bearer {token}"))
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(resp.status(), axum::http::StatusCode::NOT_FOUND);
}
#[tokio::test]
async fn status_includes_revision() {
let cfg = crate::routing::RoutingConfig {
revision: 42,
aggregate_placement: HashMap::new(),
projection_placement: HashMap::new(),
runner_placement: HashMap::new(),
aggregate_shards: HashMap::new(),
projection_shards: HashMap::new(),
runner_shards: HashMap::new(),
};
let (app, state) = test_app_with_routing(cfg).await;
let (token, user_id) = signup_and_token(&app, &state.authn).await;
crate::authz::put_role(
&state.storage,
"role-platform-admin",
vec!["iam.platform_admin".to_string()],
)
.await
.unwrap();
crate::authz::assign_role(&state.storage, "*", &user_id, "role-platform-admin")
.await
.unwrap();
let resp = app
.oneshot(
axum::http::Request::builder()
.method("GET")
.uri("/admin/rebalance/status?tenant_id=t1")
.header("authorization", format!("Bearer {token}"))
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(resp.status(), axum::http::StatusCode::OK);
let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
.await
.unwrap();
let value: serde_json::Value = serde_json::from_slice(&body).unwrap();
assert_eq!(value.get("revision").and_then(|v| v.as_u64()).unwrap(), 42);
}
#[tokio::test]
async fn gates_prevent_cutover_when_projection_not_ready_or_lagging() {
let metrics_not_ready = axum::Router::new().route(
"/metrics",
axum::routing::get(|| async { (axum::http::StatusCode::OK, "projection_ready 0\n") }),
);
let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
let addr = listener.local_addr().unwrap();
tokio::spawn(async move {
axum::serve(listener, metrics_not_ready).await.unwrap();
});
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
let endpoint = format!("http://{}", addr);
let cfg = crate::routing::RoutingConfig {
revision: 1,
aggregate_placement: HashMap::new(),
projection_placement: HashMap::from([("tenant-a".to_string(), "p".to_string())]),
runner_placement: HashMap::new(),
aggregate_shards: HashMap::new(),
projection_shards: HashMap::from([("p".to_string(), vec![endpoint])]),
runner_shards: HashMap::new(),
};
let (app, state) = test_app_with_routing(cfg).await;
let (token, user_id) = signup_and_token(&app, &state.authn).await;
crate::authz::put_role(
&state.storage,
"role-platform-admin",
vec!["iam.platform_admin".to_string()],
)
.await
.unwrap();
crate::authz::assign_role(&state.storage, "*", &user_id, "role-platform-admin")
.await
.unwrap();
let resp = app
.clone()
.oneshot(
axum::http::Request::builder()
.method("GET")
.uri("/admin/rebalance/gates?tenant_id=tenant-a")
.header("authorization", format!("Bearer {token}"))
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(resp.status(), axum::http::StatusCode::OK);
let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
.await
.unwrap();
let value: serde_json::Value = serde_json::from_slice(&body).unwrap();
assert!(!value
.get("projection_ready")
.and_then(|v| v.as_bool())
.unwrap());
let metrics_lagging = axum::Router::new().route(
"/metrics",
axum::routing::get(|| async {
(
axum::http::StatusCode::OK,
"projection_ready 1\nprojection_lag{tenant_id=\"tenant-a\"} 5\n",
)
}),
);
let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
let addr = listener.local_addr().unwrap();
tokio::spawn(async move {
axum::serve(listener, metrics_lagging).await.unwrap();
});
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
let endpoint = format!("http://{}", addr);
std::env::set_var("GATEWAY_REBALANCE_PROJECTION_MAX_LAG", "0");
let cfg = crate::routing::RoutingConfig {
revision: 2,
aggregate_placement: HashMap::new(),
projection_placement: HashMap::from([("tenant-a".to_string(), "p".to_string())]),
runner_placement: HashMap::new(),
aggregate_shards: HashMap::new(),
projection_shards: HashMap::from([("p".to_string(), vec![endpoint])]),
runner_shards: HashMap::new(),
};
let (app, state) = test_app_with_routing(cfg).await;
crate::authz::put_role(
&state.storage,
"role-platform-admin",
vec!["iam.platform_admin".to_string()],
)
.await
.unwrap();
crate::authz::assign_role(&state.storage, "*", &user_id, "role-platform-admin")
.await
.unwrap();
let resp = app
.oneshot(
axum::http::Request::builder()
.method("GET")
.uri("/admin/rebalance/gates?tenant_id=tenant-a")
.header("authorization", format!("Bearer {token}"))
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(resp.status(), axum::http::StatusCode::OK);
let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
.await
.unwrap();
let value: serde_json::Value = serde_json::from_slice(&body).unwrap();
assert!(!value
.get("projection_ready")
.and_then(|v| v.as_bool())
.unwrap());
}
#[tokio::test]
async fn plan_endpoints_require_platform_admin_and_persist_plans() {
let cfg = crate::routing::RoutingConfig::empty();
let (app, state) = test_app_with_routing(cfg).await;
let (token, user_id) = signup_and_token(&app, &state.authn).await;
let forbidden = app
.clone()
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/admin/rebalance/plan")
.header("authorization", format!("Bearer {token}"))
.header("content-type", "application/json")
.body(axum::body::Body::from(
r#"{"tenant_id":"tenant-a","kind":"projection","to_endpoint":"http://p"}"#,
))
.unwrap(),
)
.await
.unwrap();
assert_eq!(forbidden.status(), axum::http::StatusCode::FORBIDDEN);
crate::authz::put_role(
&state.storage,
"role-platform-admin",
vec!["iam.platform_admin".to_string()],
)
.await
.unwrap();
crate::authz::assign_role(&state.storage, "*", &user_id, "role-platform-admin")
.await
.unwrap();
let created = app
.clone()
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/admin/rebalance/plan")
.header("authorization", format!("Bearer {token}"))
.header("content-type", "application/json")
.body(axum::body::Body::from(
r#"{"tenant_id":"tenant-a","kind":"projection","to_endpoint":"http://p"}"#,
))
.unwrap(),
)
.await
.unwrap();
assert_eq!(created.status(), axum::http::StatusCode::OK);
let body = axum::body::to_bytes(created.into_body(), usize::MAX)
.await
.unwrap();
let plan: serde_json::Value = serde_json::from_slice(&body).unwrap();
let plan_id = plan.get("plan_id").and_then(|v| v.as_str()).unwrap();
let listed = app
.oneshot(
axum::http::Request::builder()
.method("GET")
.uri("/admin/rebalance/plans?tenant_id=tenant-a&limit=10")
.header("authorization", format!("Bearer {token}"))
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(listed.status(), axum::http::StatusCode::OK);
let body = axum::body::to_bytes(listed.into_body(), usize::MAX)
.await
.unwrap();
let plans: serde_json::Value = serde_json::from_slice(&body).unwrap();
assert!(plans
.as_array()
.unwrap()
.iter()
.any(|p| p.get("plan_id").and_then(|v| v.as_str()) == Some(plan_id)));
}
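    #[test]
    fn parse_prom_helpers_read_gauge_and_per_tenant_lag() {
        // Sketch test for the plain-text Prometheus parsing helpers above:
        // the gauge is read from the bare sample line, and the lag is the max
        // over samples labelled with the requested tenant only.
        let text = "# HELP projection_ready gate\nprojection_ready 1\nprojection_lag{tenant_id=\"tenant-a\"} 5\nprojection_lag{tenant_id=\"tenant-a\"} 3\nprojection_lag{tenant_id=\"tenant-b\"} 9\n";
        assert_eq!(parse_prom_gauge(text, "projection_ready"), Some(1.0));
        assert_eq!(parse_projection_max_lag(text, "tenant-a"), Some(5));
        assert_eq!(parse_projection_max_lag(text, "tenant-c"), None);
    }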
}

1707
gateway/src/authn.rs Normal file

File diff suppressed because it is too large

839
gateway/src/authz.rs Normal file

@@ -0,0 +1,839 @@
use axum::extract::FromRef;
use axum::extract::FromRequestParts;
use axum::extract::Path;
use axum::extract::Request;
use axum::extract::State;
use axum::http::header;
use axum::http::request::Parts;
use axum::http::StatusCode;
use axum::response::IntoResponse;
use axum::response::Response;
use axum::routing::post;
use axum::Json;
use serde::Deserialize;
use serde::Serialize;
use serde_json::Value;
use thiserror::Error;
use crate::grpc;
use crate::storage::GatewayStorage;
use crate::storage::StorageError;
use crate::AppState;
pub fn router() -> axum::Router<AppState> {
axum::Router::new()
.route(
"/commands/:aggregate_type/:aggregate_id",
post(submit_command_stub),
)
.route("/query/:view_type", post(query_stub))
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Principal {
pub user_id: String,
pub session_id: String,
}
#[async_trait::async_trait]
impl<S> FromRequestParts<S> for Principal
where
S: Send + Sync,
AppState: FromRef<S>,
{
type Rejection = AuthzRejection;
async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
let auth_header = parts
.headers
.get(header::AUTHORIZATION)
.and_then(|v| v.to_str().ok())
.ok_or(AuthzRejection::Unauthorized)?;
let token = auth_header
.strip_prefix("Bearer ")
.ok_or(AuthzRejection::Unauthorized)?;
let app_state = AppState::from_ref(state);
let claims = app_state.authn.verify_access_token(token).map_err(|_| {
metrics::counter!("gateway_authn_token_verify_fail_total").increment(1);
AuthzRejection::Unauthorized
})?;
tracing::Span::current().record("principal_id", claims.sub.as_str());
Ok(Self {
user_id: claims.sub,
session_id: claims.session_id,
})
}
}
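// Tenant scoping comes from the x-tenant-id header. Values are limited to
// ASCII alphanumerics, '-' and '_' so they can be embedded safely in storage
// key paths and upstream URLs.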
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TenantId(pub String);
#[async_trait::async_trait]
impl<S> FromRequestParts<S> for TenantId
where
S: Send + Sync,
{
type Rejection = AuthzRejection;
async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
let raw = parts
.headers
.get("x-tenant-id")
.and_then(|v| v.to_str().ok())
.ok_or(AuthzRejection::MissingTenant)?;
let tenant = raw.trim();
if tenant.is_empty()
|| !tenant
.chars()
.all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
{
return Err(AuthzRejection::InvalidTenant);
}
tracing::Span::current().record("tenant_id", tenant);
Ok(TenantId(tenant.to_string()))
}
}
#[derive(Debug, Error)]
pub enum AuthzRejection {
#[error("unauthorized")]
Unauthorized,
#[error("bad request")]
BadRequest,
#[error("missing x-tenant-id")]
MissingTenant,
#[error("invalid x-tenant-id")]
InvalidTenant,
#[error("forbidden")]
Forbidden,
#[error("not found")]
NotFound,
#[error("conflict")]
Conflict,
#[error("internal error")]
Internal,
}
impl IntoResponse for AuthzRejection {
fn into_response(self) -> axum::response::Response {
match self {
AuthzRejection::Unauthorized => {
(StatusCode::UNAUTHORIZED, self.to_string()).into_response()
}
AuthzRejection::BadRequest => {
(StatusCode::BAD_REQUEST, self.to_string()).into_response()
}
AuthzRejection::MissingTenant | AuthzRejection::InvalidTenant => {
(StatusCode::BAD_REQUEST, self.to_string()).into_response()
}
AuthzRejection::Forbidden => (StatusCode::FORBIDDEN, self.to_string()).into_response(),
AuthzRejection::NotFound => (StatusCode::NOT_FOUND, self.to_string()).into_response(),
AuthzRejection::Conflict => (StatusCode::CONFLICT, self.to_string()).into_response(),
AuthzRejection::Internal => {
(StatusCode::INTERNAL_SERVER_ERROR, self.to_string()).into_response()
}
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RoleRecord {
pub role_id: String,
pub rights: Vec<String>,
}
#[derive(Debug, Deserialize)]
struct HttpCommandRequest {
command_id: Option<String>,
payload: Value,
metadata: Option<std::collections::HashMap<String, String>>,
}
#[derive(Debug, Serialize)]
struct HttpCommandResponse {
events: Vec<EventDto>,
}
#[derive(Debug, Serialize)]
struct EventDto {
event_id: String,
command_id: String,
aggregate_id: String,
aggregate_type: String,
version: u64,
event_type: String,
payload_json: String,
timestamp_rfc3339: String,
}
async fn submit_command_stub(
State(state): State<AppState>,
ctx: crate::RequestContext,
principal: Principal,
TenantId(tenant_id): TenantId,
Path((aggregate_type, aggregate_id)): Path<(String, String)>,
Json(body): Json<HttpCommandRequest>,
) -> Result<Json<HttpCommandResponse>, AuthzRejection> {
ensure_allowed(
&state.storage,
&principal.user_id,
&tenant_id,
"command.submit",
)
.await?;
let command_id = body
.command_id
.unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
let metadata = body.metadata.unwrap_or_default();
let request = grpc::proto::SubmitCommandRequest {
tenant_id: tenant_id.clone(),
command_id,
aggregate_id,
aggregate_type,
payload_json: body.payload.to_string(),
metadata,
};
let resp = grpc::submit_command_via_routing(&state.routing, request, &ctx)
.await
.map_err(|_| AuthzRejection::Internal)?;
let events = resp
.events
.into_iter()
.map(|e| EventDto {
event_id: e.event_id,
command_id: e.command_id,
aggregate_id: e.aggregate_id,
aggregate_type: e.aggregate_type,
version: e.version,
event_type: e.event_type,
payload_json: e.payload_json,
timestamp_rfc3339: e.timestamp_rfc3339,
})
.collect();
Ok(Json(HttpCommandResponse { events }))
}
async fn query_stub(
State(state): State<AppState>,
ctx: crate::RequestContext,
principal: Principal,
TenantId(tenant_id): TenantId,
Path(view_type): Path<String>,
Json(payload): Json<Value>,
) -> Result<Response, AuthzRejection> {
ensure_allowed(
&state.storage,
&principal.user_id,
&tenant_id,
"query.execute",
)
.await?;
let upstream = state
.routing
.resolve(&tenant_id, crate::routing::ServiceKind::Projection)
.await
.map_err(|_| AuthzRejection::Internal)?;
tracing::Span::current().record("upstream", upstream.as_str());
let url = format!("{}/v1/query/{}", upstream.trim_end_matches('/'), view_type);
let client = crate::upstream::http_client();
let resp = client
.post(url)
.header("x-tenant-id", tenant_id)
.header("x-correlation-id", ctx.correlation_id)
.header("traceparent", ctx.traceparent)
.json(&payload)
.send()
.await
.map_err(|_| AuthzRejection::Internal)?;
let status = StatusCode::from_u16(resp.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
let bytes = resp.bytes().await.map_err(|_| AuthzRejection::Internal)?;
let mut out = Response::new(axum::body::Body::from(bytes));
*out.status_mut() = status;
Ok(out)
}
pub async fn runner_admin_proxy(
State(state): State<AppState>,
ctx: crate::RequestContext,
principal: Principal,
TenantId(tenant_id): TenantId,
Path(path): Path<String>,
request: Request,
) -> Result<Response, AuthzRejection> {
ensure_allowed(
&state.storage,
&principal.user_id,
&tenant_id,
"runner.admin",
)
.await?;
let upstream = state
.routing
.resolve(&tenant_id, crate::routing::ServiceKind::Runner)
.await
.map_err(|_| AuthzRejection::Internal)?;
tracing::Span::current().record("upstream", upstream.as_str());
let mut url = format!(
"{}/admin/{}",
upstream.trim_end_matches('/'),
path.trim_start_matches('/')
);
if let Some(q) = request.uri().query() {
url.push('?');
url.push_str(q);
}
let method = request.method().clone();
let headers = request.headers().clone();
let body = axum::body::to_bytes(request.into_body(), usize::MAX)
.await
.map_err(|_| AuthzRejection::Internal)?;
let client = crate::upstream::http_client();
let mut req = client
.request(method, url)
.header("x-tenant-id", tenant_id)
.header("x-correlation-id", ctx.correlation_id)
.header("traceparent", ctx.traceparent)
.body(body);
    for (k, v) in headers.iter() {
        // Skip hop-by-hop headers; reqwest derives content-length from the body.
        if k == header::HOST || k == header::CONTENT_LENGTH || k == header::TRANSFER_ENCODING {
            continue;
        }
        req = req.header(k, v);
    }
let resp = req.send().await.map_err(|_| AuthzRejection::Internal)?;
let status = StatusCode::from_u16(resp.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
let bytes = resp.bytes().await.map_err(|_| AuthzRejection::Internal)?;
let mut out = Response::new(axum::body::Body::from(bytes));
*out.status_mut() = status;
Ok(out)
}
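// RBAC core: a principal is allowed when any role assigned to it for this
// tenant, or for the "*" wildcard tenant, grants the required right. Every
// allow/deny decision is recorded in gateway_authz_decisions_total.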
pub async fn ensure_allowed(
storage: &GatewayStorage,
principal_id: &str,
tenant_id: &str,
required_right: &str,
) -> Result<(), AuthzRejection> {
let mut roles = list_assigned_roles(storage, tenant_id, principal_id).await?;
roles.extend(list_assigned_roles(storage, "*", principal_id).await?);
if roles.is_empty() {
metrics::counter!(
"gateway_authz_decisions_total",
"tenant" => tenant_id.to_string(),
"right" => required_right.to_string(),
"result" => "deny"
)
.increment(1);
return Err(AuthzRejection::Forbidden);
}
for role_id in roles {
let key = role_key(&role_id);
let entry = storage
.roles
.get(&key)
.await
.map_err(|_| AuthzRejection::Internal)?;
let Some(entry) = entry else {
continue;
};
let role: RoleRecord = decode_stored(&entry.value).map_err(|_| AuthzRejection::Internal)?;
if role.rights.iter().any(|r| r == required_right) {
metrics::counter!(
"gateway_authz_decisions_total",
"tenant" => tenant_id.to_string(),
"right" => required_right.to_string(),
"result" => "allow"
)
.increment(1);
return Ok(());
}
}
metrics::counter!(
"gateway_authz_decisions_total",
"tenant" => tenant_id.to_string(),
"right" => required_right.to_string(),
"result" => "deny"
)
.increment(1);
Err(AuthzRejection::Forbidden)
}
async fn list_assigned_roles(
storage: &GatewayStorage,
tenant_id: &str,
principal_id: &str,
) -> Result<Vec<String>, AuthzRejection> {
let prefix = assignment_prefix(tenant_id, principal_id);
let keys = storage
.assignments
.list_keys(&prefix)
.await
.map_err(|_| AuthzRejection::Internal)?;
Ok(keys
.into_iter()
.filter_map(|k| k.rsplit('/').next().map(|s| s.to_string()))
.collect())
}
fn role_key(role_id: &str) -> String {
format!("v1/roles/{role_id}")
}
fn assignment_key(tenant_id: &str, principal_id: &str, role_id: &str) -> String {
format!("v1/assignments/{tenant_id}/{principal_id}/{role_id}")
}
fn assignment_prefix(tenant_id: &str, principal_id: &str) -> String {
format!("v1/assignments/{tenant_id}/{principal_id}/")
}
fn decode_stored<T: for<'de> Deserialize<'de>>(bytes: &[u8]) -> Result<T, StorageError> {
#[derive(Deserialize)]
struct Stored<T> {
data: T,
}
let stored: Stored<T> =
serde_json::from_slice(bytes).map_err(|e| StorageError::Serde(e.to_string()))?;
Ok(stored.data)
}
pub async fn put_role(
storage: &GatewayStorage,
role_id: &str,
rights: Vec<String>,
) -> Result<(), StorageError> {
let record = RoleRecord {
role_id: role_id.to_string(),
rights,
};
let payload = serde_json::to_vec(&serde_json::json!({
"v": crate::storage::SCHEMA_VERSION,
"data": record
}))
.map_err(|e| StorageError::Serde(e.to_string()))?;
storage.roles.put(&role_key(role_id), payload).await?;
Ok(())
}
pub async fn assign_role(
storage: &GatewayStorage,
tenant_id: &str,
principal_id: &str,
role_id: &str,
) -> Result<(), StorageError> {
storage
.assignments
.put(
&assignment_key(tenant_id, principal_id, role_id),
b"1".to_vec(),
)
.await?;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use crate::authn;
use std::sync::Arc;
use tower::util::ServiceExt;
async fn test_app() -> (axum::Router, AppState) {
let metrics = crate::observability::init_metrics_for_tests();
let routing = crate::routing::RouterState::new(Arc::new(crate::routing::FixedSource::new(
crate::routing::RoutingConfig::empty(),
)))
.await
.unwrap();
let storage = crate::storage::GatewayStorage::new_in_memory();
let authn_cfg = crate::authn::AuthnConfig::for_tests();
let state = crate::AppState {
metrics,
routing,
storage,
authn: authn_cfg,
};
let app = crate::app(state.clone());
(app, state)
}
async fn test_app_with_routing(cfg: crate::routing::RoutingConfig) -> (axum::Router, AppState) {
let metrics = crate::observability::init_metrics_for_tests();
let routing =
crate::routing::RouterState::new(Arc::new(crate::routing::FixedSource::new(cfg)))
.await
.unwrap();
let storage = crate::storage::GatewayStorage::new_in_memory();
let authn_cfg = crate::authn::AuthnConfig::for_tests();
let state = crate::AppState {
metrics,
routing,
storage,
authn: authn_cfg,
};
let app = crate::app(state.clone());
(app, state)
}
async fn signup_and_get_claims(
app: &axum::Router,
cfg: &authn::AuthnConfig,
) -> (String, authn::AccessClaims) {
let response = app
.clone()
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/v1/auth/signup")
.header("content-type", "application/json")
.body(axum::body::Body::from(
r#"{"email":"a@b.com","password":"password123"}"#,
))
.unwrap(),
)
.await
.unwrap();
assert_eq!(response.status(), StatusCode::OK);
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
.await
.unwrap();
let created: crate::authn::AuthResponse = serde_json::from_slice(&body).unwrap();
let claims = cfg.verify_access_token(&created.access_token).unwrap();
(created.access_token, claims)
}
#[tokio::test]
async fn missing_tenant_header_returns_400() {
let (app, state) = test_app().await;
let (token, claims) = signup_and_get_claims(&app, &state.authn).await;
put_role(
&state.storage,
"role-command",
vec!["command.submit".to_string()],
)
.await
.unwrap();
assign_role(&state.storage, "tenant-a", &claims.sub, "role-command")
.await
.unwrap();
let response = app
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/v1/commands/User/u1")
.header("authorization", format!("Bearer {token}"))
.header("content-type", "application/json")
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
}
#[tokio::test]
async fn tenant_spoofing_is_rejected() {
let (app, state) = test_app().await;
let (token, claims) = signup_and_get_claims(&app, &state.authn).await;
put_role(
&state.storage,
"role-command",
vec!["command.submit".to_string()],
)
.await
.unwrap();
assign_role(&state.storage, "tenant-a", &claims.sub, "role-command")
.await
.unwrap();
let response = app
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/v1/commands/User/u1")
.header("authorization", format!("Bearer {token}"))
.header("x-tenant-id", "tenant-b")
.header("content-type", "application/json")
.body(axum::body::Body::from(r#"{"payload":{}}"#))
.unwrap(),
)
.await
.unwrap();
assert_eq!(response.status(), StatusCode::FORBIDDEN);
}
#[tokio::test]
async fn role_assignment_enables_expected_action() {
let (app, state) = test_app().await;
let (token, claims) = signup_and_get_claims(&app, &state.authn).await;
put_role(
&state.storage,
"role-command",
vec!["command.submit".to_string()],
)
.await
.unwrap();
assign_role(&state.storage, "tenant-a", &claims.sub, "role-command")
.await
.unwrap();
let response = app
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/v1/commands/User/u1")
.header("authorization", format!("Bearer {token}"))
.header("x-tenant-id", "tenant-a")
.header("content-type", "application/json")
.body(axum::body::Body::from(r#"{"payload":{}}"#))
.unwrap(),
)
.await
.unwrap();
assert_eq!(response.status(), StatusCode::INTERNAL_SERVER_ERROR);
}
#[tokio::test]
async fn http_command_endpoint_returns_same_shape_as_grpc_response() {
use crate::grpc::proto;
use crate::routing::RoutingConfig;
use std::collections::HashMap;
#[derive(Default)]
struct Upstream;
#[async_trait::async_trait]
impl proto::command_service_server::CommandService for Upstream {
async fn submit_command(
&self,
request: tonic::Request<proto::SubmitCommandRequest>,
) -> Result<tonic::Response<proto::SubmitCommandResponse>, tonic::Status> {
let req = request.into_inner();
Ok(tonic::Response::new(proto::SubmitCommandResponse {
events: vec![proto::Event {
event_id: "e1".to_string(),
command_id: req.command_id,
aggregate_id: req.aggregate_id,
aggregate_type: req.aggregate_type,
version: 1,
event_type: "Created".to_string(),
payload_json: "{}".to_string(),
timestamp_rfc3339: "2020-01-01T00:00:00Z".to_string(),
}],
}))
}
}
let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
let addr = listener.local_addr().unwrap();
drop(listener);
tokio::spawn(async move {
tonic::transport::Server::builder()
.add_service(proto::command_service_server::CommandServiceServer::new(
Upstream,
))
.serve(addr)
.await
.unwrap();
});
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
let upstream_url = format!("http://{}", addr);
let cfg = RoutingConfig {
revision: 1,
aggregate_placement: HashMap::from([("tenant-a".to_string(), "a".to_string())]),
projection_placement: HashMap::new(),
runner_placement: HashMap::new(),
aggregate_shards: HashMap::from([("a".to_string(), vec![upstream_url])]),
projection_shards: HashMap::new(),
runner_shards: HashMap::new(),
};
let (app, state) = test_app_with_routing(cfg).await;
let (token, claims) = signup_and_get_claims(&app, &state.authn).await;
put_role(
&state.storage,
"role-command",
vec!["command.submit".to_string()],
)
.await
.unwrap();
assign_role(&state.storage, "tenant-a", &claims.sub, "role-command")
.await
.unwrap();
let response = app
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/v1/commands/User/u1")
.header("authorization", format!("Bearer {token}"))
.header("x-tenant-id", "tenant-a")
.header("content-type", "application/json")
.body(axum::body::Body::from(r#"{"payload":{}}"#))
.unwrap(),
)
.await
.unwrap();
assert_eq!(response.status(), StatusCode::OK);
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
.await
.unwrap();
let value: serde_json::Value = serde_json::from_slice(&body).unwrap();
assert!(
value
.get("events")
.and_then(|v| v.as_array())
.unwrap()
.len()
== 1
);
assert_eq!(
value.get("events").unwrap()[0]
.get("event_id")
.and_then(|v| v.as_str())
.unwrap(),
"e1"
);
}
#[tokio::test]
async fn query_endpoint_denies_unauthorized_and_forwards_when_authorized() {
use crate::routing::RoutingConfig;
use std::collections::HashMap;
let projection_app = axum::Router::new().route(
"/v1/query/TestView",
post(|headers: axum::http::HeaderMap| async move {
let correlation = headers
.get("x-correlation-id")
.and_then(|v| v.to_str().ok())
.unwrap_or("");
let traceparent = headers
.get("traceparent")
.and_then(|v| v.to_str().ok())
.unwrap_or("");
if correlation.trim().is_empty()
|| crate::trace_id_from_traceparent(traceparent).is_none()
{
return (StatusCode::BAD_REQUEST, "missing correlation");
}
(StatusCode::OK, r#"{"mode":"count"}"#)
}),
);
let projection_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
let projection_addr = projection_listener.local_addr().unwrap();
tokio::spawn(async move {
axum::serve(projection_listener, projection_app)
.await
.unwrap();
});
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
let projection_url = format!("http://{}", projection_addr);
let cfg = RoutingConfig {
revision: 1,
aggregate_placement: HashMap::new(),
projection_placement: HashMap::from([("tenant-a".to_string(), "p".to_string())]),
runner_placement: HashMap::new(),
aggregate_shards: HashMap::new(),
projection_shards: HashMap::from([("p".to_string(), vec![projection_url])]),
runner_shards: HashMap::new(),
};
let (app, state) = test_app_with_routing(cfg).await;
let (token, claims) = signup_and_get_claims(&app, &state.authn).await;
let deny = app
.clone()
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/v1/query/TestView")
.header("authorization", format!("Bearer {token}"))
.header("x-tenant-id", "tenant-a")
.header("content-type", "application/json")
.body(axum::body::Body::from(r#"{"uqf":"{}"}"#))
.unwrap(),
)
.await
.unwrap();
assert_eq!(deny.status(), StatusCode::FORBIDDEN);
put_role(
&state.storage,
"role-query",
vec!["query.execute".to_string()],
)
.await
.unwrap();
assign_role(&state.storage, "tenant-a", &claims.sub, "role-query")
.await
.unwrap();
let ok = app
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/v1/query/TestView")
.header("authorization", format!("Bearer {token}"))
.header("x-tenant-id", "tenant-a")
.header("content-type", "application/json")
.body(axum::body::Body::from(r#"{"uqf":"{}"}"#))
.unwrap(),
)
.await
.unwrap();
assert_eq!(ok.status(), StatusCode::OK);
assert!(!ok
.headers()
.get("x-correlation-id")
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.is_empty());
assert!(crate::trace_id_from_traceparent(
ok.headers()
.get("traceparent")
.and_then(|v| v.to_str().ok())
.unwrap_or("")
)
.is_some());
}
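    #[tokio::test]
    async fn invalid_tenant_header_is_rejected() {
        // Sketch: x-tenant-id values outside [A-Za-z0-9_-] must fail the
        // TenantId extractor with a 400 before any authz check runs.
        let (app, state) = test_app().await;
        let (token, _claims) = signup_and_get_claims(&app, &state.authn).await;
        let response = app
            .oneshot(
                axum::http::Request::builder()
                    .method("POST")
                    .uri("/v1/commands/User/u1")
                    .header("authorization", format!("Bearer {token}"))
                    .header("x-tenant-id", "bad tenant!")
                    .header("content-type", "application/json")
                    .body(axum::body::Body::from(r#"{"payload":{}}"#))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(response.status(), StatusCode::BAD_REQUEST);
    }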
}

275
gateway/src/grpc.rs Normal file

@@ -0,0 +1,275 @@
use crate::routing::RouterState;
use crate::routing::RoutingError;
use crate::routing::ServiceKind;
pub mod proto {
tonic::include_proto!("aggregate.gateway.v1");
}
#[derive(Clone)]
pub struct GatewayCommandService {
routing: RouterState,
}
impl GatewayCommandService {
pub fn new(routing: RouterState) -> Self {
Self { routing }
}
}
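// gRPC front door: ensure correlation and trace metadata exist (minting a
// W3C traceparent when absent or invalid), resolve the tenant's aggregate
// shard, and forward with x-tenant-id / x-correlation-id / traceparent
// attached to both the upstream request and the response.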
#[async_trait::async_trait]
impl proto::command_service_server::CommandService for GatewayCommandService {
async fn submit_command(
&self,
request: tonic::Request<proto::SubmitCommandRequest>,
) -> Result<tonic::Response<proto::SubmitCommandResponse>, tonic::Status> {
let correlation_id = request
.metadata()
.get("x-correlation-id")
.and_then(|v| v.to_str().ok())
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.map(|s| s.to_string())
.unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
let traceparent = request
.metadata()
.get("traceparent")
.and_then(|v| v.to_str().ok())
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.and_then(|s| {
if crate::trace_id_from_traceparent(s).is_some() {
Some(s.to_string())
} else {
None
}
})
.unwrap_or_else(|| {
let trace_id = uuid::Uuid::new_v4().simple().to_string();
let span_id = uuid::Uuid::new_v4().simple().to_string()[..16].to_string();
format!("00-{trace_id}-{span_id}-01")
});
let mut req = request.into_inner();
let tenant_id = req.tenant_id.trim().to_string();
if tenant_id.is_empty() {
return Err(tonic::Status::invalid_argument("tenant_id is required"));
}
req.tenant_id = tenant_id.clone();
let upstream = self
.routing
.resolve(&tenant_id, ServiceKind::Aggregate)
.await
.map_err(map_routing_error)?;
tracing::Span::current().record("upstream", upstream.as_str());
let channel = crate::upstream::grpc_endpoint(&upstream)
.map_err(|e| tonic::Status::unavailable(e.to_string()))?
.connect()
.await
.map_err(|e| tonic::Status::unavailable(e.to_string()))?;
let mut client = proto::command_service_client::CommandServiceClient::new(channel);
let mut upstream_req = tonic::Request::new(req);
if let Ok(v) = tonic::metadata::MetadataValue::try_from(tenant_id.as_str()) {
upstream_req.metadata_mut().insert("x-tenant-id", v);
}
if let Ok(v) = tonic::metadata::MetadataValue::try_from(correlation_id.as_str()) {
upstream_req.metadata_mut().insert("x-correlation-id", v);
}
if let Ok(v) = tonic::metadata::MetadataValue::try_from(traceparent.as_str()) {
upstream_req.metadata_mut().insert("traceparent", v);
}
let mut resp = client.submit_command(upstream_req).await?;
if let Ok(v) = tonic::metadata::MetadataValue::try_from(correlation_id.as_str()) {
resp.metadata_mut().insert("x-correlation-id", v);
}
if let Ok(v) = tonic::metadata::MetadataValue::try_from(traceparent.as_str()) {
resp.metadata_mut().insert("traceparent", v);
}
Ok(resp)
}
}
pub async fn submit_command_via_routing(
routing: &RouterState,
request: proto::SubmitCommandRequest,
ctx: &crate::RequestContext,
) -> Result<proto::SubmitCommandResponse, tonic::Status> {
let tenant_id = request.tenant_id.trim().to_string();
if tenant_id.is_empty() {
return Err(tonic::Status::invalid_argument("tenant_id is required"));
}
let upstream = routing
.resolve(&tenant_id, ServiceKind::Aggregate)
.await
.map_err(map_routing_error)?;
tracing::Span::current().record("upstream", upstream.as_str());
let channel = crate::upstream::grpc_endpoint(&upstream)
.map_err(|e| tonic::Status::unavailable(e.to_string()))?
.connect()
.await
.map_err(|e| tonic::Status::unavailable(e.to_string()))?;
let mut client = proto::command_service_client::CommandServiceClient::new(channel);
let mut upstream_req = tonic::Request::new(request);
if let Ok(v) = tonic::metadata::MetadataValue::try_from(tenant_id.as_str()) {
upstream_req.metadata_mut().insert("x-tenant-id", v);
}
if let Ok(v) = tonic::metadata::MetadataValue::try_from(ctx.correlation_id.as_str()) {
upstream_req.metadata_mut().insert("x-correlation-id", v);
}
if let Ok(v) = tonic::metadata::MetadataValue::try_from(ctx.traceparent.as_str()) {
upstream_req.metadata_mut().insert("traceparent", v);
}
let resp = client.submit_command(upstream_req).await?;
Ok(resp.into_inner())
}
fn map_routing_error(err: RoutingError) -> tonic::Status {
match err {
RoutingError::UnknownTenant => tonic::Status::not_found("unknown tenant"),
RoutingError::MissingShard | RoutingError::EmptyShard => {
tonic::Status::unavailable(err.to_string())
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::routing::RoutingConfig;
use std::collections::HashMap;
use std::sync::Arc;
#[tokio::test]
async fn grpc_submit_command_forwards_tenant_metadata_and_returns_events() {
use proto::command_service_server::CommandService;
#[derive(Default)]
struct Upstream;
#[async_trait::async_trait]
impl proto::command_service_server::CommandService for Upstream {
async fn submit_command(
&self,
request: tonic::Request<proto::SubmitCommandRequest>,
) -> Result<tonic::Response<proto::SubmitCommandResponse>, tonic::Status> {
let tenant_md = request
.metadata()
.get("x-tenant-id")
.and_then(|v| v.to_str().ok())
.unwrap_or("");
if tenant_md != request.get_ref().tenant_id {
return Err(tonic::Status::failed_precondition(
"missing tenant metadata",
));
}
let correlation = request
.metadata()
.get("x-correlation-id")
.and_then(|v| v.to_str().ok())
.unwrap_or("");
if correlation.trim().is_empty() {
return Err(tonic::Status::failed_precondition(
"missing correlation metadata",
));
}
let traceparent = request
.metadata()
.get("traceparent")
.and_then(|v| v.to_str().ok())
.unwrap_or("");
if crate::trace_id_from_traceparent(traceparent).is_none() {
return Err(tonic::Status::failed_precondition("missing traceparent"));
}
let resp = proto::SubmitCommandResponse {
events: vec![proto::Event {
event_id: "e1".to_string(),
command_id: request.get_ref().command_id.clone(),
aggregate_id: request.get_ref().aggregate_id.clone(),
aggregate_type: request.get_ref().aggregate_type.clone(),
version: 1,
event_type: "Created".to_string(),
payload_json: "{}".to_string(),
timestamp_rfc3339: "2020-01-01T00:00:00Z".to_string(),
}],
};
Ok(tonic::Response::new(resp))
}
}
let upstream_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
let upstream_addr = upstream_listener.local_addr().unwrap();
drop(upstream_listener);
let upstream_url = format!("http://{}", upstream_addr);
let upstream_task = tokio::spawn(async move {
tonic::transport::Server::builder()
.add_service(proto::command_service_server::CommandServiceServer::new(
Upstream,
))
.serve(upstream_addr)
.await
.unwrap();
});
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
let cfg = RoutingConfig {
revision: 1,
aggregate_placement: HashMap::from([("tenant-a".to_string(), "a".to_string())]),
projection_placement: HashMap::new(),
runner_placement: HashMap::new(),
aggregate_shards: HashMap::from([("a".to_string(), vec![upstream_url])]),
projection_shards: HashMap::new(),
runner_shards: HashMap::new(),
};
let routing =
crate::routing::RouterState::new(Arc::new(crate::routing::FixedSource::new(cfg)))
.await
.unwrap();
let svc = GatewayCommandService::new(routing);
let request = proto::SubmitCommandRequest {
tenant_id: "tenant-a".to_string(),
command_id: "c1".to_string(),
aggregate_id: "id1".to_string(),
aggregate_type: "User".to_string(),
payload_json: "{}".to_string(),
metadata: HashMap::new(),
};
let resp = CommandService::submit_command(&svc, tonic::Request::new(request))
.await
.unwrap();
assert!(!resp
.metadata()
.get("x-correlation-id")
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.is_empty());
assert!(crate::trace_id_from_traceparent(
resp.metadata()
.get("traceparent")
.and_then(|v| v.to_str().ok())
.unwrap_or("")
)
.is_some());
let resp = resp.into_inner();
assert_eq!(resp.events.len(), 1);
assert_eq!(resp.events[0].command_id, "c1");
upstream_task.abort();
}
}

541
gateway/src/lib.rs Normal file

@@ -0,0 +1,541 @@
use std::time::Duration;
use std::time::Instant;
use axum::error_handling::HandleErrorLayer;
use axum::extract::MatchedPath;
use axum::extract::State;
use axum::http::request::Parts;
use axum::http::HeaderName;
use axum::http::HeaderValue;
use axum::http::StatusCode;
use axum::middleware::Next;
use axum::response::IntoResponse;
use axum::routing::get;
use axum::BoxError;
use axum::Json;
use axum::Router;
use metrics_exporter_prometheus::PrometheusHandle;
use serde::Serialize;
use std::future::Future;
use std::pin::Pin;
use std::task::Context;
use std::task::Poll;
use tower::timeout::TimeoutLayer;
use tower::Layer;
use tower::Service;
use tower::ServiceBuilder;
use tower_http::limit::RequestBodyLimitLayer;
use tower_http::request_id::MakeRequestUuid;
use tower_http::request_id::PropagateRequestIdLayer;
use tower_http::request_id::SetRequestIdLayer;
use tower_http::trace::TraceLayer;
use tracing::Level;
#[derive(Debug, Clone)]
pub struct RequestContext {
pub request_id: String,
pub correlation_id: String,
pub traceparent: String,
pub trace_id: String,
}
#[async_trait::async_trait]
impl<S> axum::extract::FromRequestParts<S> for RequestContext
where
S: Send + Sync,
{
type Rejection = StatusCode;
async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
let request_id = parts
.headers
.get("x-request-id")
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.to_string();
let correlation_id = parts
.headers
.get("x-correlation-id")
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.to_string();
let traceparent = parts
.headers
.get("traceparent")
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.to_string();
let trace_id = trace_id_from_traceparent(&traceparent)
.map(|s| s.to_string())
.unwrap_or_default();
Ok(Self {
request_id,
correlation_id,
traceparent,
trace_id,
})
}
}
#[derive(Clone)]
pub struct AppState {
pub metrics: PrometheusHandle,
pub routing: routing::RouterState,
pub storage: storage::GatewayStorage,
pub authn: authn::AuthnConfig,
}
#[derive(Serialize)]
struct StatusResponse {
status: &'static str,
}
pub fn app(state: AppState) -> Router {
let request_id_header = HeaderName::from_static("x-request-id");
Router::new()
.route("/health", get(health))
.route("/ready", get(ready))
.route("/metrics", get(metrics))
.nest("/v1/auth", authn::router())
.nest("/v1", authz::router())
.nest("/admin/iam", admin_iam::router())
.nest("/v1/admin/iam", admin_iam::router())
.nest("/admin/rebalance", admin_rebalance::router())
.route("/admin/routing", get(admin_routing))
.route(
"/admin/runner/*path",
axum::routing::any(authz::runner_admin_proxy),
)
.route(
"/admin/routing/reload",
axum::routing::post(admin_routing_reload),
)
.route(
"/admin/routing/resolve",
axum::routing::get(admin_rebalance::resolve),
)
.route_layer(axum::middleware::from_fn(track_http_metrics))
.with_state(state)
.layer(
ServiceBuilder::new()
            .layer(HandleErrorLayer::new(|error: BoxError| async move {
                // Map middleware errors precisely: only timeouts are 408.
                if error.is::<tower::timeout::error::Elapsed>() {
                    (StatusCode::REQUEST_TIMEOUT, error.to_string())
                } else {
                    (StatusCode::INTERNAL_SERVER_ERROR, error.to_string())
                }
            }))
.layer(SetRequestIdLayer::new(
request_id_header.clone(),
MakeRequestUuid,
))
.layer(PropagateRequestIdLayer::new(request_id_header))
.layer(EnsureCorrelationTraceLayer)
.layer(TraceLayer::new_for_http().make_span_with(
|request: &axum::http::Request<_>| {
let request_id = request
.headers()
.get("x-request-id")
.and_then(|v| v.to_str().ok())
.unwrap_or("");
let correlation_id = request
.headers()
.get("x-correlation-id")
.and_then(|v| v.to_str().ok())
.unwrap_or("");
let traceparent = request
.headers()
.get("traceparent")
.and_then(|v| v.to_str().ok())
.unwrap_or("");
let trace_id = trace_id_from_traceparent(traceparent).unwrap_or("");
let path = request_path_for_logging(request);
tracing::span!(
Level::INFO,
"http.request",
method = %request.method(),
path = %path,
request_id = request_id,
correlation_id = correlation_id,
trace_id = trace_id,
tenant_id = tracing::field::Empty,
principal_id = tracing::field::Empty,
upstream = tracing::field::Empty,
)
},
))
.layer(RequestBodyLimitLayer::new(1024 * 1024))
.layer(TimeoutLayer::new(Duration::from_secs(30))),
)
}
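// Middleware: guarantees every request carries a non-empty x-correlation-id
// and a well-formed W3C traceparent before reaching handlers, and mirrors
// both headers onto the response when the handler did not set them.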
#[derive(Clone)]
struct EnsureCorrelationTraceLayer;
#[derive(Clone)]
struct EnsureCorrelationTrace<S> {
inner: S,
}
impl<S> Layer<S> for EnsureCorrelationTraceLayer {
type Service = EnsureCorrelationTrace<S>;
fn layer(&self, inner: S) -> Self::Service {
Self::Service { inner }
}
}
impl<S, ReqBody, ResBody> Service<axum::http::Request<ReqBody>> for EnsureCorrelationTrace<S>
where
S: Service<axum::http::Request<ReqBody>, Response = axum::http::Response<ResBody>>
+ Clone
+ Send
+ 'static,
S::Future: Send + 'static,
S::Error: Send + 'static,
ReqBody: Send + 'static,
ResBody: Send + 'static,
{
type Response = axum::http::Response<ResBody>;
type Error = S::Error;
type Future =
Pin<Box<dyn Future<Output = Result<axum::http::Response<ResBody>, S::Error>> + Send>>;
fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
self.inner.poll_ready(cx)
}
fn call(&mut self, mut req: axum::http::Request<ReqBody>) -> Self::Future {
let correlation_id = req
.headers()
.get("x-correlation-id")
.and_then(|v| v.to_str().ok())
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.map(|s| s.to_string())
.unwrap_or_else(generate_correlation_id);
let traceparent = req
.headers()
.get("traceparent")
.and_then(|v| v.to_str().ok())
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.and_then(|s| {
if trace_id_from_traceparent(s).is_some() {
Some(s.to_string())
} else {
None
}
})
.unwrap_or_else(generate_traceparent);
if let Ok(v) = HeaderValue::from_str(&correlation_id) {
req.headers_mut().insert("x-correlation-id", v);
}
if let Ok(v) = HeaderValue::from_str(&traceparent) {
req.headers_mut().insert("traceparent", v);
}
let mut inner = self.inner.clone();
Box::pin(async move {
let mut resp = inner.call(req).await?;
if resp.headers().get("x-correlation-id").is_none() {
if let Ok(v) = HeaderValue::from_str(&correlation_id) {
resp.headers_mut().insert("x-correlation-id", v);
}
}
if resp.headers().get("traceparent").is_none() {
if let Ok(v) = HeaderValue::from_str(&traceparent) {
resp.headers_mut().insert("traceparent", v);
}
}
Ok(resp)
})
}
}
fn generate_correlation_id() -> String {
uuid::Uuid::new_v4().to_string()
}
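// W3C trace context: version 00, a 32-hex-char trace-id, a 16-hex-char
// parent-id, and flags 01 (sampled). Uuid::simple() renders exactly 32
// lowercase hex characters, so the [..16] slice below is always in bounds.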
fn generate_traceparent() -> String {
let trace_id = uuid::Uuid::new_v4().simple().to_string();
let span_id = uuid::Uuid::new_v4().simple().to_string()[..16].to_string();
format!("00-{trace_id}-{span_id}-01")
}
pub(crate) fn trace_id_from_traceparent(traceparent: &str) -> Option<&str> {
shared::trace_id_from_traceparent(traceparent)
}
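// HTTP metrics are labelled with the matched route template (e.g.
// /v1/commands/:aggregate_type/:aggregate_id) rather than the raw path,
// keeping label cardinality bounded; the raw path is only used as a
// fallback when no route matched.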
async fn track_http_metrics(
req: axum::http::Request<axum::body::Body>,
next: Next,
) -> axum::response::Response {
let method = req.method().to_string();
let path = req
.extensions()
.get::<MatchedPath>()
.map(|p| p.as_str().to_string())
.unwrap_or_else(|| req.uri().path().to_string());
let start = Instant::now();
let response = next.run(req).await;
let status = response.status().as_u16().to_string();
let elapsed = start.elapsed().as_secs_f64();
metrics::counter!(
"gateway_http_requests_total",
"method" => method.clone(),
"path" => path.clone(),
"status" => status.clone()
)
.increment(1);
metrics::histogram!(
"gateway_http_request_duration_seconds",
"method" => method,
"path" => path,
"status" => status
)
.record(elapsed);
response
}
fn request_path_for_logging<B>(req: &axum::http::Request<B>) -> String {
req.extensions()
.get::<MatchedPath>()
.map(|p| p.as_str().to_string())
.unwrap_or_else(|| req.uri().path().to_string())
}
async fn health() -> impl IntoResponse {
metrics::counter!("gateway_health_requests_total").increment(1);
Json(StatusResponse { status: "ok" })
}
async fn ready() -> impl IntoResponse {
metrics::counter!("gateway_ready_requests_total").increment(1);
Json(StatusResponse { status: "ok" })
}
async fn metrics(State(state): State<AppState>) -> impl IntoResponse {
state.metrics.render()
}
async fn admin_routing(State(state): State<AppState>) -> impl IntoResponse {
Json(state.routing.snapshot().await)
}
async fn admin_routing_reload(State(state): State<AppState>) -> impl IntoResponse {
match state.routing.reload().await {
Ok(()) => StatusCode::NO_CONTENT.into_response(),
Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()).into_response(),
}
}
pub mod http {}
pub mod admin_iam;
pub mod admin_rebalance;
pub mod authn;
pub mod authz;
pub mod grpc;
pub mod routing;
pub mod upstream;
pub mod config {}
pub mod storage;
pub mod observability {
use edge_logger_client::Config as EdgeLoggerConfig;
use edge_logger_client::EdgeLoggerLayer;
use metrics_exporter_prometheus::PrometheusBuilder;
use metrics_exporter_prometheus::PrometheusHandle;
use std::time::Duration;
use tracing_subscriber::prelude::*;
pub fn init_tracing() {
let filter = std::env::var("RUST_LOG").unwrap_or_else(|_| "info".to_string());
let env_filter = tracing_subscriber::EnvFilter::new(filter);
let fmt_layer = tracing_subscriber::fmt::layer().json();
let edge_layer = edge_logger_layer_from_env("gateway");
let registry = tracing_subscriber::registry()
.with(env_filter)
.with(fmt_layer);
let _ = match edge_layer {
Some(layer) => registry.with(layer).try_init(),
None => registry.try_init(),
};
}
pub fn init_metrics() -> PrometheusHandle {
PrometheusBuilder::new()
.install_recorder()
.expect("failed to install Prometheus recorder")
}
pub fn init_metrics_for_tests() -> PrometheusHandle {
PrometheusBuilder::new().build_recorder().handle()
}
fn edge_logger_layer_from_env(service_name: &str) -> Option<EdgeLoggerLayer> {
let enabled = std::env::var("EDGE_LOGGER_ENABLED")
.ok()
.map(|v| matches!(v.trim().to_ascii_lowercase().as_str(), "1" | "true" | "yes"))
.unwrap_or(false);
let socket_path = std::env::var("EDGE_LOGGER_SOCKET_PATH").ok();
if !enabled && socket_path.is_none() {
return None;
}
let environment = std::env::var("EDGE_LOGGER_ENVIRONMENT")
.or_else(|_| std::env::var("ENVIRONMENT"))
.unwrap_or_else(|_| "production".to_string());
let tenant_id =
std::env::var("EDGE_LOGGER_TENANT_ID").unwrap_or_else(|_| "default".to_string());
let batch_size = std::env::var("EDGE_LOGGER_BATCH_SIZE")
.ok()
.and_then(|v| v.parse::<usize>().ok())
.unwrap_or(100);
let flush_interval = std::env::var("EDGE_LOGGER_FLUSH_INTERVAL_MS")
.ok()
.and_then(|v| v.parse::<u64>().ok())
.map(Duration::from_millis)
.unwrap_or(Duration::from_secs(1));
Some(EdgeLoggerLayer::new(EdgeLoggerConfig {
socket_path: socket_path
.unwrap_or_else(|| "/var/run/edge-logger/logger.sock".to_string()),
service: service_name.to_string(),
environment,
tenant_id,
batch_size,
flush_interval,
}))
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Arc;
use std::sync::OnceLock;
use tower::util::ServiceExt;
fn assert_send_sync<T: Send + Sync>() {}
#[test]
fn app_state_is_send_sync() {
assert_send_sync::<AppState>();
}
#[tokio::test]
async fn health_returns_200() {
let metrics = crate::observability::init_metrics_for_tests();
let routing = crate::routing::RouterState::new(Arc::new(crate::routing::FixedSource::new(
crate::routing::RoutingConfig::empty(),
)))
.await
.unwrap();
let storage = crate::storage::GatewayStorage::new_in_memory();
let authn = crate::authn::AuthnConfig::for_tests();
let app = app(AppState {
metrics,
routing,
storage,
authn,
});
let response = app
.oneshot(
axum::http::Request::builder()
.method("GET")
.uri("/health")
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(response.status(), axum::http::StatusCode::OK);
}
#[test]
fn docker_stack_yml_is_valid_yaml() {
let raw = std::fs::read_to_string("../swarm/stacks/platform.yml").unwrap();
let parsed: serde_yaml::Value = serde_yaml::from_str(&raw).unwrap();
assert!(parsed.as_mapping().is_some());
}
#[tokio::test]
async fn metrics_include_http_request_counters() {
static HANDLE: OnceLock<PrometheusHandle> = OnceLock::new();
let metrics = HANDLE
.get_or_init(|| {
metrics_exporter_prometheus::PrometheusBuilder::new()
.install_recorder()
.unwrap()
})
.clone();
let routing = crate::routing::RouterState::new(Arc::new(crate::routing::FixedSource::new(
crate::routing::RoutingConfig::empty(),
)))
.await
.unwrap();
let storage = crate::storage::GatewayStorage::new_in_memory();
let authn = crate::authn::AuthnConfig::for_tests();
let app = app(AppState {
metrics,
routing,
storage,
authn,
});
let _ = app
.clone()
.oneshot(
axum::http::Request::builder()
.method("GET")
.uri("/health")
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
let resp = app
.oneshot(
axum::http::Request::builder()
.method("GET")
.uri("/metrics")
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
.await
.unwrap();
let rendered = String::from_utf8_lossy(&body);
assert!(rendered.contains("gateway_http_requests_total"));
}
#[test]
fn request_path_for_logging_does_not_include_query() {
let req = axum::http::Request::builder()
.method("GET")
.uri("/v1/auth/oidc/google/callback?code=supersecret&state=x")
.body(axum::body::Body::empty())
.unwrap();
let path = request_path_for_logging(&req);
assert_eq!(path, "/v1/auth/oidc/google/callback");
assert!(!path.contains("supersecret"));
}
}

130
gateway/src/main.rs Normal file

@@ -0,0 +1,130 @@
use std::net::SocketAddr;
use std::sync::Arc;
use gateway::observability;
use gateway::routing;
use gateway::storage;
use gateway::AppState;
#[tokio::main]
async fn main() -> anyhow::Result<()> {
observability::init_tracing();
let metrics = observability::init_metrics();
let authn = gateway::authn::AuthnConfig::from_env();
let build_version = option_env!("GATEWAY_BUILD_VERSION").unwrap_or("dev");
let build_sha = option_env!("GATEWAY_BUILD_SHA").unwrap_or("unknown");
tracing::info!(build_version, build_sha, "gateway starting");
let addr: SocketAddr = std::env::var("GATEWAY_ADDR")
.unwrap_or_else(|_| "0.0.0.0:8080".to_string())
.parse()?;
let storage_path =
std::env::var("GATEWAY_STORAGE_PATH").unwrap_or_else(|_| "./data/gateway.mdbx".to_string());
if let Some(parent) = std::path::Path::new(&storage_path).parent() {
let _ = std::fs::create_dir_all(parent);
}
let storage = storage::GatewayStorage::open_edge_storage(storage_path, "gateway")
.unwrap_or_else(|err| {
tracing::warn!(error = ?err, "failed to open persistent storage; falling back to in-memory (sessions will not survive restarts)");
storage::GatewayStorage::new_in_memory()
});
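// Routing source precedence: an explicit file wins, then NATS KV, then an
// empty fixed table (every lookup fails with UnknownTenant).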
let routing_source: Arc<dyn routing::RoutingSource> =
if let Ok(path) = std::env::var("GATEWAY_ROUTING_FILE") {
Arc::new(routing::StaticFileSource::new(path))
} else if let (Ok(nats_url), Ok(bucket), Ok(key)) = (
std::env::var("GATEWAY_ROUTING_NATS_URL"),
std::env::var("GATEWAY_ROUTING_NATS_BUCKET"),
std::env::var("GATEWAY_ROUTING_NATS_KEY"),
) {
Arc::new(routing::NatsKvSource::connect(nats_url, bucket, key).await?)
} else {
Arc::new(routing::FixedSource::new(routing::RoutingConfig::empty()))
};
let routing = routing::RouterState::new(routing_source).await?;
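// Dropping a JoinHandle detaches the task rather than aborting it, so the
// watcher runs for the process lifetime; the binding just documents ownership.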
let _routing_watcher = routing.start_watcher();
let grpc_addr: SocketAddr = std::env::var("GATEWAY_GRPC_ADDR")
.unwrap_or_else(|_| "0.0.0.0:8081".to_string())
.parse()?;
let state = AppState {
metrics,
routing,
storage,
authn,
};
let app = gateway::app(state.clone());
let listener = tokio::net::TcpListener::bind(addr).await?;
tracing::info!(%addr, "gateway listening");
tracing::info!(%grpc_addr, "gateway grpc listening");
let (shutdown_tx, _shutdown_rx) = tokio::sync::broadcast::channel::<()>(2);
let shutdown_task = {
let shutdown_tx = shutdown_tx.clone();
tokio::spawn(async move {
shutdown_signal().await;
let _ = shutdown_tx.send(());
})
};
let http_task = {
let mut shutdown_rx = shutdown_tx.subscribe();
tokio::spawn(async move {
axum::serve(listener, app)
.with_graceful_shutdown(async move {
let _ = shutdown_rx.recv().await;
})
.await
.unwrap();
})
};
let grpc_task = {
let mut shutdown_rx = shutdown_tx.subscribe();
let svc = gateway::grpc::GatewayCommandService::new(state.routing.clone());
tokio::spawn(async move {
tonic::transport::Server::builder()
.add_service(
gateway::grpc::proto::command_service_server::CommandServiceServer::new(svc),
)
.serve_with_shutdown(grpc_addr, async move {
let _ = shutdown_rx.recv().await;
})
.await
.unwrap();
})
};
tokio::select! {
_ = http_task => {},
_ = grpc_task => {},
}
let _ = shutdown_task.await;
Ok(())
}
async fn shutdown_signal() {
let ctrl_c = async {
let _ = tokio::signal::ctrl_c().await;
};
#[cfg(unix)]
let terminate = async {
let mut sigterm = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())
.expect("failed to register SIGTERM handler");
sigterm.recv().await;
};
#[cfg(not(unix))]
let terminate = std::future::pending::<()>();
tokio::select! {
_ = ctrl_c => {},
_ = terminate => {},
}
}
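For reference, a minimal sketch of the YAML shape `GATEWAY_ROUTING_FILE` must carry: every `RoutingConfig` field is required, since the struct declares no serde defaults. The temp path and test name below are illustrative, not part of the repo.

#[tokio::test]
async fn static_file_source_accepts_full_yaml_config() {
    use gateway::routing::RoutingSource;
    let path = std::env::temp_dir().join("gateway-routing-example.yaml");
    tokio::fs::write(
        &path,
        concat!(
            "revision: 1\n",
            "aggregate_placement: { t1: shard-a }\n",
            "projection_placement: {}\n",
            "runner_placement: {}\n",
            "aggregate_shards: { shard-a: [\"http://aggregate-1:8080\"] }\n",
            "projection_shards: {}\n",
            "runner_shards: {}\n",
        ),
    )
    .await
    .unwrap();
    let cfg = gateway::routing::StaticFileSource::new(path.to_string_lossy())
        .load()
        .await
        .unwrap();
    assert_eq!(cfg.revision, 1);
}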

456
gateway/src/routing.rs Normal file

@@ -0,0 +1,456 @@
use std::collections::HashMap;
use std::sync::Arc;
use futures::StreamExt;
use serde::Deserialize;
use serde::Serialize;
use thiserror::Error;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ServiceKind {
Aggregate,
Projection,
Runner,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct RoutingConfig {
pub revision: u64,
pub aggregate_placement: HashMap<String, String>,
pub projection_placement: HashMap<String, String>,
pub runner_placement: HashMap<String, String>,
pub aggregate_shards: HashMap<String, Vec<String>>,
pub projection_shards: HashMap<String, Vec<String>>,
pub runner_shards: HashMap<String, Vec<String>>,
}
impl RoutingConfig {
pub fn empty() -> Self {
Self {
revision: 0,
aggregate_placement: HashMap::new(),
projection_placement: HashMap::new(),
runner_placement: HashMap::new(),
aggregate_shards: HashMap::new(),
projection_shards: HashMap::new(),
runner_shards: HashMap::new(),
}
}
}
#[derive(Debug, Clone, Serialize)]
pub struct RoutingTable {
pub revision: u64,
aggregate_placement: HashMap<String, String>,
projection_placement: HashMap<String, String>,
runner_placement: HashMap<String, String>,
aggregate_shards: HashMap<String, Vec<String>>,
projection_shards: HashMap<String, Vec<String>>,
runner_shards: HashMap<String, Vec<String>>,
}
impl From<RoutingConfig> for RoutingTable {
fn from(value: RoutingConfig) -> Self {
Self {
revision: value.revision,
aggregate_placement: value.aggregate_placement,
projection_placement: value.projection_placement,
runner_placement: value.runner_placement,
aggregate_shards: value.aggregate_shards,
projection_shards: value.projection_shards,
runner_shards: value.runner_shards,
}
}
}
#[derive(Debug, Error, Clone, PartialEq, Eq)]
pub enum RoutingError {
#[error("unknown tenant")]
UnknownTenant,
#[error("missing shard directory entry")]
MissingShard,
#[error("no endpoints for shard")]
EmptyShard,
}
#[derive(Clone)]
pub struct RouterState {
table: Arc<tokio::sync::RwLock<Arc<RoutingTable>>>,
source: Arc<dyn RoutingSource>,
}
impl std::fmt::Debug for RouterState {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("RouterState").finish_non_exhaustive()
}
}
impl RouterState {
pub async fn new(source: Arc<dyn RoutingSource>) -> Result<Self, RoutingSourceError> {
let cfg = source.load().await?;
Ok(Self {
table: Arc::new(tokio::sync::RwLock::new(Arc::new(cfg.into()))),
source,
})
}
pub async fn snapshot(&self) -> Arc<RoutingTable> {
self.table.read().await.clone()
}
pub async fn reload(&self) -> Result<(), RoutingSourceError> {
let cfg = self.source.load().await?;
let next = Arc::new(RoutingTable::from(cfg));
*self.table.write().await = next;
Ok(())
}
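/// Spawn a background task that reloads the table whenever the source signals
/// a change. If the source cannot be watched, the task exits and the table
/// only changes through explicit `reload()` calls.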
pub fn start_watcher(&self) -> tokio::task::JoinHandle<()> {
let this = self.clone();
tokio::spawn(async move {
let mut stream = match this.source.watch().await {
Ok(s) => s,
Err(err) => {
tracing::warn!(error = %err, "routing watch unavailable; table will only change via explicit reloads");
return;
}
};
while let Some(msg) = stream.next().await {
if msg.is_err() {
continue;
}
if let Err(err) = this.reload().await {
tracing::warn!(error = %err, "routing reload failed; keeping previous table");
}
}
})
}
pub async fn resolve(
&self,
tenant_id: &str,
kind: ServiceKind,
) -> Result<String, RoutingError> {
let table = self.snapshot().await;
let result = table.resolve(tenant_id, kind);
metrics::counter!(
"gateway_routing_resolutions_total",
"kind" => kind_label(kind),
"result" => if result.is_ok() { "ok" } else { "err" }
)
.increment(1);
result
}
}
fn kind_label(kind: ServiceKind) -> &'static str {
match kind {
ServiceKind::Aggregate => "aggregate",
ServiceKind::Projection => "projection",
ServiceKind::Runner => "runner",
}
}
impl RoutingTable {
pub fn resolve(&self, tenant_id: &str, kind: ServiceKind) -> Result<String, RoutingError> {
let shard_id = match kind {
ServiceKind::Aggregate => self.aggregate_placement.get(tenant_id),
ServiceKind::Projection => self.projection_placement.get(tenant_id),
ServiceKind::Runner => self.runner_placement.get(tenant_id),
}
.ok_or(RoutingError::UnknownTenant)?;
let endpoints = match kind {
ServiceKind::Aggregate => self.aggregate_shards.get(shard_id),
ServiceKind::Projection => self.projection_shards.get(shard_id),
ServiceKind::Runner => self.runner_shards.get(shard_id),
}
.ok_or(RoutingError::MissingShard)?;
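// No client-side load balancing yet: the first endpoint listed for a shard wins.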
endpoints.first().cloned().ok_or(RoutingError::EmptyShard)
}
}
#[derive(Debug, Error)]
pub enum RoutingSourceError {
#[error("source error: {0}")]
Source(String),
#[error("decode error: {0}")]
Decode(String),
}
#[async_trait::async_trait]
pub trait RoutingSource: Send + Sync {
async fn load(&self) -> Result<RoutingConfig, RoutingSourceError>;
async fn watch(
&self,
) -> Result<
std::pin::Pin<Box<dyn futures::Stream<Item = Result<(), RoutingSourceError>> + Send>>,
RoutingSourceError,
>;
}
#[derive(Clone)]
pub struct FixedSource {
cfg: RoutingConfig,
}
impl FixedSource {
pub fn new(cfg: RoutingConfig) -> Self {
Self { cfg }
}
}
#[async_trait::async_trait]
impl RoutingSource for FixedSource {
async fn load(&self) -> Result<RoutingConfig, RoutingSourceError> {
Ok(self.cfg.clone())
}
async fn watch(
&self,
) -> Result<
std::pin::Pin<Box<dyn futures::Stream<Item = Result<(), RoutingSourceError>> + Send>>,
RoutingSourceError,
> {
Ok(Box::pin(futures::stream::empty()))
}
}
#[derive(Clone)]
pub struct StaticFileSource {
path: String,
}
impl StaticFileSource {
pub fn new(path: impl Into<String>) -> Self {
Self { path: path.into() }
}
}
#[async_trait::async_trait]
impl RoutingSource for StaticFileSource {
async fn load(&self) -> Result<RoutingConfig, RoutingSourceError> {
let raw = tokio::fs::read_to_string(&self.path)
.await
.map_err(|e| RoutingSourceError::Source(e.to_string()))?;
if self.path.ends_with(".json") {
serde_json::from_str::<RoutingConfig>(&raw)
.map_err(|e| RoutingSourceError::Decode(e.to_string()))
} else {
let yaml: serde_yaml::Value = serde_yaml::from_str(&raw)
.map_err(|e| RoutingSourceError::Decode(e.to_string()))?;
let json = serde_json::to_value(yaml)
.map_err(|e| RoutingSourceError::Decode(e.to_string()))?;
serde_json::from_value::<RoutingConfig>(json)
.map_err(|e| RoutingSourceError::Decode(e.to_string()))
}
}
async fn watch(
&self,
) -> Result<
std::pin::Pin<Box<dyn futures::Stream<Item = Result<(), RoutingSourceError>> + Send>>,
RoutingSourceError,
> {
Ok(Box::pin(futures::stream::empty()))
}
}
#[derive(Clone)]
pub struct NatsKvSource {
kv: async_nats::jetstream::kv::Store,
key: String,
}
impl NatsKvSource {
pub async fn connect(
nats_url: impl Into<String>,
bucket: impl Into<String>,
key: impl Into<String>,
) -> Result<Self, RoutingSourceError> {
let nats_url = nats_url.into();
let bucket = bucket.into();
let key = key.into();
let client = async_nats::connect(nats_url)
.await
.map_err(|e| RoutingSourceError::Source(e.to_string()))?;
let jetstream = async_nats::jetstream::new(client);
let kv = match jetstream.get_key_value(&bucket).await {
Ok(kv) => kv,
Err(_) => jetstream
.create_key_value(async_nats::jetstream::kv::Config {
bucket: bucket.clone(),
..Default::default()
})
.await
.map_err(|e| RoutingSourceError::Source(e.to_string()))?,
};
Ok(Self { kv, key })
}
}
#[async_trait::async_trait]
impl RoutingSource for NatsKvSource {
async fn load(&self) -> Result<RoutingConfig, RoutingSourceError> {
let entry = self
.kv
.entry(&self.key)
.await
.map_err(|e| RoutingSourceError::Source(e.to_string()))?;
let Some(entry) = entry else {
return Ok(RoutingConfig::empty());
};
serde_json::from_slice::<RoutingConfig>(&entry.value)
.map_err(|e| RoutingSourceError::Decode(e.to_string()))
}
async fn watch(
&self,
) -> Result<
std::pin::Pin<Box<dyn futures::Stream<Item = Result<(), RoutingSourceError>> + Send>>,
RoutingSourceError,
> {
let key = self.key.clone();
let watch = self
.kv
.watch(&key)
.await
.map_err(|e| RoutingSourceError::Source(e.to_string()))?;
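// Only Put events trigger a reload; Delete/Purge are filtered out so a
// transient key wipe does not clear the live table.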
Ok(Box::pin(watch.filter_map(|entry| async move {
match entry {
Ok(entry) => match entry.operation {
async_nats::jetstream::kv::Operation::Put => Some(Ok(())),
async_nats::jetstream::kv::Operation::Delete
| async_nats::jetstream::kv::Operation::Purge => None,
},
Err(e) => Some(Err(RoutingSourceError::Source(e.to_string()))),
}
})))
}
}
#[cfg(test)]
mod tests {
use super::*;
fn assert_send_sync<T: Send + Sync>() {}
#[test]
fn router_state_is_send_sync() {
assert_send_sync::<RouterState>();
}
#[tokio::test]
async fn resolves_endpoints_for_tenant_service_kind() {
let cfg = RoutingConfig {
revision: 1,
aggregate_placement: HashMap::from([("t1".to_string(), "a".to_string())]),
projection_placement: HashMap::from([("t1".to_string(), "p".to_string())]),
runner_placement: HashMap::from([("t1".to_string(), "r".to_string())]),
aggregate_shards: HashMap::from([("a".to_string(), vec!["http://a".to_string()])]),
projection_shards: HashMap::from([("p".to_string(), vec!["http://p".to_string()])]),
runner_shards: HashMap::from([("r".to_string(), vec!["http://r".to_string()])]),
};
let source: Arc<dyn RoutingSource> = Arc::new(TestSource::new(cfg));
let router = RouterState::new(source).await.unwrap();
assert_eq!(
router.resolve("t1", ServiceKind::Aggregate).await.unwrap(),
"http://a"
);
assert_eq!(
router.resolve("t1", ServiceKind::Projection).await.unwrap(),
"http://p"
);
assert_eq!(
router.resolve("t1", ServiceKind::Runner).await.unwrap(),
"http://r"
);
}
#[tokio::test]
async fn unknown_tenant_is_typed_error() {
let source: Arc<dyn RoutingSource> = Arc::new(TestSource::new(RoutingConfig::empty()));
let router = RouterState::new(source).await.unwrap();
let err = router
.resolve("missing", ServiceKind::Aggregate)
.await
.unwrap_err();
assert_eq!(err, RoutingError::UnknownTenant);
}
#[tokio::test]
async fn hot_reload_swaps_table_atomically() {
let cfg1 = RoutingConfig {
revision: 1,
aggregate_placement: HashMap::from([("t1".to_string(), "a".to_string())]),
projection_placement: HashMap::new(),
runner_placement: HashMap::new(),
aggregate_shards: HashMap::from([("a".to_string(), vec!["http://a1".to_string()])]),
projection_shards: HashMap::new(),
runner_shards: HashMap::new(),
};
let cfg2 = RoutingConfig {
revision: 2,
aggregate_placement: HashMap::from([("t1".to_string(), "a".to_string())]),
projection_placement: HashMap::new(),
runner_placement: HashMap::new(),
aggregate_shards: HashMap::from([("a".to_string(), vec!["http://a2".to_string()])]),
projection_shards: HashMap::new(),
runner_shards: HashMap::new(),
};
let test_source = Arc::new(TestSource::new(cfg1));
let router = RouterState::new(test_source.clone()).await.unwrap();
let before = router.resolve("t1", ServiceKind::Aggregate).await.unwrap();
assert_eq!(before, "http://a1");
test_source.set(cfg2).await;
router.reload().await.unwrap();
let after = router.resolve("t1", ServiceKind::Aggregate).await.unwrap();
assert_eq!(after, "http://a2");
}
#[derive(Clone)]
struct TestSource {
cfg: Arc<tokio::sync::RwLock<RoutingConfig>>,
}
impl TestSource {
fn new(cfg: RoutingConfig) -> Self {
Self {
cfg: Arc::new(tokio::sync::RwLock::new(cfg)),
}
}
async fn set(&self, cfg: RoutingConfig) {
*self.cfg.write().await = cfg;
}
}
#[async_trait::async_trait]
impl RoutingSource for TestSource {
async fn load(&self) -> Result<RoutingConfig, RoutingSourceError> {
Ok(self.cfg.read().await.clone())
}
async fn watch(
&self,
) -> Result<
std::pin::Pin<Box<dyn futures::Stream<Item = Result<(), RoutingSourceError>> + Send>>,
RoutingSourceError,
> {
Ok(Box::pin(futures::stream::empty()))
}
}
}
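The JSON wire format stored under the NATS KV key uses the same snake_case field names. A hedged round-trip sketch that could sit in the tests module above (values are illustrative):

#[test]
fn routing_config_json_round_trip_resolves() {
    let raw = r#"{
        "revision": 7,
        "aggregate_placement": {"t1": "shard-a"},
        "projection_placement": {},
        "runner_placement": {},
        "aggregate_shards": {"shard-a": ["http://aggregate-1:8080"]},
        "projection_shards": {},
        "runner_shards": {}
    }"#;
    let cfg: RoutingConfig = serde_json::from_str(raw).unwrap();
    let table = RoutingTable::from(cfg);
    assert_eq!(
        table.resolve("t1", ServiceKind::Aggregate).unwrap(),
        "http://aggregate-1:8080"
    );
    assert_eq!(
        table.resolve("t2", ServiceKind::Aggregate).unwrap_err(),
        RoutingError::UnknownTenant
    );
}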

1015
gateway/src/storage.rs Normal file

File diff suppressed because it is too large

99
gateway/src/upstream.rs Normal file

@@ -0,0 +1,99 @@
use std::sync::OnceLock;
use std::time::Duration;
pub fn http_client() -> &'static reqwest::Client {
static CLIENT: OnceLock<reqwest::Client> = OnceLock::new();
CLIENT.get_or_init(|| {
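// Built once per process: TLS material is read from env/secret files on the
// first call and cached, so later environment changes are ignored.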
let mut builder = reqwest::Client::builder().timeout(Duration::from_secs(10));
if let Some(ca_pem) = env_or_file(
"GATEWAY_INTERNAL_CA_CERT_PEM",
"GATEWAY_INTERNAL_CA_CERT_PEM_FILE",
) {
match reqwest::Certificate::from_pem(ca_pem.as_bytes()) {
Ok(cert) => builder = builder.add_root_certificate(cert),
Err(err) => tracing::warn!(error = %err, "ignoring invalid internal CA certificate PEM"),
}
}
if let Some(identity_pem) = env_or_file(
"GATEWAY_INTERNAL_IDENTITY_PEM",
"GATEWAY_INTERNAL_IDENTITY_PEM_FILE",
) {
match reqwest::Identity::from_pem(identity_pem.as_bytes()) {
Ok(identity) => builder = builder.identity(identity),
Err(err) => tracing::warn!(error = %err, "ignoring invalid internal client identity PEM"),
}
}
builder.build().expect("failed to build reqwest client")
})
}
pub fn grpc_endpoint(url: &str) -> Result<tonic::transport::Endpoint, tonic::transport::Error> {
let mut endpoint =
tonic::transport::Endpoint::from_shared(url.to_string())?.timeout(Duration::from_secs(10));
let wants_tls = url.starts_with("https://")
|| std::env::var("GATEWAY_INTERNAL_GRPC_TLS")
.ok()
.map(|v| matches!(v.trim().to_ascii_lowercase().as_str(), "1" | "true" | "yes"))
.unwrap_or(false);
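// If TLS is requested but no CA/identity material is configured below, the
// endpoint keeps tonic's defaults; with only the `tls` feature enabled this
// will likely fail verification against an https upstream, so set a CA.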
if wants_tls {
if let Some(tls) = grpc_tls_config() {
endpoint = endpoint.tls_config(tls)?;
}
}
Ok(endpoint)
}
fn grpc_tls_config() -> Option<tonic::transport::ClientTlsConfig> {
let mut tls = tonic::transport::ClientTlsConfig::new();
let mut configured = false;
if let Some(ca_pem) = env_or_file(
"GATEWAY_INTERNAL_GRPC_CA_CERT_PEM",
"GATEWAY_INTERNAL_GRPC_CA_CERT_PEM_FILE",
) {
tls = tls.ca_certificate(tonic::transport::Certificate::from_pem(ca_pem));
configured = true;
}
let cert_pem = env_or_file(
"GATEWAY_INTERNAL_GRPC_CLIENT_CERT_PEM",
"GATEWAY_INTERNAL_GRPC_CLIENT_CERT_PEM_FILE",
);
let key_pem = env_or_file(
"GATEWAY_INTERNAL_GRPC_CLIENT_KEY_PEM",
"GATEWAY_INTERNAL_GRPC_CLIENT_KEY_PEM_FILE",
);
if let (Some(cert_pem), Some(key_pem)) = (cert_pem, key_pem) {
tls = tls.identity(tonic::transport::Identity::from_pem(cert_pem, key_pem));
configured = true;
}
if configured {
Some(tls)
} else {
None
}
}
fn env_or_file(env_key: &str, file_env_key: &str) -> Option<String> {
if let Ok(path) = std::env::var(file_env_key) {
if let Ok(raw) = std::fs::read_to_string(path) {
let trimmed = raw.trim().to_string();
if !trimmed.is_empty() {
return Some(trimmed);
}
}
}
std::env::var(env_key).ok().and_then(|v| {
let trimmed = v.trim().to_string();
if trimmed.is_empty() {
None
} else {
Some(trimmed)
}
})
}
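Putting the two halves together, a hedged sketch of how a caller might combine RouterState::resolve with the shared client. `probe_aggregate` and the `/health` path are illustrative, not part of the gateway API:

// Hypothetical helper: resolve the tenant's aggregate endpoint and probe it
// over the shared (optionally mTLS-configured) client.
async fn probe_aggregate(
    routing: &crate::routing::RouterState,
    tenant_id: &str,
) -> anyhow::Result<reqwest::StatusCode> {
    let endpoint = routing
        .resolve(tenant_id, crate::routing::ServiceKind::Aggregate)
        .await?;
    let resp = http_client().get(format!("{endpoint}/health")).send().await?;
    Ok(resp.status())
}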

158
gateway/tests/ha_local.rs Normal file

@@ -0,0 +1,158 @@
use std::sync::Arc;
use tower::util::ServiceExt;
#[tokio::test]
async fn t9_2_and_t9_3_ready_is_healthy_on_both_replicas_and_survives_one_replica_down() {
let (app1, app2, _state) = build_two_replicas().await;
let r1 = app1
.clone()
.oneshot(
axum::http::Request::builder()
.method("GET")
.uri("/ready")
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(r1.status(), axum::http::StatusCode::OK);
let r2 = app2
.clone()
.oneshot(
axum::http::Request::builder()
.method("GET")
.uri("/ready")
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(r2.status(), axum::http::StatusCode::OK);
drop(app1);
let r2 = app2
.oneshot(
axum::http::Request::builder()
.method("GET")
.uri("/ready")
.body(axum::body::Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(r2.status(), axum::http::StatusCode::OK);
}
#[tokio::test]
async fn t9_4_refresh_works_across_replicas_without_sticky_sessions() {
let (app1, app2, state) = build_two_replicas().await;
let signup = app1
.clone()
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/v1/auth/signup")
.header("content-type", "application/json")
.body(axum::body::Body::from(
r#"{"email":"ha@b.com","password":"password123"}"#,
))
.unwrap(),
)
.await
.unwrap();
assert_eq!(signup.status(), axum::http::StatusCode::OK);
let body = axum::body::to_bytes(signup.into_body(), usize::MAX)
.await
.unwrap();
let created: gateway::authn::AuthResponse = serde_json::from_slice(&body).unwrap();
let refresh_req = serde_json::to_vec(&gateway::authn::RefreshRequest {
session_id: created.session_id.clone(),
refresh_token: created.refresh_token.clone(),
})
.unwrap();
let refresh = app2
.clone()
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/v1/auth/refresh")
.header("content-type", "application/json")
.body(axum::body::Body::from(refresh_req))
.unwrap(),
)
.await
.unwrap();
assert_eq!(refresh.status(), axum::http::StatusCode::OK);
let body = axum::body::to_bytes(refresh.into_body(), usize::MAX)
.await
.unwrap();
let refreshed: gateway::authn::AuthResponse = serde_json::from_slice(&body).unwrap();
assert_ne!(refreshed.refresh_token, created.refresh_token);
let refresh_again_req = serde_json::to_vec(&gateway::authn::RefreshRequest {
session_id: created.session_id.clone(),
refresh_token: created.refresh_token.clone(),
})
.unwrap();
let refresh_again = app1
.oneshot(
axum::http::Request::builder()
.method("POST")
.uri("/v1/auth/refresh")
.header("content-type", "application/json")
.body(axum::body::Body::from(refresh_again_req))
.unwrap(),
)
.await
.unwrap();
assert_eq!(refresh_again.status(), axum::http::StatusCode::UNAUTHORIZED);
let stored = state
.storage
.refresh_sessions
.get(&format!("v1/sessions/{}", created.session_id))
.await
.unwrap()
.unwrap();
let value: serde_json::Value = serde_json::from_slice(&stored.value).unwrap();
assert_eq!(
value.get("v").and_then(|v| v.as_u64()).unwrap_or(0),
u64::from(gateway::storage::SCHEMA_VERSION)
);
}
async fn build_two_replicas() -> (axum::Router, axum::Router, gateway::AppState) {
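// Two routers sharing one storage handle stand in for two replicas behind a
// load balancer with a shared session store; no network hop is involved.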
let metrics = gateway::observability::init_metrics_for_tests();
let routing = gateway::routing::RouterState::new(Arc::new(gateway::routing::FixedSource::new(
gateway::routing::RoutingConfig::empty(),
)))
.await
.unwrap();
let storage = gateway::storage::GatewayStorage::new_in_memory();
let authn = gateway::authn::AuthnConfig::for_tests();
let state = gateway::AppState {
metrics,
routing: routing.clone(),
storage: storage.clone(),
authn: authn.clone(),
};
let app1 = gateway::app(state.clone());
let app2 = gateway::app(gateway::AppState {
metrics: gateway::observability::init_metrics_for_tests(),
routing,
storage,
authn,
});
(app1, app2, state)
}