feat(billing): implement tenant subscription entitlements system (milestones 0-6)
Some checks failed
ci / ui (push) Failing after 28s
ci / rust (push) Failing after 2m40s
images / build-and-push (push) Failing after 19s

This commit is contained in:
2026-03-30 18:41:23 +03:00
parent 5992044b7e
commit 2595e7f1c5
63 changed files with 8448 additions and 321 deletions

View File

@@ -0,0 +1,45 @@
# Manually-triggered workflow that provisions (idempotently) and verifies
# the S3 docs bucket used by the control plane. Safe to re-run.
name: s3-provision
on:
  workflow_dispatch:
jobs:
  provision-docs-bucket:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@v4
      - name: Install AWS CLI
        run: |
          sudo apt-get update
          sudo apt-get install -y awscli
      # Fail fast with a clear error if any required secret is unset,
      # before attempting any call against the S3 endpoint.
      - name: Validate required secrets
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }}
          S3_ENDPOINT: ${{ secrets.S3_ENDPOINT }}
          S3_REGION: ${{ secrets.S3_REGION }}
          S3_BUCKET_DOCS: ${{ secrets.S3_BUCKET_DOCS }}
        run: |
          test -n "$AWS_ACCESS_KEY_ID"
          test -n "$AWS_SECRET_ACCESS_KEY"
          test -n "$S3_ENDPOINT"
          test -n "$S3_REGION"
          test -n "$S3_BUCKET_DOCS"
      # Create the bucket (no-op if it already exists), then verify
      # read/write access under the docs prefix.
      - name: Provision docs bucket (idempotent)
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }}
          S3_ENDPOINT: ${{ secrets.S3_ENDPOINT }}
          S3_REGION: ${{ secrets.S3_REGION }}
          S3_BUCKET_DOCS: ${{ secrets.S3_BUCKET_DOCS }}
          S3_ENABLE_VERSIONING: ${{ secrets.S3_ENABLE_VERSIONING }}
          S3_LIFECYCLE_JSON: docs/usage/s3_lifecycle_docs_default.json
          S3_PREFIX_DOCS: docs/
        run: |
          sh docker/scripts/s3_create_docs_bucket.sh
          sh docker/scripts/s3_verify_docs.sh

433
Cargo.lock generated
View File

@@ -13,13 +13,13 @@ name = "aggregate"
version = "0.1.0"
dependencies = [
"anyhow",
"async-nats",
"async-nats 0.39.0",
"axum 0.7.9",
"chrono",
"edge-logger-client",
"edge_storage",
"futures",
"lru",
"lru 0.12.5",
"prost 0.13.5",
"protoc-bin-vendored",
"query_engine",
@@ -150,8 +150,15 @@ checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
name = "api"
version = "0.1.0"
dependencies = [
"async-nats 0.42.0",
"async-trait",
"aws-config",
"aws-credential-types",
"aws-sdk-s3",
"axum 0.8.8",
"clap",
"futures",
"hex",
"jsonwebtoken",
"metrics 0.23.1",
"metrics-exporter-prometheus 0.16.2",
@@ -159,6 +166,7 @@ dependencies = [
"serde",
"serde_json",
"serde_yaml",
"sha2",
"shared",
"thiserror 2.0.18",
"tokio",
@@ -166,6 +174,8 @@ dependencies = [
"tower-http 0.6.8",
"tracing",
"tracing-subscriber",
"url",
"urlencoding",
"uuid",
]
@@ -229,6 +239,42 @@ dependencies = [
"url",
]
[[package]]
name = "async-nats"
version = "0.42.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08f6da6d49a956424ca4e28fe93656f790d748b469eaccbc7488fec545315180"
dependencies = [
"base64",
"bytes",
"futures",
"memchr",
"nkeys",
"nuid",
"once_cell",
"pin-project",
"portable-atomic",
"rand 0.8.5",
"regex",
"ring",
"rustls-native-certs 0.7.3",
"rustls-pemfile",
"rustls-webpki 0.102.8",
"serde",
"serde_json",
"serde_nanos",
"serde_repr",
"thiserror 1.0.69",
"time",
"tokio",
"tokio-rustls 0.26.4",
"tokio-util",
"tokio-websockets",
"tracing",
"tryhard",
"url",
]
[[package]]
name = "async-stream"
version = "0.3.6"
@@ -282,6 +328,8 @@ checksum = "11493b0bad143270fb8ad284a096dd529ba91924c5409adeac856cc1bf047dbc"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-sdk-sso",
"aws-sdk-ssooidc",
"aws-sdk-sts",
"aws-smithy-async",
"aws-smithy-http",
@@ -292,11 +340,14 @@ dependencies = [
"aws-types",
"bytes",
"fastrand",
"hex",
"http 1.4.0",
"sha1",
"time",
"tokio",
"tracing",
"url",
"zeroize",
]
[[package]]
@@ -342,6 +393,7 @@ dependencies = [
"aws-credential-types",
"aws-sigv4",
"aws-smithy-async",
"aws-smithy-eventstream",
"aws-smithy-http",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
@@ -350,7 +402,9 @@ dependencies = [
"bytes",
"bytes-utils",
"fastrand",
"http 0.2.12",
"http 1.4.0",
"http-body 0.4.6",
"http-body 1.0.1",
"percent-encoding",
"pin-project-lite",
@@ -358,6 +412,41 @@ dependencies = [
"uuid",
]
[[package]]
name = "aws-sdk-s3"
version = "1.127.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "151783f64e0dcddeb4965d08e36c276b4400a46caa88805a2e36d497deaf031a"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-sigv4",
"aws-smithy-async",
"aws-smithy-checksums",
"aws-smithy-eventstream",
"aws-smithy-http",
"aws-smithy-json",
"aws-smithy-observability",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
"aws-smithy-xml",
"aws-types",
"bytes",
"fastrand",
"hex",
"hmac",
"http 0.2.12",
"http 1.4.0",
"http-body 1.0.1",
"lru 0.16.3",
"percent-encoding",
"regex-lite",
"sha2",
"tracing",
"url",
]
[[package]]
name = "aws-sdk-sesv2"
version = "1.117.0"
@@ -382,6 +471,54 @@ dependencies = [
"tracing",
]
[[package]]
name = "aws-sdk-sso"
version = "1.97.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9aadc669e184501caaa6beafb28c6267fc1baef0810fb58f9b205485ca3f2567"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json",
"aws-smithy-observability",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
"aws-types",
"bytes",
"fastrand",
"http 0.2.12",
"http 1.4.0",
"regex-lite",
"tracing",
]
[[package]]
name = "aws-sdk-ssooidc"
version = "1.99.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1342a7db8f358d3de0aed2007a0b54e875458e39848d54cc1d46700b2bfcb0a8"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json",
"aws-smithy-observability",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
"aws-types",
"bytes",
"fastrand",
"http 0.2.12",
"http 1.4.0",
"regex-lite",
"tracing",
]
[[package]]
name = "aws-sdk-sts"
version = "1.101.0"
@@ -414,19 +551,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0b660013a6683ab23797778e21f1f854744fdf05f68204b4cca4c8c04b5d1f4"
dependencies = [
"aws-credential-types",
"aws-smithy-eventstream",
"aws-smithy-http",
"aws-smithy-runtime-api",
"aws-smithy-types",
"bytes",
"crypto-bigint 0.5.5",
"form_urlencoded",
"hex",
"hmac",
"http 0.2.12",
"http 1.4.0",
"p256",
"percent-encoding",
"ring",
"sha2",
"subtle",
"time",
"tracing",
"zeroize",
]
[[package]]
@@ -440,12 +583,45 @@ dependencies = [
"tokio",
]
[[package]]
name = "aws-smithy-checksums"
version = "0.64.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6750f3dd509b0694a4377f0293ed2f9630d710b1cebe281fa8bac8f099f88bc6"
dependencies = [
"aws-smithy-http",
"aws-smithy-types",
"bytes",
"crc-fast",
"hex",
"http 1.4.0",
"http-body 1.0.1",
"http-body-util",
"md-5",
"pin-project-lite",
"sha1",
"sha2",
"tracing",
]
[[package]]
name = "aws-smithy-eventstream"
version = "0.60.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "faf09d74e5e32f76b8762da505a3cd59303e367a664ca67295387baa8c1d7548"
dependencies = [
"aws-smithy-types",
"bytes",
"crc32fast",
]
[[package]]
name = "aws-smithy-http"
version = "0.63.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba1ab2dc1c2c3749ead27180d333c42f11be8b0e934058fb4b2258ee8dbe5231"
dependencies = [
"aws-smithy-eventstream",
"aws-smithy-runtime-api",
"aws-smithy-types",
"bytes",
@@ -473,13 +649,21 @@ dependencies = [
"h2 0.3.27",
"h2 0.4.13",
"http 0.2.12",
"http 1.4.0",
"http-body 0.4.6",
"hyper 0.14.32",
"hyper 1.8.1",
"hyper-rustls 0.24.2",
"hyper-rustls 0.27.7",
"hyper-util",
"pin-project-lite",
"rustls 0.21.12",
"rustls 0.23.37",
"rustls-native-certs 0.8.3",
"rustls-pki-types",
"tokio",
"tokio-rustls 0.26.4",
"tower 0.5.3",
"tracing",
]
@@ -562,6 +746,7 @@ dependencies = [
"base64-simd",
"bytes",
"bytes-utils",
"futures-core",
"http 0.2.12",
"http 1.4.0",
"http-body 0.4.6",
@@ -574,6 +759,8 @@ dependencies = [
"ryu",
"serde",
"time",
"tokio",
"tokio-util",
]
[[package]]
@@ -706,6 +893,12 @@ dependencies = [
"tracing",
]
[[package]]
name = "base16ct"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce"
[[package]]
name = "base32"
version = "0.5.1"
@@ -1157,6 +1350,33 @@ dependencies = [
"libc",
]
[[package]]
name = "crc"
version = "3.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675"
dependencies = [
"crc-catalog",
]
[[package]]
name = "crc-catalog"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
[[package]]
name = "crc-fast"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fd92aca2c6001b1bf5ba0ff84ee74ec8501b52bbef0cac80bf25a6c1d87a83d"
dependencies = [
"crc",
"digest",
"rustversion",
"spin",
]
[[package]]
name = "crc32fast"
version = "1.5.0"
@@ -1200,6 +1420,28 @@ version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crypto-bigint"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef2b4b23cddf68b89b8f8069890e8c270d54e2d5fe1b143820234805e4cb17ef"
dependencies = [
"generic-array",
"rand_core 0.6.4",
"subtle",
"zeroize",
]
[[package]]
name = "crypto-bigint"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76"
dependencies = [
"rand_core 0.6.4",
"subtle",
]
[[package]]
name = "crypto-common"
version = "0.1.7"
@@ -1242,6 +1484,16 @@ version = "2.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea"
[[package]]
name = "der"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de"
dependencies = [
"const-oid",
"zeroize",
]
[[package]]
name = "der"
version = "0.7.10"
@@ -1320,13 +1572,25 @@ version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555"
[[package]]
name = "ecdsa"
version = "0.14.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c"
dependencies = [
"der 0.6.1",
"elliptic-curve",
"rfc6979",
"signature 1.6.4",
]
[[package]]
name = "ed25519"
version = "2.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
dependencies = [
"signature",
"signature 2.2.0",
]
[[package]]
@@ -1338,7 +1602,7 @@ dependencies = [
"curve25519-dalek",
"ed25519",
"sha2",
"signature",
"signature 2.2.0",
"subtle",
]
@@ -1414,6 +1678,26 @@ version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
[[package]]
name = "elliptic-curve"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3"
dependencies = [
"base16ct",
"crypto-bigint 0.4.9",
"der 0.6.1",
"digest",
"ff",
"generic-array",
"group",
"pkcs8 0.9.0",
"rand_core 0.6.4",
"sec1",
"subtle",
"zeroize",
]
[[package]]
name = "email-encoding"
version = "0.4.1"
@@ -1477,6 +1761,16 @@ version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "ff"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160"
dependencies = [
"rand_core 0.6.4",
"subtle",
]
[[package]]
name = "fiat-crypto"
version = "0.2.9"
@@ -1524,6 +1818,12 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "foldhash"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
[[package]]
name = "foreign-types"
version = "0.3.2"
@@ -1674,7 +1974,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"argon2",
"async-nats",
"async-nats 0.39.0",
"async-trait",
"axum 0.7.9",
"base32",
@@ -1768,6 +2068,17 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
[[package]]
name = "group"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7"
dependencies = [
"ff",
"rand_core 0.6.4",
"subtle",
]
[[package]]
name = "gzip-header"
version = "1.0.0"
@@ -1841,7 +2152,7 @@ checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash",
"foldhash 0.1.5",
]
[[package]]
@@ -1849,6 +2160,11 @@ name = "hashbrown"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash 0.2.0",
]
[[package]]
name = "heck"
@@ -2532,6 +2848,15 @@ dependencies = [
"hashbrown 0.15.5",
]
[[package]]
name = "lru"
version = "0.16.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593"
dependencies = [
"hashbrown 0.16.1",
]
[[package]]
name = "lru-slab"
version = "0.1.2"
@@ -2559,6 +2884,16 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
[[package]]
name = "md-5"
version = "0.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf"
dependencies = [
"cfg-if",
"digest",
]
[[package]]
name = "mdbx-sys"
version = "13.11.0"
@@ -3123,6 +3458,17 @@ dependencies = [
"unicode-id-start",
]
[[package]]
name = "p256"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594"
dependencies = [
"ecdsa",
"elliptic-curve",
"sha2",
]
[[package]]
name = "parking_lot"
version = "0.12.5"
@@ -3300,14 +3646,24 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pkcs8"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba"
dependencies = [
"der 0.6.1",
"spki 0.6.0",
]
[[package]]
name = "pkcs8"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
dependencies = [
"der",
"spki",
"der 0.7.10",
"spki 0.7.3",
]
[[package]]
@@ -3379,7 +3735,7 @@ name = "projection"
version = "0.1.0"
dependencies = [
"anyhow",
"async-nats",
"async-nats 0.39.0",
"axum 0.7.9",
"chrono",
"edge-logger-client",
@@ -3937,6 +4293,17 @@ dependencies = [
"webpki-roots 1.0.6",
]
[[package]]
name = "rfc6979"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb"
dependencies = [
"crypto-bigint 0.4.9",
"hmac",
"zeroize",
]
[[package]]
name = "ring"
version = "0.17.14"
@@ -3985,7 +4352,7 @@ name = "runner"
version = "0.1.0"
dependencies = [
"anyhow",
"async-nats",
"async-nats 0.39.0",
"aws-config",
"aws-sdk-sesv2",
"axum 0.7.9",
@@ -4306,6 +4673,20 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "sec1"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928"
dependencies = [
"base16ct",
"der 0.6.1",
"generic-array",
"pkcs8 0.9.0",
"subtle",
"zeroize",
]
[[package]]
name = "security-framework"
version = "2.11.1"
@@ -4535,12 +4916,22 @@ version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31"
dependencies = [
"pkcs8",
"pkcs8 0.10.2",
"rand_core 0.6.4",
"signature",
"signature 2.2.0",
"zeroize",
]
[[package]]
name = "signature"
version = "1.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c"
dependencies = [
"digest",
"rand_core 0.6.4",
]
[[package]]
name = "signature"
version = "2.2.0"
@@ -4625,6 +5016,22 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "spin"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591"
[[package]]
name = "spki"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b"
dependencies = [
"base64ct",
"der 0.6.1",
]
[[package]]
name = "spki"
version = "0.7.3"
@@ -4632,7 +5039,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d"
dependencies = [
"base64ct",
"der",
"der 0.7.10",
]
[[package]]

View File

@@ -11,8 +11,15 @@ docker compose down -v
To include the observability stack (Grafana/Loki/Tempo/VictoriaMetrics) with the local compose:
```bash
docker compose -f docker-compose.yml -f observability/docker-compose.yml up -d --build
docker compose -f docker-compose.yml -f observability/docker-compose.yml down -v
docker compose --profile observability up -d --build
docker compose --profile observability down -v
```
To use S3-compatible object storage (MinIO) for Loki + Tempo locally:
```bash
docker compose -f docker-compose.yml -f observability/docker-compose.s3.yml --profile observability up -d --build
docker compose -f docker-compose.yml -f observability/docker-compose.s3.yml --profile observability down -v
```
Service ports in the default compose:
@@ -23,8 +30,20 @@ Service ports in the default compose:
- Runner HTTP: `http://localhost:28080`
- Control API: `http://localhost:38080`
- Control UI: `http://localhost:8082`
- MailHog SMTP: `smtp://localhost:1025`
- MailHog UI: `http://localhost:8025`
- MinIO S3 API: `http://localhost:9000`
- MinIO console: `http://localhost:9001`
- NATS: `nats://localhost:4222`, monitoring `http://localhost:8222`
MinIO defaults:
- Credentials: `minioadmin` / `minioadmin`
- Buckets: `cloudlysis-docs-0,cloudlysis-docs-1,cloudlysis-docs-2` (comma-separated docs bucket shard set; keep the full set stable to avoid remapping tenants)
Email defaults (local):
- Runner uses SMTP backend via `RUNNER_SMTP_URL=smtp://mailhog:1025`
- Inspect emails at MailHog UI `http://localhost:8025`
## Swarm (Dev)
Build images:
@@ -56,6 +75,10 @@ Create dev secrets required by the observability stack:
sh docker/scripts/swarm_dev_secrets.sh
```
This also creates dev secrets used by the control plane for S3 document storage:
- `control_s3_access_key_id`
- `control_s3_secret_access_key`
Deploy:
```bash
@@ -66,6 +89,60 @@ docker stack deploy -c swarm/stacks/control-plane.yml cloudlysis_control
docker stack deploy -c swarm/stacks/observability.yml cloudlysis_obs
```
Production-style control plane (no MinIO in stack; S3 is external):
```bash
# create secrets (set CONTROL_S3_ACCESS_KEY_ID / CONTROL_S3_SECRET_ACCESS_KEY first)
sh docker/scripts/swarm_dev_secrets.sh
# required env for the stack
export CONTROL_S3_ENDPOINT="https://<hetzner-endpoint>"
export CONTROL_S3_REGION="eu-central-1"
export CONTROL_S3_BUCKET_DOCS="cloudlysis-docs"
docker stack deploy -c swarm/stacks/control-plane-prod.yml cloudlysis_control
```
Verify production S3 bucket/prefix permissions with AWS CLI (env-gated):
```bash
# install aws cli v2, then export creds and target
export S3_ENDPOINT="https://<hetzner-endpoint>"
export S3_REGION="eu-central-1"
export S3_BUCKET_DOCS="cloudlysis-docs"
export S3_PREFIX_DOCS="docs/"
# optionally set S3_FORCE_PATH_STYLE=true for some S3-compatible endpoints
sh docker/scripts/s3_verify_docs.sh
```
Create/provision the docs bucket (idempotent; CI/CD-friendly):
```bash
export S3_ENDPOINT="https://<hetzner-endpoint>"
export S3_REGION="eu-central-1"
export S3_BUCKET_DOCS="cloudlysis-docs"
# optional
# export S3_ENABLE_VERSIONING=true
sh docker/scripts/s3_create_docs_bucket.sh
```
Apply a lifecycle policy to the docs bucket (operator; automated):
```bash
export S3_ENDPOINT="https://<hetzner-endpoint>"
export S3_REGION="eu-central-1"
export S3_BUCKET_DOCS="cloudlysis-docs"
# optional: provide your own lifecycle JSON file
# export S3_LIFECYCLE_JSON="path/to/lifecycle.json"
sh docker/scripts/s3_apply_lifecycle_docs.sh
```
Remove:
```bash

View File

@@ -1,38 +1,55 @@
# cloudlysis (monorepo)
# Cloudlysis (monorepo)
## Layout
- Rust services (Cargo workspace): `aggregate/`, `gateway/`, `projection/`, `runner/`, `control/api/`, `shared/`
- Control UI: `control/ui/`
- Docker + Swarm + Compose: `docker/`, `docker-compose.yml`, `swarm/`, `observability/`
Production-oriented, multi-service Rust workspace with an operator-facing Control Plane (API + Admin UI), S3-backed document storage, and an optional observability stack for local parity.
## Documentation
- docs/README.md
- Architecture: docs/architecture/overview.md, docs/architecture/transport.md
- Developer: docs/developer/setup.md, docs/developer/testing.md
- Usage: docs/usage/quickstart.md, docs/usage/api.md, docs/usage/nats.md
- Gitea Wiki: run `scripts/publish_gitea_wiki.sh` (publishes `wiki/` to the repo wiki)
## Quickstart (local dev)
## Quick Start (Docker Compose)
Core stack (includes MinIO + MailHog + Control Plane):
```bash
docker compose up -d --build
```
Full local stack with observability:
Full local stack with observability (Grafana/Loki/Tempo/VictoriaMetrics):
```bash
docker compose -f docker-compose.yml -f observability/docker-compose.yml up -d --build
docker compose --profile observability up -d --build
```
## Commands
- `make compose-up`, `make compose-down`
- `make compose-up-observability`, `make compose-down-observability`
- `make docker-build-all`
- `make swarm-deploy-all`, `make swarm-rm-all`
Full local stack + Loki/Tempo using MinIO (S3 mode):
More details: `DOCKER.md`
```bash
docker compose -f docker-compose.yml -f observability/docker-compose.s3.yml --profile observability up -d --build
```
## Workspace Verification
## Local endpoints
- **Control UI**: `http://localhost:8082`
- **Control API**: `http://localhost:38080`
- **Grafana** (observability profile): `http://localhost:3000`
- **MailHog UI**: `http://localhost:8025` (SMTP on `localhost:1025`)
- **MinIO console**: `http://localhost:9001` (S3 API on `localhost:9000`)
## Repository layout (high level)
- **Rust services (Cargo workspace)**: `aggregate/`, `gateway/`, `projection/`, `runner/`, `control/api/`, `shared/`
- **Admin UI**: `control/ui/`
- **Docker / Swarm / Compose**: `docker/`, `docker-compose.yml`, `swarm/`, `observability/`
## Production (overview)
- **Control plane Swarm stack**: `swarm/stacks/control-plane-prod.yml`
- **S3 docs buckets**: `CONTROL_S3_BUCKET_DOCS` supports a comma-separated shard set (e.g. `cloudlysis-docs-0,cloudlysis-docs-1,cloudlysis-docs-2`). Bucket selection is deterministic per-tenant; keep the full shard set stable to avoid remapping tenants.
- **S3 provisioning helpers** (idempotent scripts; CI/CD friendly):
- `docker/scripts/s3_create_docs_bucket.sh`
- `docker/scripts/s3_apply_lifecycle_docs.sh`
- `docker/scripts/s3_verify_docs.sh`
- Gitea Actions workflow: `.gitea/workflows/s3-provision.yml`
## Docs
- **Docker / local dev / Swarm**: `DOCKER.md`
- **Developer docs**: `docs/developer/setup.md`, `docs/developer/testing.md`
- **Architecture**: `docs/architecture/overview.md`, `docs/architecture/transport.md`
- **Usage**: `docs/usage/quickstart.md`, `docs/usage/api.md`, `docs/usage/nats.md`
## Workspace verification
```bash
cargo fmt --check

View File

@@ -1,187 +0,0 @@
# S3-Compatible Object Storage Plan (Hetzner in Prod, MinIO Locally)
## Principles
- S3-compatible object storage is mandatory for platform document storage in every environment:
- Local development uses MinIO.
- Production uses Hetzner Object Storage (S3 API compatible).
- Each milestone is stop-the-line gated:
- All tasks completed
- All milestone tests pass
- Workspace verification commands pass
- Secrets are never committed and never logged:
- Access keys via Swarm secrets in production
- `.env` or compose env in local dev
## Goals
- Introduce a single, shared S3-compatible configuration surface for the platform.
- Make document storage always backed by S3 (no filesystem fallback for documents).
- Keep the implementation incremental and test-gated per milestone.
- Optionally expand to observability object storage after document storage is stable.
## Definitions
### Document Storage
“Documents” are versioned blobs the platform needs to store and retrieve reliably:
- Deployment bundles and artifacts
- Definitions/manifests (projection programs, saga/effects definitions, schema bundles)
- Exported audit/log bundles, diagnostics, or snapshots that are not part of the primary KV/MDBX state
Document storage must support:
- Tenant-scoped namespaces (prefixes)
- Content-addressed or versioned keys (immutability preferred)
- Listing by prefix for admin workflows
## Configuration Contract (Platform-Wide)
### Common Settings
- `S3_ENDPOINT` (Hetzner: HTTPS endpoint; MinIO: `http://minio:9000`)
- `S3_REGION` (required by the AWS SDK/CLI even with some S3-compatible providers that ignore it)
- `S3_ACCESS_KEY_ID` (secret)
- `S3_SECRET_ACCESS_KEY` (secret)
- `S3_FORCE_PATH_STYLE` (`true/false`)
- `S3_INSECURE` (`true/false`, only allowed for local MinIO)
### Buckets and Prefixes
- `S3_BUCKET_DOCS` (required everywhere)
- `S3_PREFIX_DOCS` (default `docs/`)
Optional (later milestones):
- `S3_BUCKET_LOKI`, `S3_PREFIX_LOKI`
- `S3_BUCKET_TEMPO`, `S3_PREFIX_TEMPO`
## Target Architecture
### Local Development
- MinIO is part of the local stack for parity.
- Control API is the document gateway:
- Upload/download via signed URLs or streamed proxy endpoints
- Metadata stored in existing storage/KV (document index) or derived from key scheme
### Production
- Hetzner Object Storage provides S3-compatible bucket(s).
- Credentials and bucket details injected via Swarm secrets and stack env.
## Development Plan (Milestones by Dependency)
## Milestone 0: S3 Contract + Local MinIO Baseline
### Dependencies
- None
### Goal
Provide a consistent local S3-compatible endpoint and stable bucket naming to unblock higher milestones.
### Tasks
- [ ] Add MinIO to local development stack:
- [ ] Add `minio` service to compose (API + console)
- [ ] Add `minio-init` job to create required buckets
- [ ] Define standard bucket/prefix defaults for local dev:
- [ ] `S3_BUCKET_DOCS=cloudlysis-docs`
- [ ] `S3_PREFIX_DOCS=docs/`
- [ ] Document local workflow to enable MinIO-backed document storage.
### Required Tests (Gate)
- [ ] Workspace verification commands
- [ ] Local manual verification checklist:
- [ ] `cloudlysis-docs` bucket exists
- [ ] credentials work from a container in the compose network
## Milestone 1: Document Storage API (Control API)
### Dependencies
- Milestone 0
### Goal
Make document storage a first-class platform API and require it in all environments.
### Tasks
- [ ] Add an S3 client module to Control API:
- [ ] parse config from env with strict validation (endpoint, bucket, keys)
- [ ] support path-style and TLS/insecure options
- [ ] Implement document primitives:
- [ ] Put (upload) and Get (download)
- [ ] List by prefix (tenant + doc-type)
- [ ] Delete (admin-only) if needed
- [ ] Decide and document a key scheme:
- [ ] tenant-scoped prefix
- [ ] immutable keys preferred (content hash + metadata)
- [ ] Add authz rules for document operations (deny-by-default, tenant-scoped).
### Required Tests (Gate)
- [ ] Workspace verification commands
- [ ] Unit tests:
- [ ] config parsing/validation
- [ ] key generation stability
- [ ] Gated integration tests (MinIO):
- [ ] put/get roundtrip
- [ ] list by prefix
- [ ] tenant isolation (cannot read other tenant prefix)
## Milestone 2: Control UI Integration (Upload/Download Flows)
### Dependencies
- Milestone 1
### Goal
Make document workflows usable from the Control UI without leaking credentials.
### Tasks
- [ ] Add Control API endpoints for signed URLs (recommended) or streamed proxy:
- [ ] create upload URL (PUT)
- [ ] create download URL (GET)
- [ ] Implement Control UI flows for a first document type:
- [ ] upload
- [ ] list
- [ ] download
- [ ] Ensure correlation/trace propagation on Control API operations.
### Required Tests (Gate)
- [ ] Workspace verification commands
- [ ] Control UI unit tests for routing/component render stability
- [ ] Gated end-to-end checklist (local):
- [ ] upload appears in list
- [ ] download returns expected bytes
## Milestone 3: Production Rollout (Hetzner)
### Dependencies
- Milestone 2
### Goal
Deploy document storage on Hetzner S3-compatible backend with production-grade secret handling.
### Tasks
- [ ] Provision buckets and lifecycle policies (docs bucket):
- [ ] retention rules appropriate to documents
- [ ] access policy scoped to required actions
- [ ] Swarm deployment:
- [ ] add secrets for access keys
- [ ] configure Control API with endpoint/region/bucket/prefix
- [ ] Rollback plan:
- [ ] switch to a fallback bucket or MinIO-on-prod if needed
### Required Tests (Gate)
- [ ] Workspace verification commands
- [ ] Production smoke runbook:
- [ ] upload/list/download for a tenant
- [ ] verify objects exist under expected prefixes
## Milestone 4 (Optional): Observability Storage on S3 (Loki + Tempo)
### Dependencies
- Milestone 3
### Goal
Store logs and traces in S3-compatible storage (MinIO locally; Hetzner in production).
### Tasks
- [ ] Loki:
- [ ] add S3 config variant and compose overlay
- [ ] validate log query and bucket objects
- [ ] Tempo:
- [ ] add S3 config variant and compose overlay
- [ ] validate traces and bucket objects
### Required Tests (Gate)
- [ ] Workspace verification commands
- [ ] Gated local validation:
- [ ] Loki writes objects to bucket/prefix after ingest
- [ ] Tempo writes objects to bucket/prefix after ingest
## Workspace Verification Commands
- `cargo fmt --check`
- `cargo clippy --workspace --all-targets -- -D warnings`
- `cargo test --workspace`
- `cd control/ui && npm ci && npm run lint && npm run typecheck && npm run test && npm run build`

View File

@@ -1,6 +1,7 @@
use serde_json::Value as JsonValue;
use std::time::Duration;
#[allow(unreachable_code)]
pub async fn execute_decide_program(
state: &JsonValue,
command: &JsonValue,
@@ -28,6 +29,7 @@ pub async fn execute_decide_program(
}
}
#[allow(unreachable_code)]
pub async fn execute_apply_program(
state: &JsonValue,
event: &JsonValue,
@@ -60,11 +62,10 @@ async fn execute_decide_v8(
state: &JsonValue,
command: &JsonValue,
program: &str,
gas_limit: u64,
_gas_limit: u64,
timeout: Duration,
) -> Result<Vec<JsonValue>, crate::types::AggregateError> {
use std::sync::Arc;
use v8::{Array, Context, Function, HandleScope, Isolate, Object, Scope, Script};
use v8::{Context, ContextScope, Function, HandleScope, Isolate, Script};
let state_str = serde_json::to_string(state).map_err(|e| {
crate::types::AggregateError::DecideError(format!("State serialization: {}", e))
@@ -73,47 +74,45 @@ async fn execute_decide_v8(
crate::types::AggregateError::DecideError(format!("Command serialization: {}", e))
})?;
let program_owned = program.to_string();
let result = tokio::task::spawn_blocking(move || {
let isolate = &mut Isolate::new(v8::CreateParams::default());
let scope = &mut HandleScope::new(isolate);
let context = Context::new(scope);
let context = Context::new(scope, v8::ContextOptions::default());
let scope = &mut ContextScope::new(scope, context);
let source =
v8::String::new(scope, program).ok_or_else(|| "Failed to create program string")?;
v8::String::new(scope, &program_owned).ok_or("Failed to create program string")?;
let script =
Script::compile(scope, source, None).ok_or_else(|| "Failed to compile program")?;
let script = Script::compile(scope, source, None).ok_or("Failed to compile program")?;
script.run(scope).ok_or_else(|| "Failed to run program")?;
script.run(scope).ok_or("Failed to run program")?;
let global = context.global(scope);
let decide_name =
v8::String::new(scope, "decide").ok_or_else(|| "Failed to create decide string")?;
v8::String::new(scope, "decide").ok_or("Failed to create decide string")?;
let decide_fn = global
.get(scope, decide_name.into())
.and_then(|v| v8::Local::<Function>::try_from(v).ok())
.ok_or_else(|| "decide function not found")?;
.ok_or("decide function not found")?;
let state_json = v8::String::new(scope, &state_str)
.ok_or_else(|| "Failed to create state JSON string")?;
let state_obj =
v8::json::parse(scope, state_json).ok_or_else(|| "Failed to parse state JSON")?;
let state_json =
v8::String::new(scope, &state_str).ok_or("Failed to create state JSON string")?;
let state_obj = v8::json::parse(scope, state_json).ok_or("Failed to parse state JSON")?;
let command_json = v8::String::new(scope, &command_str)
.ok_or_else(|| "Failed to create command JSON string")?;
let command_json =
v8::String::new(scope, &command_str).ok_or("Failed to create command JSON string")?;
let command_obj =
v8::json::parse(scope, command_json).ok_or_else(|| "Failed to parse command JSON")?;
v8::json::parse(scope, command_json).ok_or("Failed to parse command JSON")?;
let args: [v8::Local<v8::Value>; 2] = [state_obj.into(), command_obj.into()];
let args: [v8::Local<v8::Value>; 2] = [state_obj, command_obj];
let result = decide_fn
.call(scope, global.into(), &args)
.ok_or_else(|| "decide function call failed")?;
.ok_or("decide function call failed")?;
let result_json =
v8::json::stringify(scope, result).ok_or_else(|| "Failed to stringify result")?;
let result_json = v8::json::stringify(scope, result).ok_or("Failed to stringify result")?;
let result_str = result_json.to_rust_string_lossy(scope);
let events: Vec<JsonValue> = serde_json::from_str(&result_str)
@@ -155,47 +154,43 @@ async fn execute_apply_v8(
let _ = gas_limit;
let program_owned = program.to_string();
let result = tokio::task::spawn_blocking(move || {
let isolate = &mut Isolate::new(v8::CreateParams::default());
let scope = &mut HandleScope::new(isolate);
let context = Context::new(scope);
let context = Context::new(scope, v8::ContextOptions::default());
let scope = &mut ContextScope::new(scope, context);
let source =
v8::String::new(scope, program).ok_or_else(|| "Failed to create program string")?;
v8::String::new(scope, &program_owned).ok_or("Failed to create program string")?;
let script =
Script::compile(scope, source, None).ok_or_else(|| "Failed to compile program")?;
let script = Script::compile(scope, source, None).ok_or("Failed to compile program")?;
script.run(scope).ok_or_else(|| "Failed to run program")?;
script.run(scope).ok_or("Failed to run program")?;
let global = context.global(scope);
let apply_name =
v8::String::new(scope, "apply").ok_or_else(|| "Failed to create apply string")?;
let apply_name = v8::String::new(scope, "apply").ok_or("Failed to create apply string")?;
let apply_fn = global
.get(scope, apply_name.into())
.and_then(|v| v8::Local::<Function>::try_from(v).ok())
.ok_or_else(|| "apply function not found")?;
.ok_or("apply function not found")?;
let state_json = v8::String::new(scope, &state_str)
.ok_or_else(|| "Failed to create state JSON string")?;
let state_obj =
v8::json::parse(scope, state_json).ok_or_else(|| "Failed to parse state JSON")?;
let state_json =
v8::String::new(scope, &state_str).ok_or("Failed to create state JSON string")?;
let state_obj = v8::json::parse(scope, state_json).ok_or("Failed to parse state JSON")?;
let event_json = v8::String::new(scope, &event_str)
.ok_or_else(|| "Failed to create event JSON string")?;
let event_obj =
v8::json::parse(scope, event_json).ok_or_else(|| "Failed to parse event JSON")?;
let event_json =
v8::String::new(scope, &event_str).ok_or("Failed to create event JSON string")?;
let event_obj = v8::json::parse(scope, event_json).ok_or("Failed to parse event JSON")?;
let args: [v8::Local<v8::Value>; 2] = [state_obj.into(), event_obj.into()];
let args: [v8::Local<v8::Value>; 2] = [state_obj, event_obj];
let result = apply_fn
.call(scope, global.into(), &args)
.ok_or_else(|| "apply function call failed")?;
.ok_or("apply function call failed")?;
let result_json =
v8::json::stringify(scope, result).ok_or_else(|| "Failed to stringify result")?;
let result_json = v8::json::stringify(scope, result).ok_or("Failed to stringify result")?;
let result_str = result_json.to_rust_string_lossy(scope);
let new_state: JsonValue = serde_json::from_str(&result_str)
@@ -250,6 +245,7 @@ async fn execute_apply_wasm(
mod tests {
use super::*;
use serde_json::json;
use std::time::Duration;
#[tokio::test]
async fn no_runtime_returns_error() {
@@ -257,7 +253,7 @@ mod tests {
{
let state = json!({});
let command = json!({});
let result =
let result: Result<Vec<JsonValue>, crate::types::AggregateError> =
execute_decide_program(&state, &command, "program", 1000, Duration::from_secs(1))
.await;
assert!(result.is_err());

View File

@@ -339,3 +339,119 @@ This plan is intentionally aligned with the style and gating discipline used in
- verify Grafana dashboards provisioned and VictoriaMetrics receives samples
- [x] **T7.3** End-to-end “control plane can see the fleet” test (requires docker)
- UI/API can query placement + health snapshots for all services
---
## Milestone 8: Config Registry + Safe Change Management (Plan/Apply/Rollback)
**Goal:** Make configuration first-class, versioned, validated, and safely mutable from the control plane, while keeping production and development sources consistent.
### Dependencies
- Milestone 2 (Control Plane API foundation)
- Milestone 5 (safe mutations baseline)
- Milestone 7 (Swarm deployment baseline)
### Exit Criteria
- Operators can list, view, validate, and safely apply config changes with audit + idempotent jobs
- Config changes have revision semantics and are roll-backable
- Gatekeeper safety checks prevent applying invalid or unsafe configs
### Tasks
- [x] **8.1** Inventory and classify configuration surfaces (platform-wide)
- classify as: static boot config (env/secrets), dynamic runtime config (KV), large immutable artifacts (S3/docs)
- map current sources per domain:
- Gateway routing config (`config/routing/dev.json` / production KV)
- Placement config (`config/placement/dev.json` / production KV)
- Runner definitions (effects/sagas) (documents/S3) and activation config (KV)
- Observability provisioning (Swarm configs + repo-managed assets)
- Control plane feature flags (KV)
- [~] **8.2** Define a Config Registry contract in the Control API
- **Implemented (initial)**:
- config identity: `{domain}` (routing|placement)
- metadata: `revision` (KV revision when using NATS), and `source` info (file vs nats)
- storage policy per config: `source=dev_file | nats_kv`
- **Still needed**:
- `{domain, name, scope}` and richer metadata (`updated_at`, `updated_by`, `sha256`)
- history API for KV-backed configs
- [x] **8.3** Implement config storage abstraction (dev + prod)
- dev: file-backed, atomic write (tmp + rename), hot-reload where applicable
- prod: NATS KV for dynamic configs (revisioned values + watch streams)
- consistent error model: decode/validate/source errors are distinguishable and safe
- [x] **8.4** Add read-only config APIs
- `GET /admin/v1/config` list domains
- `GET /admin/v1/config/{domain}` fetch current value + revision + source
- (history not implemented yet)
- [~] **8.5** Add validate/plan/apply/rollback mutation workflows as jobs
- **Implemented**:
- `POST /admin/v1/jobs/config/validate` (job, idempotency key required)
- `POST /admin/v1/jobs/config/apply` (job, idempotency key required, backup + apply)
- `POST /admin/v1/jobs/config/rollback` (job, idempotency key required, restore last backup)
- per-domain locking to avoid concurrent config mutations
- **Still needed**:
- `POST /admin/v1/plan/config/apply` deterministic plan (diff + impacted services)
- richer post-conditions (routing resolution sampling, fleet consistency checks, etc.)
- [~] **8.6** Implement initial config domains end-to-end
- **Gateway routing config**:
- implemented: schema validation via JSON decode
- still needed: semantic validation (tenant entries/shard directories/endpoints URL parsing) + sampled routing verification
- **Placement config**:
- implemented: schema validation via JSON decode
- still needed: semantic validation (targets non-empty, etc.) + fleet snapshot consistency checks
- [x] **8.7** Implement Admin UI “Config” page for safe operations
- list + view configs with revision/sha/audit linkage
- editor for JSON (and YAML when supported by the domain)
- validate button (server-side) and apply/rollback flows as jobs with reason required
### Tests
- [x] **T8.1** Unit tests: config decode/encode stability for each config domain
- routing/placement decode is enforced by server-side validate job (schema-level)
- [ ] **T8.2** Unit tests: validation rejects unsafe configs with stable error codes/messages
- [ ] **T8.3** Unit tests: plan generation is deterministic for same inputs
- [x] **T8.4** Integration tests (env-gated):
- NATS KV config apply + rollback via Control API (requires `CONTROL_TEST_NATS=1` + `CONTROL_TEST_NATS_URL`)
- (Gateway route-resolution E2E verification still pending)
- [x] **T8.5** UI tests: config page renders, validate/apply/rollback flows navigate to job progress
---
## Milestone 9: Control Node Management (Inventory, Drift, and Safer Ops)
**Goal:** Improve how the control plane understands and manages the live control node and platform state: node inventory, config drift detection, and safer operational guardrails.
### Dependencies
- Milestone 7 (Swarm deployment baseline)
- Milestone 8 (config registry + safe change management)
### Exit Criteria
- Control plane provides a reliable “what is running vs what should be running” view
- Config drift is detectable and actionable
- Core operational actions are guarded by preflight checks and produce audit trails
### Tasks
- [x] **9.1** Define a “desired vs observed” model for platform state
- desired: Swarm stacks + config registry revisions
- observed: live service/task state + effective runtime configs
- drift categories: missing, extra, version mismatch, config mismatch, unhealthy
- [~] **9.2** Improve Swarm observation fidelity
- implemented (initial): docker-cli-backed Swarm observation (`CONTROL_SWARM_MODE=docker`)
- still needed: direct Docker API client (avoid shelling out), richer normalization, and wiring into production stacks
- keep file source as a dev fallback for deterministic tests
- normalize service identity: `{service, image_tag, git_sha, updated_at}`
- [x] **9.3** Add drift APIs and UI views
- `GET /admin/v1/platform/drift` returns drift summary + actionable items
- UI: “Platform Drift” page with filters and links to remediate jobs
- [ ] **9.4** Add safer operational guardrails as reusable checks
- preflight checks for:
- service unhealthy / crashloop
- tenant migration safety thresholds (lag/inflight)
- config apply safety (impact radius, sampled verify)
- consistent failure modes: clear reason + audit entry, no partial side effects
- [ ] **9.5** Add operational playbooks as executable checks
- post-deploy verification suite callable as an idempotent job
- rollback verification suite callable as an idempotent job
### Tests
- [x] **T9.1** Unit tests: drift classification for synthetic desired/observed fixtures
- [x] **T9.2** Integration tests (docker-gated): drift view detects intentional mismatches in a local Swarm
- requires `CONTROL_TEST_DOCKER=1` and an active local Swarm node
- [x] **T9.3** UI tests: drift page renders in route smoke test

View File

@@ -5,22 +5,32 @@ edition = "2024"
publish = ["madapes"]
[dependencies]
async-nats = "0.42.0"
async-trait = "0.1.89"
axum = "0.8.6"
aws-config = { version = "1.8.6", features = ["behavior-version-latest"] }
aws-credential-types = "1.2.6"
aws-sdk-s3 = "1.106.0"
clap = { version = "4.5.48", features = ["derive", "env"] }
futures = "0.3.31"
jsonwebtoken = "9.3.1"
metrics = "0.23.0"
metrics-exporter-prometheus = "0.16.0"
reqwest = { version = "0.12.23", default-features = false, features = ["json", "rustls-tls"] }
serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.149"
sha2 = "0.10.9"
hex = "0.4.3"
shared = { path = "../../shared" }
thiserror = "2.0.16"
tokio = { version = "1.45.0", features = ["macros", "net", "process", "rt-multi-thread", "signal", "time"] }
tower-http = { version = "0.6.6", features = ["trace"] }
tracing = "0.1.41"
tracing-subscriber = { version = "0.3.20", features = ["env-filter"] }
url = "2.5.4"
uuid = { version = "1.18.1", features = ["serde", "v4"] }
[dev-dependencies]
serde_yaml = "0.9.34"
tower = "0.5.2"
urlencoding = "2.1.3"

View File

@@ -1,7 +1,9 @@
use crate::{
AppState, RequestIds,
auth::{Principal, has_permission},
fleet,
config_registry::{ConfigDomain, ConfigRegistryError},
config_schemas::RoutingConfig,
drift, fleet,
job_engine::{JobEngine, StartJobError},
jobs::{Job, JobStatus, JobStep},
placement::{PlacementResponse, ServiceKind},
@@ -15,7 +17,9 @@ use axum::{
routing::{get, post},
};
use serde::Deserialize;
use sha2::Digest;
use std::time::{SystemTime, UNIX_EPOCH};
use url::Url;
use uuid::Uuid;
const HEADER_IDEMPOTENCY_KEY: &str = "idempotency-key";
@@ -25,21 +29,125 @@ pub fn admin_router() -> Router<AppState> {
Router::new()
.route("/whoami", get(whoami))
.route("/platform/info", get(platform_info))
.route("/platform/drift", get(platform_drift))
.route("/fleet/snapshot", get(fleet_snapshot))
.route("/tenants", get(list_tenants))
.route("/placement/{kind}", get(get_placement))
.route("/config", get(list_config))
.route("/config/{domain}", get(get_config))
.route("/config/{domain}/history", get(get_config_history))
.route("/jobs/platform/verify", post(start_platform_verify))
.route("/jobs/config/validate", post(start_config_validate))
.route("/jobs/config/apply", post(start_config_apply))
.route("/jobs/config/rollback", post(start_config_rollback))
.route("/tenants/echo", get(tenant_echo))
.route(
"/tenants/{tenant_id}/billing",
get(crate::billing::get_billing),
)
.route(
"/tenants/{tenant_id}/billing/checkout",
post(crate::billing::checkout),
)
.route(
"/tenants/{tenant_id}/billing/portal",
post(crate::billing::portal),
)
.route("/jobs/echo", post(create_echo_job))
.route("/jobs/{job_id}", get(get_job))
.route("/jobs/{job_id}/cancel", post(cancel_job))
.route("/jobs/tenant/drain", post(start_tenant_drain))
.route("/jobs/tenant/migrate", post(start_tenant_migrate))
.route("/plan/tenant/migrate", post(plan_tenant_migrate))
.route("/plan/config/apply", post(plan_config_apply))
.route("/audit", get(list_audit))
.route("/swarm/services", get(list_swarm_services))
.route("/swarm/services/{name}/tasks", get(list_swarm_tasks))
}
/// Request body for `POST /admin/v1/jobs/platform/verify`.
#[derive(Debug, Deserialize)]
struct PlatformVerifyRequest {
    // Free-form operator reason, forwarded to the job engine.
    reason: String,
}
/// `POST /admin/v1/jobs/platform/verify` — starts the platform verification
/// suite as an idempotent job.
///
/// Requires `control:write` and a non-empty `idempotency-key` header;
/// lock contention is reported as 409 CONFLICT.
async fn start_platform_verify(
    State(state): State<AppState>,
    headers: HeaderMap,
    Extension(principal): Extension<Principal>,
    Json(body): Json<PlatformVerifyRequest>,
) -> impl IntoResponse {
    // Mutating endpoint: write permission is mandatory.
    if !has_permission(&principal, "control:write") {
        return StatusCode::FORBIDDEN.into_response();
    }
    // The idempotency key must be present, ASCII-decodable, and non-empty.
    let idem_key = match headers
        .get(HEADER_IDEMPOTENCY_KEY)
        .and_then(|v| v.to_str().ok())
    {
        Some(k) if !k.is_empty() => k,
        _ => return StatusCode::BAD_REQUEST.into_response(),
    };
    let engine = JobEngine::new(
        state.jobs.clone(),
        state.audit.clone(),
        state.tenant_locks.clone(),
        state.config_locks.clone(),
    );
    match engine.start_platform_verify(state.clone(), &principal, body.reason, idem_key) {
        Ok(job_id) => (
            StatusCode::OK,
            Json(serde_json::json!({ "job_id": job_id })),
        )
            .into_response(),
        Err(StartJobError::TenantLocked) => StatusCode::CONFLICT.into_response(),
    }
}
/// `GET /admin/v1/config/{domain}/history` — returns up to 50 historical
/// revisions of a config domain, newest entries as stored by the source.
///
/// Each item carries the KV revision, a sha256 of the raw bytes, and the
/// decoded JSON value; entries that fail JSON decoding are silently skipped.
async fn get_config_history(
    State(state): State<AppState>,
    Path(domain): Path<String>,
    Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:read") {
        return StatusCode::FORBIDDEN.into_response();
    }
    // Only "routing" and "placement" are recognized domains.
    let domain = match domain.as_str() {
        "routing" => ConfigDomain::Routing,
        "placement" => ConfigDomain::Placement,
        _ => return StatusCode::NOT_FOUND.into_response(),
    };
    let Some(source) = state.config.source(domain) else {
        return StatusCode::NOT_FOUND.into_response();
    };
    // Source failure -> 502; any other registry error (e.g. history not
    // supported by a file-backed source) -> 501.
    let rows = match source.history_bytes(50).await {
        Ok(items) => items
            .into_iter()
            .filter_map(|(rev, bytes)| {
                let v = serde_json::from_slice::<serde_json::Value>(&bytes).ok()?;
                Some(serde_json::json!({
                    "revision": rev,
                    "sha256": sha256_hex(&bytes),
                    "value": v
                }))
            })
            .collect::<Vec<_>>(),
        Err(ConfigRegistryError::Source(_)) => return StatusCode::BAD_GATEWAY.into_response(),
        Err(_) => return StatusCode::NOT_IMPLEMENTED.into_response(),
    };
    (
        StatusCode::OK,
        Json(serde_json::json!({ "domain": domain.as_str(), "items": rows })),
    )
        .into_response()
}
async fn whoami(Extension(principal): Extension<Principal>) -> impl IntoResponse {
if !has_permission(&principal, "control:read") {
return StatusCode::FORBIDDEN.into_response();
@@ -70,6 +178,18 @@ async fn platform_info(Extension(principal): Extension<Principal>) -> impl IntoR
.into_response()
}
/// `GET /admin/v1/platform/drift` — desired-vs-observed drift summary.
async fn platform_drift(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
    // Read-only endpoint: only `control:read` is required.
    if has_permission(&principal, "control:read") {
        let report = drift::compute(&state).await;
        (StatusCode::OK, Json(report)).into_response()
    } else {
        StatusCode::FORBIDDEN.into_response()
    }
}
async fn fleet_snapshot(
State(state): State<AppState>,
Extension(principal): Extension<Principal>,
@@ -109,6 +229,434 @@ async fn get_placement(
(StatusCode::OK, Json(resp)).into_response()
}
/// `GET /admin/v1/config` — lists the config domains that currently have a
/// configured backing source.
async fn list_config(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:read") {
        return StatusCode::FORBIDDEN.into_response();
    }
    // Only advertise domains whose source is actually wired up.
    let mut domains: Vec<&'static str> = Vec::new();
    for candidate in [ConfigDomain::Routing, ConfigDomain::Placement] {
        if state.config.source(candidate).is_some() {
            domains.push(candidate.as_str());
        }
    }
    (
        StatusCode::OK,
        Json(serde_json::json!({ "domains": domains })),
    )
        .into_response()
}
/// `GET /admin/v1/config/{domain}` — returns the current value, revision,
/// sha256 and source info for a single config domain.
async fn get_config(
    State(state): State<AppState>,
    Path(domain): Path<String>,
    Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:read") {
        return StatusCode::FORBIDDEN.into_response();
    }
    // Only "routing" and "placement" are recognized domains.
    let domain = match domain.as_str() {
        "routing" => ConfigDomain::Routing,
        "placement" => ConfigDomain::Placement,
        _ => return StatusCode::NOT_FOUND.into_response(),
    };
    let Some(source) = state.config.source(domain) else {
        return StatusCode::NOT_FOUND.into_response();
    };
    let loaded = source.load_bytes().await;
    // Distinct statuses per registry error class:
    // source failure -> 502, decode failure -> 400, not configured -> 404.
    let (bytes, revision) = match loaded {
        Ok(x) => x,
        Err(ConfigRegistryError::Source(_)) => return StatusCode::BAD_GATEWAY.into_response(),
        Err(ConfigRegistryError::Decode(_)) => return StatusCode::BAD_REQUEST.into_response(),
        Err(ConfigRegistryError::NotConfigured) => return StatusCode::NOT_FOUND.into_response(),
    };
    // An absent value is reported as JSON null; present-but-invalid JSON is a 400.
    let json_value = match bytes {
        Some(ref b) => match serde_json::from_slice::<serde_json::Value>(b) {
            Ok(v) => v,
            Err(e) => {
                return (
                    StatusCode::BAD_REQUEST,
                    Json(serde_json::json!({ "error": format!("invalid json: {e}") })),
                )
                    .into_response();
            }
        },
        None => serde_json::Value::Null,
    };
    // sha256 is only computed when raw bytes exist.
    let sha256 = bytes.as_deref().map(sha256_hex);
    (
        StatusCode::OK,
        Json(serde_json::json!({
            "domain": domain.as_str(),
            "revision": revision,
            "sha256": sha256,
            "source": source.info(),
            "value": json_value,
        })),
    )
        .into_response()
}
/// Request body for the config apply job.
#[derive(Debug, Deserialize)]
struct ConfigApplyRequest {
    // Config domain name ("routing" or "placement").
    domain: String,
    // Optional optimistic-concurrency guard: apply only if the current
    // revision still matches.
    expected_revision: Option<u64>,
    // Operator-supplied reason for the change.
    reason: String,
    // The full proposed config value.
    value: serde_json::Value,
}
/// Request body for the config validate job.
#[derive(Debug, Deserialize)]
struct ConfigValidateRequest {
    domain: String,
    reason: String,
    value: serde_json::Value,
}
/// Request body for the config rollback job (restores the last backup).
#[derive(Debug, Deserialize)]
struct ConfigRollbackRequest {
    domain: String,
    reason: String,
}
/// Parses a request/path domain string into a `ConfigDomain`;
/// returns `None` for unknown domains.
fn parse_domain(domain: &str) -> Option<ConfigDomain> {
    match domain {
        "routing" => Some(ConfigDomain::Routing),
        "placement" => Some(ConfigDomain::Placement),
        _ => None,
    }
}
/// `POST /admin/v1/jobs/config/validate` — starts a validation job for a
/// proposed config value.
///
/// Requires `control:write` and a non-empty `idempotency-key` header.
/// Lock contention is surfaced as 409 CONFLICT.
async fn start_config_validate(
    State(state): State<AppState>,
    headers: HeaderMap,
    Extension(principal): Extension<Principal>,
    Json(body): Json<ConfigValidateRequest>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:write") {
        return StatusCode::FORBIDDEN.into_response();
    }
    // Idempotency key is mandatory and must be non-empty.
    let key = headers
        .get(HEADER_IDEMPOTENCY_KEY)
        .and_then(|v| v.to_str().ok())
        .ok_or(StatusCode::BAD_REQUEST);
    let key = match key {
        Ok(k) if !k.is_empty() => k,
        _ => return StatusCode::BAD_REQUEST.into_response(),
    };
    let Some(domain) = parse_domain(body.domain.as_str()) else {
        return StatusCode::BAD_REQUEST.into_response();
    };
    let engine = JobEngine::new(
        state.jobs.clone(),
        state.audit.clone(),
        state.tenant_locks.clone(),
        state.config_locks.clone(),
    );
    let job_id = match engine.start_config_validate(
        state.clone(),
        &principal,
        domain,
        body.reason,
        body.value,
        key,
    ) {
        Ok(id) => id,
        Err(StartJobError::TenantLocked) => return StatusCode::CONFLICT.into_response(),
    };
    (
        StatusCode::OK,
        Json(serde_json::json!({ "job_id": job_id })),
    )
        .into_response()
}
/// `POST /admin/v1/jobs/config/apply` — starts an apply job (backup + apply)
/// for a proposed config value.
///
/// Requires `control:write` and a non-empty `idempotency-key` header; an
/// optional `expected_revision` guards against concurrent edits.
/// Lock contention is surfaced as 409 CONFLICT.
async fn start_config_apply(
    State(state): State<AppState>,
    headers: HeaderMap,
    Extension(principal): Extension<Principal>,
    Json(body): Json<ConfigApplyRequest>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:write") {
        return StatusCode::FORBIDDEN.into_response();
    }
    // Idempotency key is mandatory and must be non-empty.
    let key = headers
        .get(HEADER_IDEMPOTENCY_KEY)
        .and_then(|v| v.to_str().ok())
        .ok_or(StatusCode::BAD_REQUEST);
    let key = match key {
        Ok(k) if !k.is_empty() => k,
        _ => return StatusCode::BAD_REQUEST.into_response(),
    };
    let Some(domain) = parse_domain(body.domain.as_str()) else {
        return StatusCode::BAD_REQUEST.into_response();
    };
    let engine = JobEngine::new(
        state.jobs.clone(),
        state.audit.clone(),
        state.tenant_locks.clone(),
        state.config_locks.clone(),
    );
    let job_id = match engine.start_config_apply(
        state.clone(),
        &principal,
        domain,
        body.reason,
        body.expected_revision,
        body.value,
        key,
    ) {
        Ok(id) => id,
        Err(StartJobError::TenantLocked) => return StatusCode::CONFLICT.into_response(),
    };
    (
        StatusCode::OK,
        Json(serde_json::json!({ "job_id": job_id })),
    )
        .into_response()
}
/// `POST /admin/v1/jobs/config/rollback` — starts a rollback job that
/// restores the last backup for a config domain.
///
/// Requires `control:write` and a non-empty `idempotency-key` header;
/// lock contention is reported as 409 CONFLICT.
async fn start_config_rollback(
    State(state): State<AppState>,
    headers: HeaderMap,
    Extension(principal): Extension<Principal>,
    Json(body): Json<ConfigRollbackRequest>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:write") {
        return StatusCode::FORBIDDEN.into_response();
    }
    // Require a non-empty idempotency key.
    let idem_key = match headers
        .get(HEADER_IDEMPOTENCY_KEY)
        .and_then(|v| v.to_str().ok())
    {
        Some(k) if !k.is_empty() => k,
        _ => return StatusCode::BAD_REQUEST.into_response(),
    };
    // Unknown domains are a client error.
    let domain = match parse_domain(body.domain.as_str()) {
        Some(d) => d,
        None => return StatusCode::BAD_REQUEST.into_response(),
    };
    let engine = JobEngine::new(
        state.jobs.clone(),
        state.audit.clone(),
        state.tenant_locks.clone(),
        state.config_locks.clone(),
    );
    match engine.start_config_rollback(state.clone(), &principal, domain, body.reason, idem_key) {
        Ok(job_id) => (
            StatusCode::OK,
            Json(serde_json::json!({ "job_id": job_id })),
        )
            .into_response(),
        Err(StartJobError::TenantLocked) => StatusCode::CONFLICT.into_response(),
    }
}
/// Request body for `POST /admin/v1/plan/config/apply`.
#[derive(Debug, Deserialize)]
struct ConfigPlanApplyRequest {
    // Config domain name ("routing" or "placement").
    domain: String,
    // The full proposed config value to diff against the current one.
    value: serde_json::Value,
}
/// `POST /admin/v1/plan/config/apply` — read-only "plan" endpoint: validates
/// the proposed value (schema + semantics), then returns the current
/// revision, a before/after pretty-printed diff, whether anything changed,
/// and the statically-known impacted services. Does not mutate any config.
async fn plan_config_apply(
    State(state): State<AppState>,
    Extension(principal): Extension<Principal>,
    Json(body): Json<ConfigPlanApplyRequest>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:write") {
        return StatusCode::FORBIDDEN.into_response();
    }
    let domain = match body.domain.as_str() {
        "routing" => ConfigDomain::Routing,
        "placement" => ConfigDomain::Placement,
        _ => return StatusCode::BAD_REQUEST.into_response(),
    };
    let Some(source) = state.config.source(domain) else {
        return StatusCode::NOT_FOUND.into_response();
    };
    // Validate proposed config (schema + semantics).
    // Schema errors (serde decode) return 400 immediately; semantic errors
    // are collected into `validate_res` and handled below.
    let validate_res: Result<(), String> = match domain {
        ConfigDomain::Routing => {
            let cfg = match serde_json::from_value::<RoutingConfig>(body.value.clone()) {
                Ok(v) => v,
                Err(e) => {
                    return (
                        StatusCode::BAD_REQUEST,
                        Json(serde_json::json!({ "error": e.to_string() })),
                    )
                        .into_response();
                }
            };
            validate_routing_semantics(&cfg)
        }
        ConfigDomain::Placement => {
            let cfg =
                match serde_json::from_value::<crate::placement::PlacementFile>(body.value.clone())
                {
                    Ok(v) => v,
                    Err(e) => {
                        return (
                            StatusCode::BAD_REQUEST,
                            Json(serde_json::json!({ "error": e.to_string() })),
                        )
                            .into_response();
                    }
                };
            validate_placement_semantics(&cfg)
        }
    };
    if let Err(e) = validate_res {
        return (
            StatusCode::BAD_REQUEST,
            Json(serde_json::json!({ "error": e })),
        )
            .into_response();
    }
    // Load the current value to diff against; a missing/undecodable current
    // value is treated as null rather than an error.
    let (cur_bytes, cur_rev) = match source.load_bytes().await {
        Ok(x) => x,
        Err(_) => return StatusCode::BAD_GATEWAY.into_response(),
    };
    let cur_value = cur_bytes
        .as_deref()
        .and_then(|b| serde_json::from_slice::<serde_json::Value>(b).ok())
        .unwrap_or(serde_json::Value::Null);
    let before = serde_json::to_string_pretty(&cur_value).unwrap_or_default();
    let after = serde_json::to_string_pretty(&body.value).unwrap_or_default();
    let changed = cur_value != body.value;
    // Static impact map per domain (no live fleet inspection here).
    let impacted_services: Vec<&'static str> = match domain {
        ConfigDomain::Routing => vec!["gateway"],
        ConfigDomain::Placement => vec!["gateway", "control-api"],
    };
    (
        StatusCode::OK,
        Json(serde_json::json!({
            "domain": domain.as_str(),
            "current_revision": cur_rev,
            "changed": changed,
            "impacted_services": impacted_services,
            "diff": {
                "before": before,
                "after": after,
            }
        })),
    )
        .into_response()
}
/// Lowercase hex-encoded SHA-256 digest of `bytes`.
fn sha256_hex(bytes: &[u8]) -> String {
    // `Digest::digest` folds new/update/finalize into a single call.
    hex::encode(sha2::Sha256::digest(bytes))
}
/// Semantic validation for the gateway routing config.
///
/// Checks, in order:
/// 1. every shard in each shard map has at least one endpoint, and every
///    endpoint is a parseable http(s) URL with a host;
/// 2. every tenant placement references a non-blank shard id that exists in
///    the corresponding shard map.
///
/// Returns the first violation found as a human-readable message.
fn validate_routing_semantics(cfg: &RoutingConfig) -> Result<(), String> {
    let shard_maps = [
        ("aggregate_shards", &cfg.aggregate_shards),
        ("projection_shards", &cfg.projection_shards),
        ("runner_shards", &cfg.runner_shards),
    ];
    for (name, map) in shard_maps {
        for (shard_id, endpoints) in map {
            if endpoints.is_empty() {
                return Err(format!("{name}[{shard_id}] has no endpoints"));
            }
            for ep in endpoints {
                // Endpoint must parse as a URL with http(s) scheme and host.
                let u = Url::parse(ep)
                    .map_err(|e| format!("{name}[{shard_id}] invalid endpoint {ep:?}: {e}"))?;
                if u.scheme() != "http" && u.scheme() != "https" {
                    return Err(format!(
                        "{name}[{shard_id}] endpoint {ep:?} must be http(s)"
                    ));
                }
                if u.host_str().is_none() {
                    return Err(format!(
                        "{name}[{shard_id}] endpoint {ep:?} must include host"
                    ));
                }
            }
        }
    }
    // Each placement map is checked against its matching shard map.
    let placements = [
        (
            "aggregate_placement",
            &cfg.aggregate_placement,
            &cfg.aggregate_shards,
        ),
        (
            "projection_placement",
            &cfg.projection_placement,
            &cfg.projection_shards,
        ),
        (
            "runner_placement",
            &cfg.runner_placement,
            &cfg.runner_shards,
        ),
    ];
    for (pname, pmap, shards) in placements {
        for (tenant, shard_id) in pmap {
            if shard_id.trim().is_empty() {
                return Err(format!("{pname}[{tenant}] shard_id is empty"));
            }
            if !shards.contains_key(shard_id) {
                return Err(format!(
                    "{pname}[{tenant}] references missing shard_id {shard_id:?}"
                ));
            }
        }
    }
    Ok(())
}
/// Semantic validation for the placement config: every placement entry in
/// each present section must name at least one non-blank target.
/// Returns the first violation as a human-readable message.
fn validate_placement_semantics(cfg: &crate::placement::PlacementFile) -> Result<(), String> {
    let sections = [
        ("aggregate_placement", &cfg.aggregate_placement),
        ("projection_placement", &cfg.projection_placement),
        ("runner_placement", &cfg.runner_placement),
    ];
    for (kind, section) in sections {
        // Absent sections are fine; only present ones are checked.
        if let Some(section) = section {
            for placement in &section.placements {
                if placement.targets.is_empty() {
                    return Err(format!("{kind} tenant {} has no targets", placement.tenant_id));
                }
                let has_blank_target = placement
                    .targets
                    .iter()
                    .any(|target| target.trim().is_empty());
                if has_blank_target {
                    return Err(format!("{kind} tenant {} has empty target", placement.tenant_id));
                }
            }
        }
    }
    Ok(())
}
async fn list_tenants(
State(state): State<AppState>,
Extension(principal): Extension<Principal>,
@@ -256,6 +804,7 @@ async fn start_tenant_drain(
state.jobs.clone(),
state.audit.clone(),
state.tenant_locks.clone(),
state.config_locks.clone(),
);
let job_id = match engine.start_tenant_drain(
state.clone(),
@@ -298,6 +847,7 @@ async fn start_tenant_migrate(
state.jobs.clone(),
state.audit.clone(),
state.tenant_locks.clone(),
state.config_locks.clone(),
);
let job_id = match engine.start_tenant_migrate(
state.clone(),

904
control/api/src/billing.rs Normal file
View File

@@ -0,0 +1,904 @@
use crate::{
AppState,
auth::{Principal, has_permission},
};
use async_trait::async_trait;
use axum::{
Json,
extract::{Extension, Path, State},
http::{HeaderMap, StatusCode},
response::IntoResponse,
};
use serde::{Deserialize, Serialize};
use std::time::Duration;
use std::{
collections::BTreeMap,
fs,
path::PathBuf,
sync::{Arc, RwLock},
time::SystemTime,
};
use thiserror::Error;
use uuid::Uuid;
// Name of the tenant-id request header (shared constant across services).
const HEADER_TENANT_ID: &str = shared::HEADER_X_TENANT_ID;
/// Checks that the tenant id carried in the tenant header matches the tenant
/// id from the URL path.
///
/// Returns `Err(400)` when the header is missing or not a valid UUID, and
/// `Err(403)` when it names a different tenant than the path.
fn verify_tenant_isolation(headers: &HeaderMap, path_tenant_id: Uuid) -> Result<(), StatusCode> {
    let header_tenant_id = headers
        .get(HEADER_TENANT_ID)
        .and_then(|v| v.to_str().ok())
        .ok_or(StatusCode::BAD_REQUEST)
        .and_then(|s| Uuid::parse_str(s).map_err(|_| StatusCode::BAD_REQUEST))?;
    if header_tenant_id != path_tenant_id {
        return Err(StatusCode::FORBIDDEN);
    }
    Ok(())
}
/// Subscription plan tiers, serialized in snake_case.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Plan {
    Free,
    Pro,
    Enterprise,
}
/// Provider-reported subscription lifecycle states, serialized in snake_case.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SubscriptionStatus {
    Trialing,
    Active,
    PastDue,
    Paused,
    Canceled,
    Incomplete,
}
/// Concrete feature limits for a tenant, derived from plan + status
/// (see `Entitlements::derive`).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Entitlements {
    pub max_deployments: u32,
    pub max_runners: u32,
    pub s3_docs_enabled: bool,
    pub support_tier: String,
}
/// Normalized subscription events ingested from the billing provider.
/// Every variant carries the tenant id, a provider event id, and a
/// millisecond timestamp (see the accessor methods on `BillingEvent`).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum BillingEvent {
    SubscriptionCreated {
        tenant_id: Uuid,
        event_id: String,
        provider_customer_id: String,
        provider_subscription_id: String,
        status: SubscriptionStatus,
        plan: Plan,
        current_period_end: String,
        ts_ms: u64,
    },
    SubscriptionUpdated {
        tenant_id: Uuid,
        event_id: String,
        status: SubscriptionStatus,
        plan: Plan,
        current_period_end: String,
        cancel_at_period_end: bool,
        ts_ms: u64,
    },
    SubscriptionDeleted {
        tenant_id: Uuid,
        event_id: String,
        ts_ms: u64,
    },
}
/// Accessors for the fields common to every billing event variant.
impl BillingEvent {
    /// Tenant this event applies to.
    pub fn tenant_id(&self) -> Uuid {
        match self {
            Self::SubscriptionCreated { tenant_id, .. }
            | Self::SubscriptionUpdated { tenant_id, .. }
            | Self::SubscriptionDeleted { tenant_id, .. } => *tenant_id,
        }
    }
    /// Provider-assigned event id.
    pub fn event_id(&self) -> &str {
        match self {
            Self::SubscriptionCreated { event_id, .. }
            | Self::SubscriptionUpdated { event_id, .. }
            | Self::SubscriptionDeleted { event_id, .. } => event_id,
        }
    }
    /// Event timestamp in milliseconds.
    pub fn ts_ms(&self) -> u64 {
        match self {
            Self::SubscriptionCreated { ts_ms, .. }
            | Self::SubscriptionUpdated { ts_ms, .. }
            | Self::SubscriptionDeleted { ts_ms, .. } => *ts_ms,
        }
    }
}
impl Entitlements {
    /// Maps a (plan, status) pair to concrete entitlement limits.
    ///
    /// Any status other than `Trialing`/`Active` (including `None`) collapses
    /// to the most restrictive tier; an active subscription with no plan is
    /// treated as `Free`.
    pub fn derive(plan: Option<&Plan>, status: Option<&SubscriptionStatus>) -> Self {
        let active = matches!(
            status,
            Some(SubscriptionStatus::Trialing) | Some(SubscriptionStatus::Active)
        );
        // (max_deployments, max_runners, s3_docs_enabled, support_tier)
        let (max_deployments, max_runners, s3_docs_enabled, support_tier) = if !active {
            // Inactive / unknown subscriptions get the floor regardless of plan.
            (1, 1, false, "community")
        } else {
            match plan.unwrap_or(&Plan::Free) {
                Plan::Free => (3, 1, false, "community"),
                Plan::Pro => (10, 5, true, "standard"),
                Plan::Enterprise => (1000, 50, true, "priority"),
            }
        };
        Self {
            max_deployments,
            max_runners,
            s3_docs_enabled,
            support_tier: support_tier.to_string(),
        }
    }
}
/// Persisted per-tenant billing record as stored in the state file.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TenantBillingState {
    pub provider: String,
    pub provider_customer_id: Option<String>,
    pub provider_subscription_id: Option<String>,
    pub provider_checkout_session_id: Option<String>,
    pub status: Option<SubscriptionStatus>,
    pub plan: Option<Plan>,
    pub current_period_end: Option<String>,
    pub cancel_at_period_end: Option<bool>,
    // Recently processed webhook event ids, kept for deduplication.
    pub processed_webhook_event_ids: Vec<String>,
    // Timestamp (ms since epoch) of the last applied event; used for the
    // monotonicity check in `BillingStore::apply_event`.
    pub updated_at: u64,
}
/// On-disk layout of the billing state file (pretty-printed JSON).
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct BillingStateFile {
    // Opaque revision tag, rewritten on every save.
    pub revision: Option<String>,
    pub tenants: BTreeMap<Uuid, TenantBillingState>,
}
/// Response body for the tenant billing endpoint.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct BillingResponse {
    // False when the tenant has no record in the state file.
    pub configured: bool,
    pub provider: Option<String>,
    pub plan: Option<Plan>,
    pub status: Option<SubscriptionStatus>,
    pub current_period_end: Option<String>,
    pub cancel_at_period_end: Option<bool>,
    pub entitlements: Entitlements,
}
/// Cloneable, thread-safe handle to the file-backed billing state.
#[derive(Clone)]
pub struct BillingStore {
    inner: Arc<RwLock<Inner>>,
}
/// Store internals guarded by the `RwLock` in `BillingStore`.
struct Inner {
    path: PathBuf,
    // mtime observed at the last load; `None` forces a reload on next access.
    last_modified: Option<SystemTime>,
    cached: Option<BillingStateFile>,
}
impl BillingStore {
pub fn new(path: PathBuf) -> Self {
Self {
inner: Arc::new(RwLock::new(Inner {
path,
last_modified: None,
cached: None,
})),
}
}
pub fn get_for_tenant(&self, tenant_id: Uuid) -> BillingResponse {
let mut inner = self.inner.write().expect("billing lock poisoned");
inner.reload_if_changed();
if let Some(state) = inner
.cached
.as_ref()
.and_then(|file| file.tenants.get(&tenant_id))
{
return BillingResponse {
configured: true,
provider: Some(state.provider.clone()),
plan: state.plan.clone(),
status: state.status.clone(),
current_period_end: state.current_period_end.clone(),
cancel_at_period_end: state.cancel_at_period_end,
entitlements: Entitlements::derive(state.plan.as_ref(), state.status.as_ref()),
};
}
BillingResponse {
configured: false,
provider: None,
plan: None,
status: None,
current_period_end: None,
cancel_at_period_end: None,
entitlements: Entitlements::derive(None, None),
}
}
pub fn get_all_tenant_ids(&self) -> Vec<Uuid> {
let mut inner = self.inner.write().expect("billing lock poisoned");
inner.reload_if_changed();
inner
.cached
.as_ref()
.map(|f| f.tenants.keys().cloned().collect())
.unwrap_or_default()
}
pub fn get_subscription_id(&self, tenant_id: Uuid) -> Option<String> {
let mut inner = self.inner.write().expect("billing lock poisoned");
inner.reload_if_changed();
inner
.cached
.as_ref()
.and_then(|f| f.tenants.get(&tenant_id))
.and_then(|s| s.provider_subscription_id.clone())
}
pub fn apply_event(&self, event: BillingEvent) -> Result<(), String> {
let mut inner = self.inner.write().expect("billing lock poisoned");
inner.reload_if_changed();
let mut file = inner.cached.clone().unwrap_or(BillingStateFile {
revision: Some("dev".to_string()),
tenants: BTreeMap::new(),
});
let tenant_id = event.tenant_id();
let event_id = event.event_id().to_string();
let ts_ms = event.ts_ms();
let state = file.tenants.entry(tenant_id).or_insert(TenantBillingState {
provider: "unknown".to_string(), // Will be updated by Created event
provider_customer_id: None,
provider_subscription_id: None,
provider_checkout_session_id: None,
status: None,
plan: None,
current_period_end: None,
cancel_at_period_end: None,
processed_webhook_event_ids: vec![],
updated_at: 0,
});
// Deduplication
if state.processed_webhook_event_ids.contains(&event_id) {
return Ok(());
}
// Monotonicity check
if state.updated_at > ts_ms {
state.processed_webhook_event_ids.push(event_id);
state.processed_webhook_event_ids.truncate(50);
inner.save(file)?;
return Ok(());
}
match event {
BillingEvent::SubscriptionCreated {
provider_customer_id,
provider_subscription_id,
status,
plan,
current_period_end,
..
} => {
state.provider_customer_id = Some(provider_customer_id);
state.provider_subscription_id = Some(provider_subscription_id);
state.status = Some(status);
state.plan = Some(plan);
state.current_period_end = Some(current_period_end);
}
BillingEvent::SubscriptionUpdated {
status,
plan,
current_period_end,
cancel_at_period_end,
..
} => {
state.status = Some(status);
state.plan = Some(plan);
state.current_period_end = Some(current_period_end);
state.cancel_at_period_end = Some(cancel_at_period_end);
}
BillingEvent::SubscriptionDeleted { .. } => {
state.status = Some(SubscriptionStatus::Canceled);
}
}
state.updated_at = ts_ms;
state.processed_webhook_event_ids.push(event_id);
state.processed_webhook_event_ids.truncate(50);
inner.save(file)?;
Ok(())
}
#[cfg(test)]
pub fn update_tenant_state(
&self,
tenant_id: Uuid,
state: TenantBillingState,
) -> Result<String, String> {
let mut inner = self.inner.write().expect("billing lock poisoned");
inner.reload_if_changed();
let mut file = inner.cached.clone().unwrap_or(BillingStateFile {
revision: Some("dev".to_string()),
tenants: BTreeMap::new(),
});
file.tenants.insert(tenant_id, state);
inner.save(file)
}
}
impl Inner {
    /// Persist `file` with a freshly generated revision tag. Writes to
    /// `<path with .json.tmp extension>` and renames over the target so
    /// readers never observe a partially written file. Returns the new
    /// revision string.
    fn save(&mut self, mut file: BillingStateFile) -> Result<String, String> {
        let revision = format!("rev-{}", Uuid::new_v4());
        file.revision = Some(revision.clone());
        let raw = serde_json::to_string_pretty(&file).map_err(|e| e.to_string())?;
        let tmp = self.path.with_extension("json.tmp");
        fs::write(&tmp, raw).map_err(|e| e.to_string())?;
        fs::rename(&tmp, &self.path).map_err(|e| e.to_string())?;
        // Clearing the cached mtime forces the next read to re-stat the file
        // and pick up our own write.
        self.last_modified = None;
        self.cached = Some(file);
        Ok(revision)
    }
    /// Re-read the state file when its mtime differs from the cached one
    /// (or when nothing is cached yet). A missing or unparseable file
    /// leaves `cached` as `None`.
    fn reload_if_changed(&mut self) {
        let meta = fs::metadata(&self.path).ok();
        let modified = meta.and_then(|m| m.modified().ok());
        if self.cached.is_some() && modified.is_some() && modified == self.last_modified {
            return;
        }
        self.last_modified = modified;
        let p = &self.path;
        self.cached = fs::read_to_string(p)
            .ok()
            .and_then(|raw| serde_json::from_str(&raw).ok());
    }
}
/// `GET` handler returning a tenant's billing status and entitlements.
/// Requires the `control:read` permission and a tenant header matching the
/// path tenant id.
pub async fn get_billing(
    State(state): State<AppState>,
    Path(tenant_id): Path<Uuid>,
    headers: HeaderMap,
    Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:read") {
        return StatusCode::FORBIDDEN.into_response();
    }
    match verify_tenant_isolation(&headers, tenant_id) {
        Err(status) => status.into_response(),
        Ok(()) => {
            let body = state.billing.get_for_tenant(tenant_id);
            (StatusCode::OK, Json(body)).into_response()
        }
    }
}
/// Request body for the checkout endpoint.
#[derive(Debug, Deserialize)]
pub struct CheckoutRequest {
    pub plan: Plan,
    // Optional path to return to after checkout; defaults to "/billing".
    pub return_path: Option<String>,
}
/// `POST` handler that starts a provider checkout session for a plan.
/// Requires `control:write` and tenant isolation; answers 409 when the
/// tenant already has an active or trialing subscription, and returns the
/// provider's checkout URL on success.
pub async fn checkout(
    State(state): State<AppState>,
    Path(tenant_id): Path<Uuid>,
    headers: HeaderMap,
    Extension(principal): Extension<Principal>,
    Json(body): Json<CheckoutRequest>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:write") {
        return StatusCode::FORBIDDEN.into_response();
    }
    if let Err(status) = verify_tenant_isolation(&headers, tenant_id) {
        return status.into_response();
    }
    // Check if subscription already exists and is active/trialing
    let current = state.billing.get_for_tenant(tenant_id);
    if current.configured
        && matches!(
            current.status,
            Some(SubscriptionStatus::Active | SubscriptionStatus::Trialing)
        )
    {
        return (
            StatusCode::CONFLICT,
            Json(serde_json::json!({ "error": "tenant already has an active subscription" })),
        )
            .into_response();
    }
    // Construct full return URL
    // TODO: Validate return_path against ALLOWED_RETURN_ORIGINS if provided
    let return_url = body.return_path.unwrap_or_else(|| "/billing".to_string());
    match state
        .billing_provider
        .create_checkout_session(tenant_id, body.plan, return_url)
        .await
    {
        Ok(url) => (StatusCode::OK, Json(serde_json::json!({ "url": url }))).into_response(),
        // Provider failures surface as 500 with the error message in the body.
        Err(e) => {
            let err_msg = e.to_string();
            (
                StatusCode::INTERNAL_SERVER_ERROR,
                Json(serde_json::json!({ "error": err_msg })),
            )
                .into_response()
        }
    }
}
pub async fn portal(
State(state): State<AppState>,
Path(tenant_id): Path<Uuid>,
headers: HeaderMap,
Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:write") {
return StatusCode::FORBIDDEN.into_response();
}
if let Err(status) = verify_tenant_isolation(&headers, tenant_id) {
return status.into_response();
}
let return_url = "/billing".to_string();
match state
.billing_provider
.create_portal_session(tenant_id, return_url)
.await
{
Ok(url) => (StatusCode::OK, Json(serde_json::json!({ "url": url }))).into_response(),
Err(e) => {
let err_msg = e.to_string();
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(serde_json::json!({ "error": err_msg })),
)
.into_response()
}
}
}
/// Public webhook endpoint called by the billing provider.
pub async fn webhook(
    State(state): State<AppState>,
    Path(_provider): Path<String>,
    headers: HeaderMap,
    body: axum::body::Bytes,
) -> impl IntoResponse {
    // Note: We don't require auth here as this is a public endpoint called by the provider.
    // Security is handled via signature verification in the provider trait.
    let verified = state.billing_provider.verify_webhook(&body, &headers).await;
    let event = match verified {
        Ok(event) => event,
        Err(e) => {
            metrics::counter!("billing_webhook_requests_total", "status" => "error").increment(1);
            return (
                StatusCode::BAD_REQUEST,
                Json(serde_json::json!({ "error": e.to_string() })),
            )
                .into_response();
        }
    };
    metrics::counter!("billing_webhook_requests_total", "status" => "success").increment(1);
    match state.billing.apply_event(event) {
        Ok(()) => StatusCode::OK.into_response(),
        Err(e) => {
            tracing::error!(error = %e, "failed to apply billing event from webhook");
            (
                StatusCode::INTERNAL_SERVER_ERROR,
                Json(serde_json::json!({ "error": e })),
            )
                .into_response()
        }
    }
}
/// Background task: periodically re-fetches subscription state from the
/// provider and refreshes billing gauges. The interval is read from
/// `CONTROL_BILLING_RECONCILE_INTERVAL_SECS` (seconds; default 3600).
pub async fn run_reconciliation_loop(state: AppState) {
    let interval_secs = std::env::var("CONTROL_BILLING_RECONCILE_INTERVAL_SECS")
        .ok()
        .and_then(|s| s.parse().ok())
        .unwrap_or(3600);
    tracing::info!(interval_secs, "starting billing reconciliation loop");
    loop {
        // Sleep first: the first run happens one full interval after startup.
        tokio::time::sleep(Duration::from_secs(interval_secs)).await;
        tracing::info!("starting billing reconciliation run");
        reconcile_once(&state).await;
        // Update tenant status gauges
        // Note: This is an expensive operation if there are many tenants,
        // but for reconciliation it's fine once per hour.
        update_billing_gauges(&state);
    }
}
/// Run one reconciliation pass: for every tenant with a recorded
/// subscription id, fetch the provider's current view and apply it as a
/// billing event. Tenants without a subscription id are skipped. Emits
/// run counters, a duration histogram, and a summary log line.
pub async fn reconcile_once(state: &AppState) {
    let started = std::time::Instant::now();
    let mut ok_count = 0;
    let mut err_count = 0;
    let mut skipped_count = 0;
    for tenant_id in state.billing.get_all_tenant_ids() {
        let Some(subscription_id) = state.billing.get_subscription_id(tenant_id) else {
            skipped_count += 1;
            continue;
        };
        let fetched = state
            .billing_provider
            .fetch_subscription(tenant_id, &subscription_id)
            .await;
        match fetched {
            Ok(event) => match state.billing.apply_event(event) {
                Ok(()) => ok_count += 1,
                Err(e) => {
                    tracing::error!(?tenant_id, error = %e, "failed to apply reconciled billing event");
                    err_count += 1;
                }
            },
            Err(e) => {
                tracing::error!(?tenant_id, error = %e, "failed to fetch subscription for reconciliation");
                err_count += 1;
            }
        }
    }
    let elapsed = started.elapsed();
    metrics::counter!("billing_reconciliation_runs_total", "result" => "done").increment(1);
    metrics::histogram!("billing_reconciliation_duration_ms").record(elapsed.as_millis() as f64);
    tracing::info!(
        success = ok_count,
        error = err_count,
        skipped = skipped_count,
        duration_ms = elapsed.as_millis(),
        "billing reconciliation run complete"
    );
}
/// Recompute the per-(plan, status) tenant-count gauges from the current
/// billing store contents.
fn update_billing_gauges(state: &AppState) {
    // Tally tenants by (plan, status) label pair, then publish one gauge
    // per observed combination.
    let mut tally: BTreeMap<(String, String), u64> = BTreeMap::new();
    for tenant_id in state.billing.get_all_tenant_ids() {
        let resp = state.billing.get_for_tenant(tenant_id);
        let plan_label = resp
            .plan
            .as_ref()
            .map(|p| match p {
                Plan::Free => "free",
                Plan::Pro => "pro",
                Plan::Enterprise => "enterprise",
            })
            .unwrap_or("none")
            .to_string();
        let status_label = resp
            .status
            .as_ref()
            .map(|s| match s {
                SubscriptionStatus::Active => "active",
                SubscriptionStatus::Trialing => "trialing",
                SubscriptionStatus::PastDue => "past_due",
                SubscriptionStatus::Paused => "paused",
                SubscriptionStatus::Canceled => "canceled",
                SubscriptionStatus::Incomplete => "incomplete",
            })
            .unwrap_or("none")
            .to_string();
        *tally.entry((plan_label, status_label)).or_default() += 1;
    }
    for ((plan, status), count) in tally {
        metrics::gauge!("billing_tenant_status_count", "plan" => plan, "status" => status)
            .set(count as f64);
    }
}
/// Errors returned by `BillingProvider` implementations.
#[derive(Debug, Error)]
pub enum BillingError {
    /// The upstream provider call failed or returned something unusable.
    #[error("provider error: {0}")]
    Provider(String),
    /// Local misconfiguration (e.g. checkout requested for the Free plan).
    #[error("invalid configuration: {0}")]
    Config(String),
}
/// Abstraction over a payment provider (Stripe in production, a mock in
/// development/tests). All methods report failures as `BillingError`.
#[async_trait]
pub trait BillingProvider: Send + Sync {
    /// Create a hosted checkout session; returns the URL to redirect to.
    async fn create_checkout_session(
        &self,
        tenant_id: Uuid,
        plan: Plan,
        return_url: String,
    ) -> Result<String, BillingError>;
    /// Create a self-service billing-portal session; returns its URL.
    async fn create_portal_session(
        &self,
        tenant_id: Uuid,
        return_url: String,
    ) -> Result<String, BillingError>;
    /// Authenticate an incoming webhook (signature verification lives here)
    /// and decode it into a normalized `BillingEvent`.
    async fn verify_webhook(
        &self,
        payload: &[u8],
        headers: &HeaderMap,
    ) -> Result<BillingEvent, BillingError>;
    /// Fetch the subscription's current state, expressed as a
    /// `BillingEvent`, for reconciliation.
    async fn fetch_subscription(
        &self,
        tenant_id: Uuid,
        subscription_id: &str,
    ) -> Result<BillingEvent, BillingError>;
}
/// Stripe-backed provider configuration.
/// NOTE(review): all trait methods below are currently stubs — no real
/// Stripe API calls are made yet.
pub struct StripeProvider {
    pub secret_key: String,
    // Stripe price ids for the paid plans.
    pub price_pro: String,
    pub price_enterprise: String,
}
#[async_trait]
impl BillingProvider for StripeProvider {
    /// Returns a (currently simulated) Stripe checkout URL for a paid plan.
    /// The Free plan has no checkout and yields a `Config` error.
    async fn create_checkout_session(
        &self,
        tenant_id: Uuid,
        plan: Plan,
        _return_url: String,
    ) -> Result<String, BillingError> {
        // Price id is selected but unused until the real API call exists.
        let _price = match plan {
            Plan::Pro => &self.price_pro,
            Plan::Enterprise => &self.price_enterprise,
            Plan::Free => {
                return Err(BillingError::Config(
                    "Free plan has no checkout".to_string(),
                ));
            }
        };
        // TODO: Actually call Stripe API
        // For now, returning a simulated Stripe checkout URL
        Ok(format!(
            "https://checkout.stripe.com/pay/cs_test_{}?tenant_id={}",
            Uuid::new_v4(),
            tenant_id
        ))
    }
    /// Returns a simulated Stripe billing-portal URL.
    async fn create_portal_session(
        &self,
        tenant_id: Uuid,
        _return_url: String,
    ) -> Result<String, BillingError> {
        // TODO: Actually call Stripe API
        Ok(format!(
            "https://billing.stripe.com/p/session/ps_test_{}?tenant_id={}",
            Uuid::new_v4(),
            tenant_id
        ))
    }
    /// Not implemented: real Stripe signature verification is required
    /// before this provider can accept webhooks; always errors for now.
    async fn verify_webhook(
        &self,
        _payload: &[u8],
        _headers: &HeaderMap,
    ) -> Result<BillingEvent, BillingError> {
        // TODO: Implement real Stripe signature verification
        Err(BillingError::Provider("Not implemented".to_string()))
    }
    /// Not implemented: should query Stripe for the subscription's state;
    /// always errors for now.
    async fn fetch_subscription(
        &self,
        _tenant_id: Uuid,
        _subscription_id: &str,
    ) -> Result<BillingEvent, BillingError> {
        // TODO: Actually call Stripe API with timeout
        // let client = reqwest::Client::builder().timeout(Duration::from_secs(10)).build()...
        Err(BillingError::Provider("Not implemented".to_string()))
    }
}
/// In-memory billing provider for development and tests: returns
/// deterministic mock URLs and accepts webhook payloads as raw JSON
/// `BillingEvent`s without signature checks.
pub struct MockProvider;
#[async_trait]
impl BillingProvider for MockProvider {
    /// Deterministic mock checkout URL. Delegates to
    /// `MockProvider::get_checkout_url` so this impl and test code share a
    /// single definition of the URL format (previously the format string
    /// was duplicated here).
    async fn create_checkout_session(
        &self,
        tenant_id: Uuid,
        _plan: Plan,
        _return_url: String,
    ) -> Result<String, BillingError> {
        Ok(Self::get_checkout_url(tenant_id))
    }
    /// Deterministic mock portal URL.
    async fn create_portal_session(
        &self,
        tenant_id: Uuid,
        _return_url: String,
    ) -> Result<String, BillingError> {
        Ok(format!("https://mock.stripe.com/portal/{}", tenant_id))
    }
    /// No signature verification: the payload is trusted and deserialized
    /// directly as a `BillingEvent` (mock-only behavior — never use this
    /// provider against real traffic).
    async fn verify_webhook(
        &self,
        payload: &[u8],
        _headers: &HeaderMap,
    ) -> Result<BillingEvent, BillingError> {
        // Mock implementation: just parse the payload as a BillingEvent
        serde_json::from_slice(payload).map_err(|e| BillingError::Provider(e.to_string()))
    }
    /// Always reports an Active Pro subscription with a far-future period
    /// end, stamped with the current wall-clock time.
    async fn fetch_subscription(
        &self,
        tenant_id: Uuid,
        _subscription_id: &str,
    ) -> Result<BillingEvent, BillingError> {
        // Mock implementation: return a SubscriptionUpdated event with current state
        // In a real mock we might want to store expectations, but for now we just return something plausible.
        Ok(BillingEvent::SubscriptionUpdated {
            tenant_id,
            event_id: format!("reconcile-{}", Uuid::new_v4()),
            status: SubscriptionStatus::Active,
            plan: Plan::Pro,
            current_period_end: "2099-12-31T23:59:59Z".to_string(),
            // now() is always after the epoch, so this unwrap cannot fail.
            ts_ms: SystemTime::now()
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap()
                .as_millis() as u64,
        })
    }
}
impl MockProvider {
    /// Deterministic mock checkout URL for `tenant`; tests use this to
    /// predict the redirect target returned by `create_checkout_session`.
    pub fn get_checkout_url(tenant: Uuid) -> String {
        format!("https://mock.stripe.com/checkout/{}", tenant)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::env::temp_dir;
    // Non-active statuses degrade to the minimal tier regardless of plan;
    // Active/Trialing statuses unlock the plan's limits.
    #[test]
    fn test_entitlement_derivation() {
        let e = Entitlements::derive(Some(&Plan::Free), Some(&SubscriptionStatus::PastDue));
        assert_eq!(e.max_deployments, 1);
        let e = Entitlements::derive(Some(&Plan::Pro), Some(&SubscriptionStatus::Active));
        assert_eq!(e.max_deployments, 10);
        assert!(e.s3_docs_enabled);
        let e = Entitlements::derive(Some(&Plan::Enterprise), Some(&SubscriptionStatus::Trialing));
        assert_eq!(e.max_deployments, 1000);
    }
    // Unknown tenant reads as unconfigured/degraded; after writing a state
    // the read reflects it and derives matching entitlements.
    #[test]
    fn test_billing_state_roundtrip() {
        let mut path = temp_dir();
        path.push(format!("billing-{}.json", Uuid::new_v4()));
        let store = BillingStore::new(path.clone());
        let tenant_id = Uuid::new_v4();
        let resp = store.get_for_tenant(tenant_id);
        assert!(!resp.configured);
        assert_eq!(resp.entitlements.max_deployments, 1);
        let state = TenantBillingState {
            provider: "mock".to_string(),
            provider_customer_id: None,
            provider_subscription_id: None,
            provider_checkout_session_id: None,
            status: Some(SubscriptionStatus::Active),
            plan: Some(Plan::Pro),
            current_period_end: None,
            cancel_at_period_end: Some(false),
            processed_webhook_event_ids: vec![],
            updated_at: 0,
        };
        store.update_tenant_state(tenant_id, state).unwrap();
        let resp2 = store.get_for_tenant(tenant_id);
        assert!(resp2.configured);
        assert_eq!(resp2.provider.as_deref(), Some("mock"));
        assert_eq!(resp2.plan, Some(Plan::Pro));
        assert_eq!(resp2.entitlements.max_deployments, 10);
        let _ = fs::remove_file(path);
    }
    // MockProvider::fetch_subscription reports Active, which should
    // overwrite a stale PastDue status during reconciliation.
    #[tokio::test]
    async fn test_reconciliation_corrects_state() {
        let mut path = temp_dir();
        path.push(format!("billing-reconcile-{}.json", Uuid::new_v4()));
        let store = BillingStore::new(path.clone());
        let tenant_id = Uuid::new_v4();
        // 1. Initial state: PastDue
        store
            .update_tenant_state(
                tenant_id,
                TenantBillingState {
                    provider: "mock".to_string(),
                    provider_customer_id: Some("cus_1".to_string()),
                    provider_subscription_id: Some("sub_1".to_string()),
                    provider_checkout_session_id: None,
                    status: Some(SubscriptionStatus::PastDue),
                    plan: Some(Plan::Pro),
                    current_period_end: None,
                    cancel_at_period_end: Some(false),
                    processed_webhook_event_ids: vec![],
                    updated_at: 100,
                },
            )
            .unwrap();
        let state = AppState {
            prometheus: crate::get_test_prometheus_handle(),
            auth: crate::AuthConfig { hs256_secret: None },
            jobs: crate::jobs::JobStore::default(),
            audit: crate::AuditStore::default(),
            tenant_locks: crate::job_engine::TenantLocks::default(),
            config_locks: crate::job_engine::ConfigLocks::default(),
            http: reqwest::Client::new(),
            placement: crate::placement::PlacementStore::new(temp_dir().join("placement.json")),
            billing: store.clone(),
            billing_provider: Arc::new(MockProvider),
            billing_enforcement_enabled: true,
            config: crate::config_registry::ConfigRegistry::new(None, None),
            fleet_services: vec![],
            swarm: crate::swarm::SwarmStore::new(temp_dir().join("swarm.json")),
            docs: None,
        };
        // 2. Run reconciliation. MockProvider returns Active status.
        reconcile_once(&state).await;
        // 3. Verify state is now Active
        let resp = store.get_for_tenant(tenant_id);
        assert_eq!(resp.status, Some(SubscriptionStatus::Active));
        let _ = fs::remove_file(path);
    }
}

View File

@@ -0,0 +1,323 @@
use async_trait::async_trait;
use futures::StreamExt;
use serde::{Deserialize, Serialize};
use std::{path::PathBuf, sync::Arc, time::Duration};
use thiserror::Error;
/// Configuration domains managed by the registry.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ConfigDomain {
    Routing,
    Placement,
}
impl ConfigDomain {
    /// Stable lowercase name for use in keys, paths, and log fields.
    pub fn as_str(&self) -> &'static str {
        match self {
            ConfigDomain::Routing => "routing",
            ConfigDomain::Placement => "placement",
        }
    }
}
/// Errors surfaced by configuration sources and the registry.
#[derive(Debug, Error)]
pub enum ConfigRegistryError {
    /// The underlying source (file, NATS KV, ...) failed.
    #[error("source error: {0}")]
    Source(String),
    /// Stored bytes could not be decoded into the expected config type.
    #[error("decode error: {0}")]
    Decode(String),
    /// No source is configured for the requested domain.
    #[error("domain not configured")]
    NotConfigured,
}
/// A decoded configuration value plus its provenance metadata.
#[derive(Debug, Clone, Serialize)]
pub struct ConfigSnapshot<T> {
    pub domain: String,
    // Source revision the value was read at (0 when the source is
    // unversioned, e.g. a plain file).
    pub revision: u64,
    pub value: T,
    pub source: ConfigSourceInfo,
}
/// Where a configuration value came from (serialized with a `kind` tag).
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum ConfigSourceInfo {
    File { path: String },
    NatsKv { bucket: String, key: String },
    Fixed,
}
/// A versioned byte store for one configuration domain. Implementations in
/// this module: `FixedSource` (immutable), `FileSource` (filesystem), and
/// `NatsKvSource` (NATS JetStream KV).
#[async_trait]
pub trait ConfigSource: Send + Sync {
    /// Read the current value and its revision; `None` means no value yet.
    async fn load_bytes(&self) -> Result<(Option<Vec<u8>>, u64), ConfigRegistryError>;
    /// Write a new value. Where the source supports it, `expected_revision`
    /// acts as a compare-and-swap guard. Returns the new revision.
    async fn put_bytes(
        &self,
        expected_revision: Option<u64>,
        value: Vec<u8>,
    ) -> Result<u64, ConfigRegistryError>;
    /// Up to `limit` historical `(revision, bytes)` entries, where supported.
    async fn history_bytes(&self, limit: usize)
        -> Result<Vec<(u64, Vec<u8>)>, ConfigRegistryError>;
    /// Stream of change notifications (one `()` per update), where supported.
    async fn watch(
        &self,
    ) -> Result<
        std::pin::Pin<Box<dyn futures::Stream<Item = Result<(), ConfigRegistryError>> + Send>>,
        ConfigRegistryError,
    >;
    /// Provenance metadata for snapshots and diagnostics.
    fn info(&self) -> ConfigSourceInfo;
}
/// Read-only source serving a fixed, in-memory byte payload.
#[derive(Clone)]
pub struct FixedSource {
    bytes: Arc<Vec<u8>>,
}
impl FixedSource {
    /// Wrap `bytes` as an immutable configuration payload.
    pub fn new(bytes: Vec<u8>) -> Self {
        Self {
            bytes: Arc::new(bytes),
        }
    }
}
#[async_trait]
impl ConfigSource for FixedSource {
    /// Always returns the fixed payload at revision 1.
    async fn load_bytes(&self) -> Result<(Option<Vec<u8>>, u64), ConfigRegistryError> {
        let payload = self.bytes.as_ref().clone();
        Ok((Some(payload), 1))
    }
    /// Writes are rejected: the payload is immutable.
    async fn put_bytes(
        &self,
        _expected_revision: Option<u64>,
        _value: Vec<u8>,
    ) -> Result<u64, ConfigRegistryError> {
        let msg = "fixed source is read-only".to_string();
        Err(ConfigRegistryError::Source(msg))
    }
    /// A fixed payload has no version history.
    async fn history_bytes(
        &self,
        _limit: usize,
    ) -> Result<Vec<(u64, Vec<u8>)>, ConfigRegistryError> {
        let msg = "fixed source has no history".to_string();
        Err(ConfigRegistryError::Source(msg))
    }
    /// The payload never changes, so the watch stream yields nothing.
    async fn watch(
        &self,
    ) -> Result<
        std::pin::Pin<Box<dyn futures::Stream<Item = Result<(), ConfigRegistryError>> + Send>>,
        ConfigRegistryError,
    > {
        Ok(Box::pin(futures::stream::empty()))
    }
    fn info(&self) -> ConfigSourceInfo {
        ConfigSourceInfo::Fixed
    }
}
/// Configuration source backed by a single file on disk.
#[derive(Clone)]
pub struct FileSource {
    path: PathBuf,
}
impl FileSource {
    /// Create a source reading/writing the file at `path`.
    pub fn new(path: PathBuf) -> Self {
        Self { path }
    }
}
#[async_trait]
impl ConfigSource for FileSource {
    /// Read the file's contents.
    ///
    /// Fix for trait consistency: a missing file is now reported as
    /// `Ok((None, 0))` — the trait's "no value yet" encoding, matching
    /// `NatsKvSource` for an absent key — instead of a `Source` error.
    /// Other I/O failures still error. Revision is always 0 because a
    /// plain file carries no version counter.
    async fn load_bytes(&self) -> Result<(Option<Vec<u8>>, u64), ConfigRegistryError> {
        match tokio::fs::read(&self.path).await {
            Ok(raw) => Ok((Some(raw), 0)),
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok((None, 0)),
            Err(e) => Err(ConfigRegistryError::Source(e.to_string())),
        }
    }
    /// Atomically replace the file: write to `<path>.tmp`, then rename
    /// over the target. `expected_revision` is ignored — a file offers no
    /// compare-and-swap. Returns revision 0.
    async fn put_bytes(
        &self,
        _expected_revision: Option<u64>,
        value: Vec<u8>,
    ) -> Result<u64, ConfigRegistryError> {
        let tmp = self.path.with_extension("tmp");
        tokio::fs::write(&tmp, &value)
            .await
            .map_err(|e| ConfigRegistryError::Source(e.to_string()))?;
        tokio::fs::rename(&tmp, &self.path)
            .await
            .map_err(|e| ConfigRegistryError::Source(e.to_string()))?;
        Ok(0)
    }
    /// Plain files keep no history.
    async fn history_bytes(
        &self,
        _limit: usize,
    ) -> Result<Vec<(u64, Vec<u8>)>, ConfigRegistryError> {
        Err(ConfigRegistryError::Source(
            "file source has no history".to_string(),
        ))
    }
    /// No change notifications for plain files; returns an empty stream.
    async fn watch(
        &self,
    ) -> Result<
        std::pin::Pin<Box<dyn futures::Stream<Item = Result<(), ConfigRegistryError>> + Send>>,
        ConfigRegistryError,
    > {
        Ok(Box::pin(futures::stream::empty()))
    }
    fn info(&self) -> ConfigSourceInfo {
        ConfigSourceInfo::File {
            path: self.path.to_string_lossy().to_string(),
        }
    }
}
/// Configuration source backed by one key in a NATS JetStream KV bucket.
#[derive(Clone)]
pub struct NatsKvSource {
    kv: async_nats::jetstream::kv::Store,
    bucket: String,
    key: String,
}
impl NatsKvSource {
    /// Connect to `nats_url` (2-second timeout) and open `bucket`,
    /// creating it with default settings if the lookup fails.
    pub async fn connect(
        nats_url: impl Into<String>,
        bucket: impl Into<String>,
        key: impl Into<String>,
    ) -> Result<Self, ConfigRegistryError> {
        let nats_url = nats_url.into();
        let bucket = bucket.into();
        let key = key.into();
        let client = tokio::time::timeout(Duration::from_secs(2), async_nats::connect(nats_url))
            .await
            .map_err(|_| ConfigRegistryError::Source("connect timeout".to_string()))?
            .map_err(|e| ConfigRegistryError::Source(e.to_string()))?;
        let jetstream = async_nats::jetstream::new(client);
        // NOTE(review): any `get_key_value` error triggers a create attempt,
        // not just "bucket missing" — a transient lookup error would try to
        // create the bucket; confirm this is acceptable.
        let kv = match jetstream.get_key_value(&bucket).await {
            Ok(kv) => kv,
            Err(_) => jetstream
                .create_key_value(async_nats::jetstream::kv::Config {
                    bucket: bucket.clone(),
                    ..Default::default()
                })
                .await
                .map_err(|e| ConfigRegistryError::Source(e.to_string()))?,
        };
        Ok(Self { kv, bucket, key })
    }
}
#[async_trait]
impl ConfigSource for NatsKvSource {
    /// Read the key's current entry; an absent key maps to `(None, 0)`.
    async fn load_bytes(&self) -> Result<(Option<Vec<u8>>, u64), ConfigRegistryError> {
        let entry = self
            .kv
            .entry(&self.key)
            .await
            .map_err(|e| ConfigRegistryError::Source(e.to_string()))?;
        Ok(match entry {
            Some(e) => (Some(e.value.to_vec()), e.revision),
            None => (None, 0),
        })
    }
    /// Write the key. With `expected_revision > 0` a compare-and-swap
    /// `update` is used; otherwise an unconditional `put`. Returns the new
    /// KV revision.
    async fn put_bytes(
        &self,
        expected_revision: Option<u64>,
        value: Vec<u8>,
    ) -> Result<u64, ConfigRegistryError> {
        let rev = match expected_revision {
            Some(expected) if expected > 0 => self
                .kv
                .update(&self.key, value.into(), expected)
                .await
                .map_err(|e| ConfigRegistryError::Source(e.to_string()))?,
            _ => self
                .kv
                .put(&self.key, value.into())
                .await
                .map_err(|e| ConfigRegistryError::Source(e.to_string()))?,
        };
        Ok(rev)
    }
    /// Collect up to `limit` `(revision, bytes)` entries from the KV
    /// history stream.
    /// NOTE(review): the length check runs after the push, so `limit == 0`
    /// still yields the first entry — confirm whether 0 should mean "none".
    async fn history_bytes(
        &self,
        limit: usize,
    ) -> Result<Vec<(u64, Vec<u8>)>, ConfigRegistryError> {
        let mut stream = self
            .kv
            .history(&self.key)
            .await
            .map_err(|e| ConfigRegistryError::Source(e.to_string()))?;
        let mut out = Vec::new();
        while let Some(item) = stream.next().await {
            let entry = item.map_err(|e| ConfigRegistryError::Source(e.to_string()))?;
            out.push((entry.revision, entry.value.to_vec()));
            if out.len() >= limit {
                break;
            }
        }
        Ok(out)
    }
    /// Watch the key: `Put` operations become `Ok(())` notifications,
    /// `Delete`/`Purge` are filtered out, and watch errors pass through.
    async fn watch(
        &self,
    ) -> Result<
        std::pin::Pin<Box<dyn futures::Stream<Item = Result<(), ConfigRegistryError>> + Send>>,
        ConfigRegistryError,
    > {
        let key = self.key.clone();
        let watch = self
            .kv
            .watch(&key)
            .await
            .map_err(|e| ConfigRegistryError::Source(e.to_string()))?;
        Ok(Box::pin(watch.filter_map(|entry| async move {
            match entry {
                Ok(entry) => match entry.operation {
                    async_nats::jetstream::kv::Operation::Put => Some(Ok(())),
                    async_nats::jetstream::kv::Operation::Delete
                    | async_nats::jetstream::kv::Operation::Purge => None,
                },
                Err(e) => Some(Err(ConfigRegistryError::Source(e.to_string()))),
            }
        })))
    }
    fn info(&self) -> ConfigSourceInfo {
        ConfigSourceInfo::NatsKv {
            bucket: self.bucket.clone(),
            key: self.key.clone(),
        }
    }
}
/// Holds the (optional) configuration source for each `ConfigDomain`.
#[derive(Clone)]
pub struct ConfigRegistry {
    routing: Option<Arc<dyn ConfigSource>>,
    placement: Option<Arc<dyn ConfigSource>>,
}
impl ConfigRegistry {
    /// Build a registry; pass `None` for domains without a configured source.
    pub fn new(
        routing: Option<Arc<dyn ConfigSource>>,
        placement: Option<Arc<dyn ConfigSource>>,
    ) -> Self {
        Self { routing, placement }
    }
    /// The source configured for `domain`, if any.
    pub fn source(&self, domain: ConfigDomain) -> Option<Arc<dyn ConfigSource>> {
        match domain {
            ConfigDomain::Routing => self.routing.clone(),
            ConfigDomain::Placement => self.placement.clone(),
        }
    }
}

View File

@@ -0,0 +1,15 @@
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Declarative routing/placement configuration, versioned by `revision`.
/// NOTE(review): the semantics of the map keys and values (e.g. service
/// name -> node id? shard id -> member list?) are not visible in this
/// file — confirm against the consumer before relying on any description.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct RoutingConfig {
    pub revision: u64,
    pub aggregate_placement: HashMap<String, String>,
    pub projection_placement: HashMap<String, String>,
    pub runner_placement: HashMap<String, String>,
    pub aggregate_shards: HashMap<String, Vec<String>>,
    pub projection_shards: HashMap<String, Vec<String>>,
    pub runner_shards: HashMap<String, Vec<String>>,
}

View File

@@ -0,0 +1,353 @@
use crate::auth::{Principal, has_permission};
use crate::{AppState, RequestIds};
use axum::{
Router,
body::Bytes,
extract::{Extension, Path, Query, State},
http::{HeaderMap, StatusCode, header},
response::IntoResponse,
routing::{get, post, put},
};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
// Header clients must send to prove which tenant they are acting for;
// checked against the path tenant id in `ensure_tenant_header`.
const HEADER_TENANT_ID: &str = shared::HEADER_X_TENANT_ID;
/// Routes for tenant-scoped document storage, all nested under
/// `/tenants/{tenant_id}/docs`. Permission, tenant-isolation, and
/// entitlement checks live in the individual handlers.
pub fn router() -> Router<AppState> {
    Router::new()
        .route("/tenants/{tenant_id}/docs", get(list_docs))
        .route(
            // Fix: `upload_doc` extracts (tenant_id, doc_type, doc_id,
            // filename), so the path needs a fourth capture segment; the
            // previous literal "(unknown)" segment could never match a real
            // upload URL.
            "/tenants/{tenant_id}/docs/{doc_type}/{doc_id}/{filename}",
            put(upload_doc),
        )
        .route(
            "/tenants/{tenant_id}/docs/object/{*key}",
            get(get_doc).delete(delete_doc),
        )
        .route(
            "/tenants/{tenant_id}/docs/presign/upload",
            post(presign_upload),
        )
        .route(
            "/tenants/{tenant_id}/docs/presign/download",
            post(presign_download),
        )
}
/// Require the tenant header to be present, a valid UUID, and equal to the
/// path tenant id. Missing/unparseable header -> 400; mismatch -> 403.
fn ensure_tenant_header(headers: &HeaderMap, tenant_id: Uuid) -> Result<(), StatusCode> {
    let claimed = headers
        .get(HEADER_TENANT_ID)
        .and_then(|v| v.to_str().ok())
        .and_then(|s| Uuid::parse_str(s).ok())
        .ok_or(StatusCode::BAD_REQUEST)?;
    if claimed == tenant_id {
        Ok(())
    } else {
        Err(StatusCode::FORBIDDEN)
    }
}
/// Gate docs access on the tenant's `s3_docs_enabled` entitlement.
/// When billing enforcement is disabled, every tenant is allowed.
/// Returns 402 Payment Required when the entitlement is missing.
fn ensure_docs_enabled(state: &AppState, tenant_id: Uuid) -> Result<(), StatusCode> {
    if state.billing_enforcement_enabled
        && !state
            .billing
            .get_for_tenant(tenant_id)
            .entitlements
            .s3_docs_enabled
    {
        Err(StatusCode::PAYMENT_REQUIRED)
    } else {
        Ok(())
    }
}
/// Query parameters for `list_docs`.
#[derive(Debug, Deserialize)]
struct ListQuery {
    // Optional key prefix filter, relative to the tenant's namespace.
    prefix: Option<String>,
}
/// Response body for `list_docs`.
#[derive(Debug, Serialize)]
struct ListResponse {
    objects: Vec<crate::s3_docs::DocObject>,
}
/// `GET` handler listing a tenant's doc objects, optionally filtered by a
/// key prefix. Requires `control:read`, a matching tenant header, and the
/// docs entitlement; 503 when no docs store is configured, 502 on store
/// errors.
async fn list_docs(
    State(state): State<AppState>,
    headers: HeaderMap,
    Path(tenant_id): Path<Uuid>,
    Query(q): Query<ListQuery>,
    Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:read") {
        return StatusCode::FORBIDDEN.into_response();
    }
    if let Err(s) = ensure_tenant_header(&headers, tenant_id) {
        return s.into_response();
    }
    if let Err(s) = ensure_docs_enabled(&state, tenant_id) {
        return s.into_response();
    }
    let store = match state.docs.as_ref() {
        Some(s) => s,
        None => return StatusCode::SERVICE_UNAVAILABLE.into_response(),
    };
    let prefix = q.prefix.unwrap_or_default();
    let prefix = prefix.trim();
    // Reject path traversal in the caller-supplied prefix.
    if prefix.contains("..") {
        return StatusCode::BAD_REQUEST.into_response();
    }
    // Scope the listing to this tenant's namespace.
    let base = format!("{}{}", store_prefix(store), tenant_id);
    let prefix = if prefix.is_empty() {
        format!("{base}/")
    } else {
        format!("{base}/{prefix}")
    };
    match store.list_for_tenant(&tenant_id.to_string(), &prefix).await {
        Ok(objects) => (StatusCode::OK, axum::Json(ListResponse { objects })).into_response(),
        Err(_) => StatusCode::BAD_GATEWAY.into_response(),
    }
}
/// `PUT` handler uploading a document body for
/// `{doc_type}/{doc_id}/{filename}`. Requires `control:write`, a matching
/// tenant header, and the docs entitlement. Responds with the stored key
/// and the body's SHA-256; 502 when the store write fails.
async fn upload_doc(
    State(state): State<AppState>,
    headers: HeaderMap,
    Path((tenant_id, doc_type, doc_id, filename)): Path<(Uuid, String, String, String)>,
    Extension(principal): Extension<Principal>,
    Extension(request_ids): Extension<RequestIds>,
    body: Bytes,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:write") {
        return StatusCode::FORBIDDEN.into_response();
    }
    if let Err(s) = ensure_tenant_header(&headers, tenant_id) {
        return s.into_response();
    }
    if let Err(s) = ensure_docs_enabled(&state, tenant_id) {
        return s.into_response();
    }
    let store = match state.docs.as_ref() {
        Some(s) => s,
        None => return StatusCode::SERVICE_UNAVAILABLE.into_response(),
    };
    // Forward the client's Content-Type to the object store, if any.
    let ct = headers
        .get(header::CONTENT_TYPE)
        .and_then(|v| v.to_str().ok())
        .map(|s| s.to_string());
    // The store validates/builds the object key; invalid components -> 400.
    let key = match store.key_for(&tenant_id.to_string(), &doc_type, &doc_id, &filename) {
        Ok(k) => k,
        Err(_) => return StatusCode::BAD_REQUEST.into_response(),
    };
    let bytes = body.to_vec();
    let hash = crate::s3_docs::DocsStore::content_hash_sha256_hex(&bytes);
    if let Err(e) = store
        .put_for_tenant(&tenant_id.to_string(), &key, bytes, ct)
        .await
    {
        tracing::warn!(
            request_id = %request_ids.request_id,
            correlation_id = ?request_ids.correlation_id,
            error = %e,
            "docs upload failed"
        );
        return StatusCode::BAD_GATEWAY.into_response();
    }
    (
        StatusCode::OK,
        axum::Json(serde_json::json!({
            "key": key,
            "sha256": hash,
        })),
    )
        .into_response()
}
/// `GET` handler streaming a stored object's bytes back to the caller,
/// with its recorded Content-Type when one is valid. Requires
/// `control:read`, a matching tenant header, and the docs entitlement;
/// keys outside the tenant's namespace are rejected with 403.
async fn get_doc(
    State(state): State<AppState>,
    headers: HeaderMap,
    Path((tenant_id, key)): Path<(Uuid, String)>,
    Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:read") {
        return StatusCode::FORBIDDEN.into_response();
    }
    if let Err(s) = ensure_tenant_header(&headers, tenant_id) {
        return s.into_response();
    }
    if let Err(s) = ensure_docs_enabled(&state, tenant_id) {
        return s.into_response();
    }
    let store = match state.docs.as_ref() {
        Some(s) => s,
        None => return StatusCode::SERVICE_UNAVAILABLE.into_response(),
    };
    // Tenant isolation on the raw key: it must live under this tenant's prefix.
    let base = format!("{}{}", store_prefix(store), tenant_id);
    if !key.starts_with(&base) {
        return StatusCode::FORBIDDEN.into_response();
    }
    match store
        .get_bytes_for_tenant(&tenant_id.to_string(), &key)
        .await
    {
        Ok((bytes, ct)) => {
            let mut res = axum::response::Response::new(axum::body::Body::from(bytes));
            *res.status_mut() = StatusCode::OK;
            // Only set Content-Type when the stored value is a valid header.
            if let Some(ct) = ct
                && let Ok(v) = axum::http::HeaderValue::from_str(&ct)
            {
                res.headers_mut().insert(header::CONTENT_TYPE, v);
            }
            res
        }
        Err(_) => StatusCode::NOT_FOUND.into_response(),
    }
}
/// DELETE handler: removes a stored document belonging to a tenant.
///
/// Requires `control:write`, a matching tenant header, and the docs feature
/// enabled. The key must lie under the tenant's key prefix. Returns 204 on
/// success and 502 when the storage backend reports an error.
async fn delete_doc(
    State(state): State<AppState>,
    headers: HeaderMap,
    Path((tenant_id, key)): Path<(Uuid, String)>,
    Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:write") {
        return StatusCode::FORBIDDEN.into_response();
    }
    if let Err(status) = ensure_tenant_header(&headers, tenant_id) {
        return status.into_response();
    }
    if let Err(status) = ensure_docs_enabled(&state, tenant_id) {
        return status.into_response();
    }
    let Some(store) = state.docs.as_ref() else {
        return StatusCode::SERVICE_UNAVAILABLE.into_response();
    };
    // Tenant isolation: refuse keys outside this tenant's prefix.
    let tenant_base = format!("{}{}", store_prefix(store), tenant_id);
    if !key.starts_with(&tenant_base) {
        return StatusCode::FORBIDDEN.into_response();
    }
    let deleted = store.delete_for_tenant(&tenant_id.to_string(), &key).await;
    if deleted.is_ok() {
        StatusCode::NO_CONTENT.into_response()
    } else {
        StatusCode::BAD_GATEWAY.into_response()
    }
}
/// JSON request body for `presign_upload`.
#[derive(Debug, Deserialize)]
struct PresignUploadRequest {
    /// Logical document category; passed to `DocsStore::key_for`.
    doc_type: String,
    /// Optional stable identifier; a random UUID is generated when absent.
    doc_id: Option<String>,
    /// Original filename, used by `key_for` when building the object key.
    filename: String,
    /// Optional MIME type to bind into the presigned PUT.
    content_type: Option<String>,
}
/// POST handler: mints a short-lived presigned PUT URL for a direct upload.
///
/// Requires `control:write`, a matching tenant header, and the docs feature
/// enabled. The object key is derived via `DocsStore::key_for`; an invalid
/// doc_type/filename combination yields 400, backend failures yield 502.
async fn presign_upload(
    State(state): State<AppState>,
    headers: HeaderMap,
    Path(tenant_id): Path<Uuid>,
    Extension(principal): Extension<Principal>,
    axum::Json(body): axum::Json<PresignUploadRequest>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:write") {
        return StatusCode::FORBIDDEN.into_response();
    }
    if let Err(status) = ensure_tenant_header(&headers, tenant_id) {
        return status.into_response();
    }
    if let Err(status) = ensure_docs_enabled(&state, tenant_id) {
        return status.into_response();
    }
    let Some(store) = state.docs.as_ref() else {
        return StatusCode::SERVICE_UNAVAILABLE.into_response();
    };
    // Callers may supply a stable doc_id; otherwise mint a fresh UUID.
    let doc_id = body.doc_id.unwrap_or_else(|| Uuid::new_v4().to_string());
    let tenant = tenant_id.to_string();
    let key = match store.key_for(&tenant, &body.doc_type, &doc_id, &body.filename) {
        Ok(k) => k,
        Err(_) => return StatusCode::BAD_REQUEST.into_response(),
    };
    // Presigned URLs expire after five minutes.
    let presigned = store
        .presign_put_for_tenant(
            &tenant,
            &key,
            body.content_type,
            std::time::Duration::from_secs(300),
        )
        .await;
    match presigned {
        Ok(url) => (
            StatusCode::OK,
            axum::Json(serde_json::json!({
                "method": "PUT",
                "url": url,
                "key": key,
            })),
        )
            .into_response(),
        Err(_) => StatusCode::BAD_GATEWAY.into_response(),
    }
}
/// JSON request body for `presign_download`.
#[derive(Debug, Deserialize)]
struct PresignDownloadRequest {
    /// Full object key; must lie under the requesting tenant's key prefix.
    key: String,
}
/// POST handler: mints a short-lived presigned GET URL for a stored document.
///
/// Requires `control:read`, a matching tenant header, and the docs feature
/// enabled. The requested key must lie under the tenant's key prefix;
/// backend failures yield 502.
async fn presign_download(
    State(state): State<AppState>,
    headers: HeaderMap,
    Path(tenant_id): Path<Uuid>,
    Extension(principal): Extension<Principal>,
    axum::Json(body): axum::Json<PresignDownloadRequest>,
) -> impl IntoResponse {
    if !has_permission(&principal, "control:read") {
        return StatusCode::FORBIDDEN.into_response();
    }
    if let Err(status) = ensure_tenant_header(&headers, tenant_id) {
        return status.into_response();
    }
    if let Err(status) = ensure_docs_enabled(&state, tenant_id) {
        return status.into_response();
    }
    let Some(store) = state.docs.as_ref() else {
        return StatusCode::SERVICE_UNAVAILABLE.into_response();
    };
    // Tenant isolation: the requested key must live under this tenant's prefix.
    let tenant_base = format!("{}{}", store_prefix(store), tenant_id);
    if !body.key.starts_with(&tenant_base) {
        return StatusCode::FORBIDDEN.into_response();
    }
    // Presigned URLs expire after five minutes.
    let presigned = store
        .presign_get_for_tenant(
            &tenant_id.to_string(),
            &body.key,
            std::time::Duration::from_secs(300),
        )
        .await;
    match presigned {
        Ok(url) => (
            StatusCode::OK,
            axum::Json(serde_json::json!({
                "method": "GET",
                "url": url,
                "key": body.key,
            })),
        )
            .into_response(),
        Err(_) => StatusCode::BAD_GATEWAY.into_response(),
    }
}
/// Returns the docs store's configured key prefix; handlers use it to build
/// the per-tenant key base for authorization (prefix-containment) checks.
fn store_prefix(store: &crate::s3_docs::DocsStore) -> &str {
    store.prefix()
}

127
control/api/src/drift.rs Normal file
View File

@@ -0,0 +1,127 @@
use crate::{AppState, build_info::extract_build_info, fleet, swarm::SwarmService};
use serde::Serialize;
use std::collections::{BTreeMap, BTreeSet};
/// Category of a single drift finding; serialized as snake_case strings.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum DriftKind {
    /// Desired service not present in the observed (swarm) set.
    Missing,
    /// Observed service not present in the desired set.
    Extra,
    /// Service failing its health and/or ready probe.
    Unhealthy,
    /// More than one build version seen for the same service.
    VersionMismatch,
}
/// One concrete drift finding for one service.
#[derive(Debug, Clone, Serialize)]
pub struct DriftItem {
    /// What kind of drift this is.
    pub kind: DriftKind,
    /// Name of the affected service.
    pub service: String,
    /// Kind-specific context (flags, URLs, seen versions) as free-form JSON.
    pub details: serde_json::Value,
}
/// Full drift report: per-kind counts plus the individual findings.
#[derive(Debug, Clone, Serialize)]
pub struct DriftResponse {
    /// Count of items per kind string ("missing", "extra", ...).
    pub summary: BTreeMap<String, u64>,
    /// All findings, sorted by (kind precedence, service name).
    pub items: Vec<DriftItem>,
}
/// Computes the drift report: differences between the desired service set
/// (Control API configuration) and what is actually observed and healthy.
///
/// Sources:
/// - `state.fleet_services`: the desired set of services.
/// - `state.swarm.list_services()`: the observed set (dev: file snapshot).
/// - `fleet::snapshot(..)`: live health/ready probes. Fetched ONCE and reused
///   for both the health-drift and version-drift passes, so both passes see a
///   consistent view and probe traffic is not doubled (the original code
///   awaited the snapshot twice).
///
/// Items are sorted by (kind precedence, service) and summarized per kind.
pub async fn compute(state: &AppState) -> DriftResponse {
    let mut items: Vec<DriftItem> = Vec::new();

    // Desired service set: what the Control API was configured to observe.
    // (In production, this should evolve into "desired stacks + required services".)
    let desired: BTreeSet<String> = state
        .fleet_services
        .iter()
        .map(|s| s.name.clone())
        .collect();

    // Observed service set: what Swarm reports (dev: from file snapshot).
    let observed_services: Vec<SwarmService> = state.swarm.list_services();
    let observed: BTreeSet<String> = observed_services.iter().map(|s| s.name.clone()).collect();

    for missing in desired.difference(&observed) {
        items.push(DriftItem {
            kind: DriftKind::Missing,
            service: missing.clone(),
            details: serde_json::json!({ "expected": true }),
        });
    }
    for extra in observed.difference(&desired) {
        items.push(DriftItem {
            kind: DriftKind::Extra,
            service: extra.clone(),
            details: serde_json::json!({ "observed": true }),
        });
    }

    // Single fleet probe shared by both passes below.
    let snapshots = fleet::snapshot(&state.http, &state.fleet_services).await;

    // Health drift: based on the fleet snapshot.
    for s in &snapshots {
        if !s.health_ok || !s.ready_ok {
            items.push(DriftItem {
                kind: DriftKind::Unhealthy,
                service: s.name.clone(),
                details: serde_json::json!({
                    "health_ok": s.health_ok,
                    "ready_ok": s.ready_ok,
                    "metrics_ok": s.metrics_ok,
                    "base_url": s.base_url,
                }),
            });
        }
    }

    // Version drift: compare build_info between services when present.
    // Desired is not yet explicit; for now we flag when multiple versions exist for same service.
    let mut versions_by_service: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
    for s in &snapshots {
        if let Ok(metrics) = state
            .http
            .get(format!("{}/metrics", s.base_url))
            .send()
            .await
            && let Ok(body) = metrics.text().await
        {
            for bi in extract_build_info(&body) {
                versions_by_service
                    .entry(bi.service.clone())
                    .or_default()
                    .insert(format!("{}@{}", bi.version, bi.git_sha));
            }
        }
    }
    for (svc, vs) in versions_by_service {
        if vs.len() > 1 {
            items.push(DriftItem {
                kind: DriftKind::VersionMismatch,
                service: svc,
                details: serde_json::json!({ "seen": vs.into_iter().collect::<Vec<_>>() }),
            });
        }
    }

    // Deterministic ordering: kind precedence first, then service name.
    fn ord(k: &DriftKind) -> u8 {
        match k {
            DriftKind::Missing => 0,
            DriftKind::Extra => 1,
            DriftKind::Unhealthy => 2,
            DriftKind::VersionMismatch => 3,
        }
    }
    items.sort_by(|a, b| (ord(&a.kind), &a.service).cmp(&(ord(&b.kind), &b.service)));

    let mut summary: BTreeMap<String, u64> = BTreeMap::new();
    for item in &items {
        let k = match item.kind {
            DriftKind::Missing => "missing",
            DriftKind::Extra => "extra",
            DriftKind::Unhealthy => "unhealthy",
            DriftKind::VersionMismatch => "version_mismatch",
        };
        *summary.entry(k.to_string()).or_insert(0) += 1;
    }
    DriftResponse { summary, items }
}

View File

@@ -1,14 +1,19 @@
use crate::{
AppState, Principal,
audit::{AuditEvent, AuditStore},
config_registry::{ConfigDomain, ConfigRegistryError},
config_schemas::RoutingConfig,
fleet,
jobs::{Job, JobStatus, JobStep, JobStore},
placement::PlacementFile,
};
use std::{
collections::HashMap,
path::PathBuf,
sync::{Arc, Mutex},
time::{Duration, SystemTime, UNIX_EPOCH},
};
use url::Url;
use uuid::Uuid;
#[derive(Clone, Default)]
@@ -34,20 +39,52 @@ impl TenantLocks {
}
}
/// Per-config-domain advisory locks, preventing two config jobs from mutating
/// the same domain concurrently. Keys are `ConfigDomain::as_str()` values; the
/// stored `Uuid` identifies the job currently holding the lock.
#[derive(Clone, Default)]
pub struct ConfigLocks {
    // Shared across clones of the engine; guarded by a std Mutex.
    inner: Arc<Mutex<HashMap<String, Uuid>>>,
}
impl ConfigLocks {
    /// Attempts to acquire the lock for `domain` on behalf of `job_id`.
    /// Non-blocking: returns `false` when another job already holds it.
    pub fn try_lock(&self, domain: ConfigDomain, job_id: Uuid) -> bool {
        let mut held = self.inner.lock().expect("config locks poisoned");
        match held.entry(domain.as_str().to_string()) {
            std::collections::hash_map::Entry::Occupied(_) => false,
            std::collections::hash_map::Entry::Vacant(slot) => {
                slot.insert(job_id);
                true
            }
        }
    }

    /// Releases the lock for `domain`, but only when `job_id` is the current
    /// holder; a stale unlock from a different job is a no-op.
    pub fn unlock(&self, domain: ConfigDomain, job_id: Uuid) {
        let mut held = self.inner.lock().expect("config locks poisoned");
        let key = domain.as_str().to_string();
        if held.get(&key).copied() == Some(job_id) {
            held.remove(&key);
        }
    }
}
/// Orchestrates background jobs (drain/migrate/config/platform verify) and
/// owns the shared stores and locks the job runners need. Cheap to clone;
/// clones share the same underlying stores.
#[derive(Clone)]
pub struct JobEngine {
    pub jobs: JobStore,
    pub audit: AuditStore,
    pub tenant_locks: TenantLocks,
    pub config_locks: ConfigLocks,
    // Per-step timeout applied while running a job's steps.
    pub step_timeout: Duration,
}
impl JobEngine {
pub fn new(jobs: JobStore, audit: AuditStore, tenant_locks: TenantLocks) -> Self {
pub fn new(
jobs: JobStore,
audit: AuditStore,
tenant_locks: TenantLocks,
config_locks: ConfigLocks,
) -> Self {
Self {
jobs,
audit,
tenant_locks,
config_locks,
step_timeout: Duration::from_millis(500),
}
}
@@ -93,7 +130,7 @@ impl JobEngine {
let engine = self.clone();
tokio::spawn(async move {
engine
.run_job(state, inserted, Some(tenant_id), RunSpec::Drain)
.run_job(state, inserted, Some(tenant_id), None, RunSpec::Drain)
.await;
});
@@ -152,6 +189,7 @@ impl JobEngine {
state,
inserted,
Some(tenant_id),
None,
RunSpec::Migrate { runner_target },
)
.await;
@@ -160,7 +198,238 @@ impl JobEngine {
Ok(inserted)
}
async fn run_job(&self, state: AppState, job_id: Uuid, tenant_id: Option<Uuid>, spec: RunSpec) {
/// Starts an asynchronous "config apply" job for `domain`.
///
/// Flow: idempotency short-circuit → per-domain lock → job record with the
/// full apply pipeline (preflight/validate/backup/apply/reload/verify) →
/// audit entry → detached `run_job` task (which releases the domain lock).
///
/// Returns the job id (the original one when `idempotency_key` was seen
/// before) or `StartJobError::TenantLocked` when the domain lock is held.
///
/// NOTE(review): the lock is keyed to the freshly generated `job_id`, while
/// `run_job` later unlocks with the id returned by `insert_idempotent` —
/// presumably those can never differ after the `get_idempotent` check above;
/// confirm against JobStore's idempotency semantics.
#[allow(clippy::too_many_arguments)]
pub fn start_config_apply(
    &self,
    state: AppState,
    principal: &Principal,
    domain: ConfigDomain,
    reason: String,
    expected_revision: Option<u64>,
    value: serde_json::Value,
    idempotency_key: &str,
) -> Result<Uuid, StartJobError> {
    // Replaying a known idempotency key returns the original job id.
    if let Some(existing) = self.jobs.get_idempotent(idempotency_key) {
        return Ok(existing);
    }
    let job_id = Uuid::new_v4();
    // Only one config job per domain may run at a time.
    if !self.config_locks.try_lock(domain, job_id) {
        return Err(StartJobError::TenantLocked);
    }
    let now = now_ms();
    let job = Job {
        job_id,
        status: JobStatus::Pending,
        steps: vec![
            step("preflight"),
            step("validate_config"),
            step("backup_config"),
            step("apply_config"),
            step("reload_config"),
            step("verify_config"),
        ],
        error: None,
        created_at_ms: now,
        started_at_ms: None,
        finished_at_ms: None,
    };
    let inserted = self.jobs.insert_idempotent(idempotency_key, job);
    self.audit.record(AuditEvent {
        ts_ms: now,
        principal_sub: principal.sub.clone(),
        action: format!("config.{}.apply", domain.as_str()),
        tenant_id: None,
        reason,
        job_id: Some(inserted),
    });
    // The job body runs detached; run_job releases the domain lock at the end.
    let engine = self.clone();
    tokio::spawn(async move {
        engine
            .run_job(
                state,
                inserted,
                None,
                Some(domain),
                RunSpec::ConfigApply {
                    domain,
                    expected_revision,
                    value,
                },
            )
            .await;
    });
    Ok(inserted)
}
/// Starts an asynchronous "config validate" job for `domain` (a single
/// `validate_config` step). Idempotent via `idempotency_key`; returns
/// `StartJobError::TenantLocked` when the domain lock is held by another job.
pub fn start_config_validate(
    &self,
    state: AppState,
    principal: &Principal,
    domain: ConfigDomain,
    reason: String,
    value: serde_json::Value,
    idempotency_key: &str,
) -> Result<Uuid, StartJobError> {
    // Replaying a known idempotency key returns the original job id.
    if let Some(existing) = self.jobs.get_idempotent(idempotency_key) {
        return Ok(existing);
    }
    let job_id = Uuid::new_v4();
    if !self.config_locks.try_lock(domain, job_id) {
        return Err(StartJobError::TenantLocked);
    }
    let now = now_ms();
    let job = Job {
        job_id,
        status: JobStatus::Pending,
        steps: vec![step("validate_config")],
        error: None,
        created_at_ms: now,
        started_at_ms: None,
        finished_at_ms: None,
    };
    let inserted = self.jobs.insert_idempotent(idempotency_key, job);
    self.audit.record(AuditEvent {
        ts_ms: now,
        principal_sub: principal.sub.clone(),
        action: format!("config.{}.validate", domain.as_str()),
        tenant_id: None,
        reason,
        job_id: Some(inserted),
    });
    // run_job releases the domain lock when the job finishes.
    let engine = self.clone();
    tokio::spawn(async move {
        engine
            .run_job(
                state,
                inserted,
                None,
                Some(domain),
                RunSpec::ConfigValidate { domain, value },
            )
            .await;
    });
    Ok(inserted)
}
/// Starts an asynchronous "config rollback" job for `domain`
/// (rollback/reload/verify steps). Idempotent via `idempotency_key`; returns
/// `StartJobError::TenantLocked` when the domain lock is held by another job.
pub fn start_config_rollback(
    &self,
    state: AppState,
    principal: &Principal,
    domain: ConfigDomain,
    reason: String,
    idempotency_key: &str,
) -> Result<Uuid, StartJobError> {
    // Replaying a known idempotency key returns the original job id.
    if let Some(existing) = self.jobs.get_idempotent(idempotency_key) {
        return Ok(existing);
    }
    let job_id = Uuid::new_v4();
    if !self.config_locks.try_lock(domain, job_id) {
        return Err(StartJobError::TenantLocked);
    }
    let now = now_ms();
    let job = Job {
        job_id,
        status: JobStatus::Pending,
        steps: vec![
            step("rollback_config"),
            step("reload_config"),
            step("verify_config"),
        ],
        error: None,
        created_at_ms: now,
        started_at_ms: None,
        finished_at_ms: None,
    };
    let inserted = self.jobs.insert_idempotent(idempotency_key, job);
    self.audit.record(AuditEvent {
        ts_ms: now,
        principal_sub: principal.sub.clone(),
        action: format!("config.{}.rollback", domain.as_str()),
        tenant_id: None,
        reason,
        job_id: Some(inserted),
    });
    // run_job releases the domain lock when the job finishes.
    let engine = self.clone();
    tokio::spawn(async move {
        engine
            .run_job(
                state,
                inserted,
                None,
                Some(domain),
                RunSpec::ConfigRollback { domain },
            )
            .await;
    });
    Ok(inserted)
}
/// Starts an asynchronous platform-wide verification job (preflight +
/// platform_verify steps). Takes no tenant or config-domain lock — multiple
/// verify jobs may run concurrently. Idempotent via `idempotency_key`.
pub fn start_platform_verify(
    &self,
    state: AppState,
    principal: &Principal,
    reason: String,
    idempotency_key: &str,
) -> Result<Uuid, StartJobError> {
    // Replaying a known idempotency key returns the original job id.
    if let Some(existing) = self.jobs.get_idempotent(idempotency_key) {
        return Ok(existing);
    }
    let job_id = Uuid::new_v4();
    let now = now_ms();
    let job = Job {
        job_id,
        status: JobStatus::Pending,
        steps: vec![step("preflight"), step("platform_verify")],
        error: None,
        created_at_ms: now,
        started_at_ms: None,
        finished_at_ms: None,
    };
    let inserted = self.jobs.insert_idempotent(idempotency_key, job);
    self.audit.record(AuditEvent {
        ts_ms: now,
        principal_sub: principal.sub.clone(),
        action: "platform.verify".to_string(),
        tenant_id: None,
        reason,
        job_id: Some(inserted),
    });
    let engine = self.clone();
    tokio::spawn(async move {
        engine
            .run_job(state, inserted, None, None, RunSpec::PlatformVerify)
            .await;
    });
    Ok(inserted)
}
async fn run_job(
&self,
state: AppState,
job_id: Uuid,
tenant_id: Option<Uuid>,
config_domain: Option<ConfigDomain>,
spec: RunSpec,
) {
self.jobs.update(job_id, |j| {
j.status = JobStatus::Running;
j.started_at_ms = Some(now_ms());
@@ -265,6 +534,9 @@ impl JobEngine {
if let Some(tid) = tenant_id {
self.tenant_locks.unlock(tid, job_id);
}
if let Some(domain) = config_domain {
self.config_locks.unlock(domain, job_id);
}
}
}
@@ -276,7 +548,22 @@ pub enum StartJobError {
#[derive(Clone)]
enum RunSpec {
Drain,
Migrate { runner_target: String },
Migrate {
runner_target: String,
},
ConfigValidate {
domain: ConfigDomain,
value: serde_json::Value,
},
ConfigApply {
domain: ConfigDomain,
expected_revision: Option<u64>,
value: serde_json::Value,
},
ConfigRollback {
domain: ConfigDomain,
},
PlatformVerify,
}
fn step(name: &str) -> JobStep {
@@ -316,9 +603,14 @@ async fn run_step(
"update_placement" => match spec {
RunSpec::Migrate { runner_target } => {
let tenant_id = tenant_id.ok_or_else(|| "missing tenant_id".to_string())?;
let entitlements = state.billing.get_for_tenant(tenant_id).entitlements;
state
.placement
.update_runner_target(tenant_id, runner_target.clone())
.update_runner_target(
tenant_id,
runner_target.clone(),
entitlements.max_runners as usize,
)
.map(|_| ())
}
_ => Ok(()),
@@ -343,6 +635,400 @@ async fn run_step(
}
_ => Ok(()),
},
"validate_config" => match spec {
RunSpec::ConfigValidate { domain, value }
| RunSpec::ConfigApply { domain, value, .. } => match domain {
ConfigDomain::Routing => {
let cfg = serde_json::from_value::<RoutingConfig>(value.clone())
.map_err(|e| format!("invalid routing config: {e}"))?;
validate_routing_semantic(&cfg)?;
Ok(())
}
ConfigDomain::Placement => {
let cfg = serde_json::from_value::<PlacementFile>(value.clone())
.map_err(|e| format!("invalid placement config: {e}"))?;
validate_placement_semantic(state, &cfg)?;
Ok(())
}
},
_ => Ok(()),
},
"backup_config" => match spec {
RunSpec::ConfigApply { domain, .. } => {
let Some(source) = state.config.source(*domain) else {
return Err("config domain not configured".to_string());
};
let (cur, _) = source
.load_bytes()
.await
.map_err(|e| format!("failed to load config: {e}"))?;
let cur = cur.unwrap_or_else(|| b"null".to_vec());
let backup_key_value = serde_json::json!({ "backup": serde_json::from_slice::<serde_json::Value>(&cur).unwrap_or(serde_json::Value::Null) });
let bytes =
serde_json::to_vec_pretty(&backup_key_value).map_err(|e| e.to_string())?;
let backup_source = backup_source_for(&source.info(), *domain)
.await
.map_err(|e| format!("failed to build backup source: {e}"))?;
let _ = backup_source
.put_bytes(None, bytes)
.await
.map_err(|e| format!("failed to write backup: {e}"))?;
Ok(())
}
_ => Ok(()),
},
"apply_config" => match spec {
RunSpec::ConfigApply {
domain,
expected_revision,
value,
} => {
let Some(source) = state.config.source(*domain) else {
return Err("config domain not configured".to_string());
};
let bytes =
serde_json::to_vec_pretty(value).map_err(|e| format!("encode error: {e}"))?;
let _ = source
.put_bytes(*expected_revision, bytes)
.await
.map_err(|e| format!("apply failed: {e}"))?;
Ok(())
}
_ => Ok(()),
},
"rollback_config" => match spec {
RunSpec::ConfigRollback { domain } => {
let Some(source) = state.config.source(*domain) else {
return Err("config domain not configured".to_string());
};
let backup_source = backup_source_for(&source.info(), *domain)
.await
.map_err(|e| format!("failed to build backup source: {e}"))?;
let (bytes, _) = backup_source
.load_bytes()
.await
.map_err(|e| format!("failed to load backup: {e}"))?;
let Some(bytes) = bytes else {
return Err("no backup available".to_string());
};
let v: serde_json::Value = serde_json::from_slice(&bytes)
.map_err(|e| format!("invalid backup json: {e}"))?;
let backup = v.get("backup").cloned().unwrap_or(serde_json::Value::Null);
let next =
serde_json::to_vec_pretty(&backup).map_err(|e| format!("encode error: {e}"))?;
let _ = source
.put_bytes(None, next)
.await
.map_err(|e| format!("rollback failed: {e}"))?;
Ok(())
}
_ => Ok(()),
},
"reload_config" => Ok(()),
"verify_config" => match spec {
RunSpec::ConfigValidate { domain, .. }
| RunSpec::ConfigApply { domain, .. }
| RunSpec::ConfigRollback { domain } => {
let Some(source) = state.config.source(*domain) else {
return Err("config domain not configured".to_string());
};
let (bytes, _) = source
.load_bytes()
.await
.map_err(|e| format!("failed to load config: {e}"))?;
let bytes = bytes.unwrap_or_else(|| b"null".to_vec());
let v: serde_json::Value = serde_json::from_slice(&bytes)
.map_err(|e| format!("invalid stored json: {e}"))?;
match domain {
ConfigDomain::Routing => {
let cfg = serde_json::from_value::<RoutingConfig>(v)
.map_err(|e| format!("invalid routing config: {e}"))?;
validate_routing_semantic(&cfg)?;
Ok(())
}
ConfigDomain::Placement => {
let cfg = serde_json::from_value::<PlacementFile>(v)
.map_err(|e| format!("invalid placement config: {e}"))?;
validate_placement_semantic(state, &cfg)?;
Ok(())
}
}
}
_ => Ok(()),
},
"platform_verify" => match spec {
RunSpec::PlatformVerify => {
let snapshots = fleet::snapshot(&state.http, &state.fleet_services).await;
let bad: Vec<_> = snapshots
.into_iter()
.filter(|s| !(s.health_ok && s.ready_ok))
.map(|s| {
format!(
"{} health_ok={} ready_ok={}",
s.name, s.health_ok, s.ready_ok
)
})
.collect();
if !bad.is_empty() {
return Err(format!("platform verify failed: {}", bad.join("; ")));
}
Ok(())
}
_ => Ok(()),
},
_ => Ok(()),
}
}
/// Builds a sibling config source that holds the pre-apply backup for
/// `domain`, derived from the primary source's location:
/// - File: same path, extension replaced with `.<domain>.bak.json`.
/// - NATS KV: same bucket, key suffixed with `.bak` (requires the
///   `CONTROL_CONFIG_NATS_URL` env var to (re)connect).
/// - Fixed: backups are unsupported; returns an error.
async fn backup_source_for(
    info: &crate::config_registry::ConfigSourceInfo,
    domain: ConfigDomain,
) -> Result<Arc<dyn crate::config_registry::ConfigSource>, ConfigRegistryError> {
    use crate::config_registry::{ConfigSource, FileSource, NatsKvSource};
    match info {
        crate::config_registry::ConfigSourceInfo::File { path } => Ok(Arc::new(FileSource::new(
            PathBuf::from(path).with_extension(format!("{}.bak.json", domain.as_str())),
        ))
            as Arc<dyn ConfigSource>),
        crate::config_registry::ConfigSourceInfo::NatsKv { bucket, key } => {
            // A backup KV source needs its own connection to the same server.
            let nats_url = std::env::var("CONTROL_CONFIG_NATS_URL").map_err(|_| {
                ConfigRegistryError::Source("missing CONTROL_CONFIG_NATS_URL".to_string())
            })?;
            Ok(Arc::new(
                NatsKvSource::connect(nats_url, bucket.clone(), format!("{key}.bak"))
                    .await
                    .map_err(|e| ConfigRegistryError::Source(e.to_string()))?,
            ) as Arc<dyn ConfigSource>)
        }
        crate::config_registry::ConfigSourceInfo::Fixed => Err(ConfigRegistryError::Source(
            "no backups for fixed source".to_string(),
        )),
    }
}
/// Semantic validation for a `RoutingConfig`:
/// - every shard must list at least one endpoint,
/// - every endpoint must be an absolute http(s) URL with a host,
/// - every placement entry must reference an existing shard id.
///
/// Returns `Ok(())` or the first human-readable violation found.
fn validate_routing_semantic(cfg: &RoutingConfig) -> Result<(), String> {
    let shard_maps = [
        ("aggregate_shards", &cfg.aggregate_shards),
        ("projection_shards", &cfg.projection_shards),
        ("runner_shards", &cfg.runner_shards),
    ];
    for (map_name, shard_map) in shard_maps {
        for (sid, endpoints) in shard_map {
            if endpoints.is_empty() {
                return Err(format!("{}[{}] has no endpoints", map_name, sid));
            }
            for endpoint in endpoints {
                let parsed = Url::parse(endpoint).map_err(|e| {
                    format!("{}[{}] invalid endpoint {:?}: {}", map_name, sid, endpoint, e)
                })?;
                if !matches!(parsed.scheme(), "http" | "https") {
                    return Err(format!(
                        "{}[{}] endpoint {:?} must be http(s)",
                        map_name, sid, endpoint
                    ));
                }
                if parsed.host_str().is_none() {
                    return Err(format!(
                        "{}[{}] endpoint {:?} must include host",
                        map_name, sid, endpoint
                    ));
                }
            }
        }
    }
    // Cross-check: placements may only reference shard ids defined above.
    let placements = [
        (
            "aggregate_placement",
            &cfg.aggregate_placement,
            &cfg.aggregate_shards,
        ),
        (
            "projection_placement",
            &cfg.projection_placement,
            &cfg.projection_shards,
        ),
        (
            "runner_placement",
            &cfg.runner_placement,
            &cfg.runner_shards,
        ),
    ];
    for (placement_name, placement_map, shard_map) in placements {
        for (tenant, sid) in placement_map {
            if sid.trim().is_empty() {
                return Err(format!("{}[{}] shard_id is empty", placement_name, tenant));
            }
            if !shard_map.contains_key(sid) {
                return Err(format!(
                    "{}[{}] references missing shard_id {:?}",
                    placement_name, tenant, sid
                ));
            }
        }
    }
    Ok(())
}
/// Enforces billing entitlements against a placement file.
///
/// Per tenant, runner targets are counted against `max_runners` while
/// aggregate + projection targets count against `max_deployments`. Also
/// rejects placements with no targets or blank target names. A no-op when
/// billing enforcement is disabled.
fn validate_placement_semantic(state: &AppState, cfg: &PlacementFile) -> Result<(), String> {
    if !state.billing_enforcement_enabled {
        return Ok(());
    }
    // tenant -> (deployment target count, runner target count)
    let mut per_tenant: std::collections::HashMap<_, (usize, usize)> =
        std::collections::HashMap::new();
    let sections = [
        ("aggregate_placement", cfg.aggregate_placement.as_ref()),
        ("projection_placement", cfg.projection_placement.as_ref()),
        ("runner_placement", cfg.runner_placement.as_ref()),
    ];
    for (section_name, section) in sections {
        let Some(section) = section else { continue };
        for placement in &section.placements {
            if placement.targets.is_empty() {
                return Err(format!(
                    "{section_name} tenant {} has no targets",
                    placement.tenant_id
                ));
            }
            if placement.targets.iter().any(|t| t.trim().is_empty()) {
                return Err(format!(
                    "{section_name} tenant {} has empty target",
                    placement.tenant_id
                ));
            }
            let counts = per_tenant.entry(placement.tenant_id).or_insert((0, 0));
            if section_name == "runner_placement" {
                counts.1 += placement.targets.len();
            } else {
                counts.0 += placement.targets.len();
            }
        }
    }
    for (tenant_id, (deployments, runners)) in per_tenant {
        let entitlements = state.billing.get_for_tenant(tenant_id).entitlements;
        if deployments > entitlements.max_deployments as usize {
            return Err(format!(
                "tenant {} exceeds max_deployments limit ({} > {})",
                tenant_id, deployments, entitlements.max_deployments
            ));
        }
        if runners > entitlements.max_runners as usize {
            return Err(format!(
                "tenant {} exceeds max_runners limit ({} > {})",
                tenant_id, runners, entitlements.max_runners
            ));
        }
    }
    Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use crate::billing::{BillingStore, Plan, SubscriptionStatus, TenantBillingState};
use crate::placement::{PlacementFile, PlacementKind, TenantPlacement};
/// Builds a minimal `AppState` around the given billing store, with default
/// or temp-path values for everything else — suitable for unit tests that
/// never hit the network.
fn mock_state(billing: BillingStore) -> AppState {
    let handle = crate::get_test_prometheus_handle();
    let root = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    AppState {
        prometheus: handle,
        auth: crate::AuthConfig {
            hs256_secret: Some(b"secret".to_vec()),
        },
        jobs: JobStore::default(),
        audit: AuditStore::default(),
        tenant_locks: TenantLocks::default(),
        config_locks: ConfigLocks::default(),
        http: reqwest::Client::new(),
        placement: crate::placement::PlacementStore::new(
            std::env::temp_dir().join("placement.json"),
        ),
        billing,
        billing_provider: Arc::new(crate::billing::MockProvider),
        // Enforcement on so entitlement checks are actually exercised.
        billing_enforcement_enabled: true,
        config: crate::config_registry::ConfigRegistry::new(None, None),
        fleet_services: vec![],
        swarm: crate::swarm::SwarmStore::new(root.join("swarm/dev.json")),
        docs: None,
    }
}
/// Exercises `validate_placement_semantic` against plan limits: the default
/// Free plan rejects two deployment targets, accepts one, and upgrading the
/// tenant to Pro raises the limits so the original config passes.
#[test]
fn test_validate_placement_limits() {
    let tenant_id = Uuid::new_v4();
    // Unique temp file so parallel test runs don't collide.
    let billing_path =
        std::env::temp_dir().join(format!("billing-unit-{}.json", Uuid::new_v4()));
    let billing = BillingStore::new(billing_path.clone());
    let state = mock_state(billing.clone());
    // 1. Free plan (default): max_deployments=1, max_runners=1
    let cfg = PlacementFile {
        revision: Some("v1".to_string()),
        aggregate_placement: Some(PlacementKind {
            placements: vec![TenantPlacement {
                tenant_id,
                targets: vec!["a1".to_string()],
            }],
        }),
        projection_placement: Some(PlacementKind {
            placements: vec![TenantPlacement {
                tenant_id,
                targets: vec!["p1".to_string()],
            }],
        }),
        runner_placement: Some(PlacementKind {
            placements: vec![TenantPlacement {
                tenant_id,
                targets: vec!["r1".to_string()],
            }],
        }),
    };
    // aggregate(1) + projection(1) = 2 deployments. Limit is 1. Should fail.
    let err = validate_placement_semantic(&state, &cfg).unwrap_err();
    assert!(err.contains("exceeds max_deployments limit"));
    // 2. Reduce to 1 deployment
    let cfg2 = PlacementFile {
        revision: Some("v2".to_string()),
        aggregate_placement: Some(PlacementKind {
            placements: vec![TenantPlacement {
                tenant_id,
                targets: vec!["a1".to_string()],
            }],
        }),
        projection_placement: None,
        runner_placement: Some(PlacementKind {
            placements: vec![TenantPlacement {
                tenant_id,
                targets: vec!["r1".to_string()],
            }],
        }),
    };
    validate_placement_semantic(&state, &cfg2).unwrap();
    // 3. Upgrade to Pro: max_deployments=10, max_runners=10
    billing
        .update_tenant_state(
            tenant_id,
            TenantBillingState {
                provider: "mock".to_string(),
                provider_customer_id: None,
                provider_subscription_id: None,
                provider_checkout_session_id: None,
                status: Some(SubscriptionStatus::Active),
                plan: Some(Plan::Pro),
                current_period_end: None,
                cancel_at_period_end: None,
                processed_webhook_event_ids: vec![],
                updated_at: 100,
            },
        )
        .unwrap();
    // Now the first cfg should pass
    validate_placement_semantic(&state, &cfg).unwrap();
    // Best-effort cleanup of the temp billing file.
    let _ = std::fs::remove_file(billing_path);
}
}

View File

@@ -1,14 +1,22 @@
mod admin;
mod audit;
mod auth;
pub mod billing;
mod build_info;
pub mod config_registry;
mod config_schemas;
mod deployments;
mod documents;
mod drift;
mod fleet;
mod job_engine;
mod jobs;
mod placement;
pub mod s3_docs;
mod swarm;
use std::sync::Arc;
pub use audit::AuditStore;
pub use auth::{AuthConfig, Principal};
use axum::{
@@ -20,8 +28,10 @@ use axum::{
routing::get,
};
pub use build_info::{BuildInfo, extract_build_info};
pub use config_registry::{ConfigDomain, ConfigRegistry};
pub use deployments::{DeployAnnotationArgs, GrafanaAnnotation, build_grafana_deploy_annotation};
pub use fleet::FleetService;
pub use job_engine::ConfigLocks;
pub use job_engine::TenantLocks;
pub use jobs::JobStore;
use metrics_exporter_prometheus::PrometheusHandle;
@@ -40,10 +50,16 @@ pub struct AppState {
pub jobs: JobStore,
pub audit: AuditStore,
pub tenant_locks: TenantLocks,
pub config_locks: ConfigLocks,
pub http: reqwest::Client,
pub placement: PlacementStore,
pub billing: billing::BillingStore,
pub billing_provider: Arc<dyn billing::BillingProvider>,
pub billing_enforcement_enabled: bool,
pub config: ConfigRegistry,
pub fleet_services: Vec<FleetService>,
pub swarm: SwarmStore,
pub docs: Option<s3_docs::DocsStore>,
}
#[derive(Clone, Debug)]
@@ -93,13 +109,18 @@ pub fn build_app(state: AppState) -> Router {
},
);
let admin =
admin::admin_router().layer(from_fn_with_state(state.clone(), auth::auth_middleware));
let admin = admin::admin_router()
.merge(documents::router())
.layer(from_fn_with_state(state.clone(), auth::auth_middleware));
Router::new()
.route("/health", get(health))
.route("/ready", get(ready))
.route("/metrics", get(metrics))
.route(
"/admin/v1/billing/webhooks/{provider}",
axum::routing::post(billing::webhook),
)
.nest("/admin/v1", admin)
.with_state(state)
.layer(trace)
@@ -167,25 +188,46 @@ async fn request_id_middleware(mut req: Request<axum::body::Body>, next: Next) -
res
}
#[cfg(test)]
static TEST_PROMETHEUS_HANDLE: std::sync::OnceLock<PrometheusHandle> = std::sync::OnceLock::new();
/// Returns a process-wide Prometheus handle for tests. The first caller
/// installs the global recorder; all later callers share the cached handle
/// via the `OnceLock` above.
#[cfg(test)]
pub(crate) fn get_test_prometheus_handle() -> PrometheusHandle {
    TEST_PROMETHEUS_HANDLE
        .get_or_init(|| {
            metrics_exporter_prometheus::PrometheusBuilder::new()
                .install_recorder()
                .unwrap_or_else(|_| {
                    // This can happen if another test already installed it.
                    // We might not get the ACTUAL handle to the global recorder here if we don't share it,
                    // but for tests it's usually fine to have a dummy one if we are not asserting on metrics.
                    metrics_exporter_prometheus::PrometheusBuilder::new()
                        .build()
                        .expect("failed to build prometheus recorder")
                        .0
                        .handle()
                })
        })
        .clone()
}
#[cfg(test)]
mod tests {
use super::*;
use crate::config_registry::{FileSource, FixedSource};
use crate::jobs::JobStatus;
use axum::{
body::Body,
http::{Request, StatusCode, header},
};
use jsonwebtoken::{EncodingKey, Header, encode};
use metrics_exporter_prometheus::PrometheusBuilder;
use serde::Serialize;
use std::fs;
use std::path::PathBuf;
use std::sync::OnceLock;
use std::sync::Arc;
use tower::ServiceExt;
use uuid::Uuid;
static HANDLE: OnceLock<PrometheusHandle> = OnceLock::new();
#[derive(Serialize)]
struct TestClaims {
sub: String,
@@ -199,15 +241,10 @@ mod tests {
}
fn test_app_with_fleet(fleet_services: Vec<FleetService>) -> Router {
let handle = HANDLE
.get_or_init(|| {
PrometheusBuilder::new()
.install_recorder()
.expect("failed to install prometheus recorder")
})
.clone();
let handle = get_test_prometheus_handle();
let placement_path = temp_placement_file();
let root = repo_root();
build_app(AppState {
prometheus: handle,
@@ -217,10 +254,23 @@ mod tests {
jobs: JobStore::default(),
audit: AuditStore::default(),
tenant_locks: TenantLocks::default(),
config_locks: ConfigLocks::default(),
http: reqwest::Client::new(),
placement: PlacementStore::new(placement_path),
billing: crate::billing::BillingStore::new(
std::env::temp_dir().join(format!("billing-test-{}.json", Uuid::new_v4())),
),
billing_provider: Arc::new(crate::billing::MockProvider),
billing_enforcement_enabled: true,
config: ConfigRegistry::new(
Some(Arc::new(FileSource::new(
root.join("config/routing/dev.json"),
))),
Some(Arc::new(FixedSource::new(b"{}".to_vec()))),
),
fleet_services,
swarm: SwarmStore::new(repo_root().join("swarm/dev.json")),
docs: None,
})
}
@@ -234,14 +284,14 @@ mod tests {
fn temp_placement_file() -> PathBuf {
let root = repo_root();
let src = root.join("placement/dev.json");
let src = root.join("config/placement/dev.json");
let mut dst = std::env::temp_dir();
dst.push(format!(
"cloudlysis-control-placement-{}-{}.json",
std::process::id(),
Uuid::new_v4()
));
let raw = fs::read_to_string(src).expect("missing placement/dev.json");
let raw = fs::read_to_string(src).expect("missing config/placement/dev.json");
fs::write(&dst, raw).expect("failed to write temp placement file");
dst
}
@@ -689,4 +739,467 @@ mod tests {
&serde_json::json!(["preflight", "drain", "update_placement", "reload", "verify"])
);
}
/// With a fresh (empty) billing store, the billing endpoint reports
/// `configured: false` and falls back to the free-plan entitlements
/// (max_deployments = 1).
#[tokio::test]
async fn billing_returns_not_configured_by_default() {
    let token = make_token(&["control:read"]);
    let tenant_id = Uuid::new_v4();
    let res = test_app()
        .oneshot(
            Request::builder()
                .uri(format!("/admin/v1/tenants/{tenant_id}/billing"))
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .header("x-tenant-id", tenant_id.to_string())
                .body(Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(res.status(), StatusCode::OK);
    let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
        .await
        .unwrap();
    let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
    assert_eq!(v.get("configured").unwrap(), &serde_json::json!(false));
    // Default (unconfigured) plan allows a single deployment.
    assert_eq!(
        v.get("entitlements")
            .unwrap()
            .get("max_deployments")
            .unwrap(),
        &serde_json::json!(1)
    );
}
/// After seeding the billing store with an active Pro subscription, the
/// billing endpoint reports `configured: true`, the "pro" plan, and the
/// Pro entitlements (max_deployments = 10).
#[tokio::test]
async fn billing_returns_configured_state() {
    let token = make_token(&["control:read"]);
    let tenant_id = Uuid::new_v4();
    let handle = get_test_prometheus_handle();
    // Unique temp file so parallel test runs don't collide.
    let billing_path =
        std::env::temp_dir().join(format!("billing-test-cfg-{}.json", Uuid::new_v4()));
    let billing = crate::billing::BillingStore::new(billing_path.clone());
    billing
        .update_tenant_state(
            tenant_id,
            crate::billing::TenantBillingState {
                provider: "stripe".to_string(),
                provider_customer_id: Some("cus_123".to_string()),
                provider_subscription_id: Some("sub_123".to_string()),
                provider_checkout_session_id: None,
                status: Some(crate::billing::SubscriptionStatus::Active),
                plan: Some(crate::billing::Plan::Pro),
                current_period_end: Some("2026-04-30T00:00:00Z".to_string()),
                cancel_at_period_end: Some(false),
                processed_webhook_event_ids: vec![],
                updated_at: 1234567890,
            },
        )
        .unwrap();
    let root = repo_root();
    // Hand-built app state: like test_app(), but with the seeded billing store.
    let app = build_app(AppState {
        prometheus: handle,
        auth: AuthConfig {
            hs256_secret: Some(b"test_secret".to_vec()),
        },
        jobs: JobStore::default(),
        audit: AuditStore::default(),
        tenant_locks: TenantLocks::default(),
        config_locks: ConfigLocks::default(),
        http: reqwest::Client::new(),
        placement: PlacementStore::new(temp_placement_file()),
        billing,
        billing_provider: Arc::new(crate::billing::MockProvider),
        billing_enforcement_enabled: true,
        config: ConfigRegistry::new(
            Some(Arc::new(FileSource::new(
                root.join("config/routing/dev.json"),
            ))),
            Some(Arc::new(FixedSource::new(b"{}".to_vec()))),
        ),
        fleet_services: vec![],
        swarm: SwarmStore::new(repo_root().join("swarm/dev.json")),
        docs: None,
    });
    let res = app
        .oneshot(
            Request::builder()
                .uri(format!("/admin/v1/tenants/{tenant_id}/billing"))
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .header("x-tenant-id", tenant_id.to_string())
                .body(Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(res.status(), StatusCode::OK);
    let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
        .await
        .unwrap();
    let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
    assert_eq!(v.get("configured").unwrap(), &serde_json::json!(true));
    assert_eq!(v.get("plan").unwrap(), &serde_json::json!("pro"));
    assert_eq!(
        v.get("entitlements")
            .unwrap()
            .get("max_deployments")
            .unwrap(),
        &serde_json::json!(10)
    );
    // Best-effort cleanup of the temp billing file.
    let _ = std::fs::remove_file(billing_path);
}
// Starting a "pro" checkout on a fresh app yields the mock provider's
// deterministic checkout URL for the tenant.
#[tokio::test]
async fn checkout_returns_mock_url() {
    let token = make_token(&["control:write"]);
    let tenant_id = Uuid::new_v4();
    let payload = serde_json::json!({
        "plan": "pro",
        "return_path": "/custom-return"
    });
    let request = Request::builder()
        .uri(format!("/admin/v1/tenants/{tenant_id}/billing/checkout"))
        .method("POST")
        .header(header::AUTHORIZATION, format!("Bearer {token}"))
        .header("x-tenant-id", tenant_id.to_string())
        .header(header::CONTENT_TYPE, "application/json")
        .body(Body::from(payload.to_string()))
        .unwrap();
    let response = test_app().oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);
    let bytes = axum::body::to_bytes(response.into_body(), 1024 * 1024)
        .await
        .unwrap();
    let parsed: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
    assert_eq!(
        parsed.get("url").unwrap(),
        &serde_json::json!(format!("https://mock.stripe.com/checkout/{}", tenant_id))
    );
}
// A tenant with an already-active subscription cannot start a second
// checkout: the endpoint answers 409 Conflict.
#[tokio::test]
async fn checkout_fails_if_already_active() {
    let token = make_token(&["control:write"]);
    let tenant_id = Uuid::new_v4();
    // Setup app with active subscription
    let billing_path =
        std::env::temp_dir().join(format!("billing-test-active-{}.json", Uuid::new_v4()));
    let billing = crate::billing::BillingStore::new(billing_path.clone());
    billing
        .update_tenant_state(
            tenant_id,
            crate::billing::TenantBillingState {
                provider: "mock".to_string(),
                provider_customer_id: None,
                provider_subscription_id: None,
                provider_checkout_session_id: None,
                status: Some(crate::billing::SubscriptionStatus::Active),
                plan: Some(crate::billing::Plan::Pro),
                current_period_end: None,
                cancel_at_period_end: None,
                processed_webhook_event_ids: vec![],
                updated_at: 0,
            },
        )
        .unwrap();
    // Build an app around the pre-seeded store (test_app() would use a
    // fresh, empty one).
    let handle = get_test_prometheus_handle();
    let root = repo_root();
    let app = build_app(AppState {
        prometheus: handle,
        auth: AuthConfig {
            hs256_secret: Some(b"test_secret".to_vec()),
        },
        jobs: JobStore::default(),
        audit: AuditStore::default(),
        tenant_locks: TenantLocks::default(),
        config_locks: ConfigLocks::default(),
        http: reqwest::Client::new(),
        placement: PlacementStore::new(temp_placement_file()),
        billing,
        billing_provider: Arc::new(crate::billing::MockProvider),
        billing_enforcement_enabled: true,
        config: ConfigRegistry::new(
            Some(Arc::new(FileSource::new(
                root.join("config/routing/dev.json"),
            ))),
            Some(Arc::new(FixedSource::new(b"{}".to_vec()))),
        ),
        fleet_services: vec![],
        swarm: SwarmStore::new(repo_root().join("swarm/dev.json")),
        docs: None,
    });
    let body = serde_json::json!({ "plan": "pro" });
    let res = app
        .oneshot(
            Request::builder()
                .uri(format!("/admin/v1/tenants/{tenant_id}/billing/checkout"))
                .method("POST")
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .header("x-tenant-id", tenant_id.to_string())
                .header(header::CONTENT_TYPE, "application/json")
                .body(Body::from(body.to_string()))
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(res.status(), StatusCode::CONFLICT);
    // Best-effort cleanup of the temp billing file.
    let _ = std::fs::remove_file(billing_path);
}
// Opening the billing portal yields the mock provider's deterministic
// portal URL for the tenant.
#[tokio::test]
async fn portal_returns_mock_url() {
    let token = make_token(&["control:write"]);
    let tenant_id = Uuid::new_v4();
    let request = Request::builder()
        .uri(format!("/admin/v1/tenants/{tenant_id}/billing/portal"))
        .method("POST")
        .header(header::AUTHORIZATION, format!("Bearer {token}"))
        .header("x-tenant-id", tenant_id.to_string())
        .body(Body::empty())
        .unwrap();
    let response = test_app().oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);
    let bytes = axum::body::to_bytes(response.into_body(), 1024 * 1024)
        .await
        .unwrap();
    let parsed: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
    assert_eq!(
        parsed.get("url").unwrap(),
        &serde_json::json!(format!("https://mock.stripe.com/portal/{}", tenant_id))
    );
}
// A SubscriptionCreated webhook provisions billing state, and replaying the
// exact same event id is accepted without error (idempotency).
#[tokio::test]
async fn webhook_updates_state_idempotently() {
    let tenant_id = Uuid::new_v4();
    let event_id = "evt_123".to_string();
    let app = test_app();
    let event = crate::billing::BillingEvent::SubscriptionCreated {
        tenant_id,
        event_id: event_id.clone(),
        provider_customer_id: "cus_123".to_string(),
        provider_subscription_id: "sub_123".to_string(),
        status: crate::billing::SubscriptionStatus::Active,
        plan: crate::billing::Plan::Pro,
        current_period_end: "2026-04-30T00:00:00Z".to_string(),
        ts_ms: 1000,
    };
    let body = serde_json::to_string(&event).unwrap();
    // 1. Send webhook
    let res = app
        .clone()
        .oneshot(
            Request::builder()
                .uri("/admin/v1/billing/webhooks/mock")
                .method("POST")
                .header(header::CONTENT_TYPE, "application/json")
                .body(Body::from(body.clone()))
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(res.status(), StatusCode::OK);
    // 2. Verify state
    let token = make_token(&["control:read"]);
    let res = app
        .clone()
        .oneshot(
            Request::builder()
                .uri(format!("/admin/v1/tenants/{tenant_id}/billing"))
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .header("x-tenant-id", tenant_id.to_string())
                .body(Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    let body_bytes = axum::body::to_bytes(res.into_body(), 1024 * 1024)
        .await
        .unwrap();
    let v: serde_json::Value = serde_json::from_slice(&body_bytes).unwrap();
    assert_eq!(v.get("configured").unwrap(), &serde_json::json!(true));
    assert_eq!(v.get("plan").unwrap(), &serde_json::json!("pro"));
    // 3. Send same webhook again (idempotency)
    let res = app
        .clone()
        .oneshot(
            Request::builder()
                .uri("/admin/v1/billing/webhooks/mock")
                .method("POST")
                .header(header::CONTENT_TYPE, "application/json")
                .body(Body::from(body))
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(res.status(), StatusCode::OK);
}
// A webhook carrying an older `ts_ms` than the stored state must not
// overwrite newer data: after a fresh (ts=2000) upgrade to Enterprise, a
// stale (ts=1000) downgrade event is ignored.
#[tokio::test]
async fn webhook_ignores_stale_events() {
    let tenant_id = Uuid::new_v4();
    let app = test_app();
    // 1. Send recent event (ts=2000)
    let event1 = crate::billing::BillingEvent::SubscriptionUpdated {
        tenant_id,
        event_id: "evt_new".to_string(),
        status: crate::billing::SubscriptionStatus::Active,
        plan: crate::billing::Plan::Enterprise,
        current_period_end: "2026-05-30T00:00:00Z".to_string(),
        cancel_at_period_end: false,
        ts_ms: 2000,
    };
    let res = app
        .clone()
        .oneshot(
            Request::builder()
                .uri("/admin/v1/billing/webhooks/mock")
                .method("POST")
                // Declare the JSON payload, matching the other webhook tests.
                .header(header::CONTENT_TYPE, "application/json")
                .body(Body::from(serde_json::to_string(&event1).unwrap()))
                .unwrap(),
        )
        .await
        .unwrap();
    // Fail fast if the first webhook was rejected; otherwise the final plan
    // assertion below would fail with a misleading message.
    assert!(res.status().is_success());
    // 2. Send stale event (ts=1000)
    let event2 = crate::billing::BillingEvent::SubscriptionUpdated {
        tenant_id,
        event_id: "evt_old".to_string(),
        status: crate::billing::SubscriptionStatus::PastDue,
        plan: crate::billing::Plan::Pro,
        current_period_end: "2026-04-30T00:00:00Z".to_string(),
        cancel_at_period_end: false,
        ts_ms: 1000,
    };
    app.clone()
        .oneshot(
            Request::builder()
                .uri("/admin/v1/billing/webhooks/mock")
                .method("POST")
                .header(header::CONTENT_TYPE, "application/json")
                .body(Body::from(serde_json::to_string(&event2).unwrap()))
                .unwrap(),
        )
        .await
        .unwrap();
    // 3. Verify state is still Enterprise
    let token = make_token(&["control:read"]);
    let res = app
        .clone()
        .oneshot(
            Request::builder()
                .uri(format!("/admin/v1/tenants/{tenant_id}/billing"))
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .header("x-tenant-id", tenant_id.to_string())
                .body(Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    let body_bytes = axum::body::to_bytes(res.into_body(), 1024 * 1024)
        .await
        .unwrap();
    let v: serde_json::Value = serde_json::from_slice(&body_bytes).unwrap();
    assert_eq!(v.get("plan").unwrap(), &serde_json::json!("enterprise"));
}
// Entitlement gate on the docs API: listing docs on the default (free) plan
// is rejected with 402; after a webhook upgrades the tenant to Pro, the same
// request passes the gate (and then fails 503 only because no docs backend
// is configured in test_app()).
#[tokio::test]
async fn s3_docs_requires_pro_plan() {
    let token = make_token(&["control:read", "control:write"]);
    let tenant_id = Uuid::new_v4();
    let app = test_app();
    // 1. Try to list docs (Free plan by default)
    let res = app
        .clone()
        .oneshot(
            Request::builder()
                .uri(format!("/admin/v1/tenants/{tenant_id}/docs"))
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .header("x-tenant-id", tenant_id.to_string())
                .body(Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(res.status(), StatusCode::PAYMENT_REQUIRED);
    // 2. Update to Pro plan via webhook
    let event = crate::billing::BillingEvent::SubscriptionCreated {
        tenant_id,
        event_id: "evt_pro".to_string(),
        provider_customer_id: "cus_pro".to_string(),
        provider_subscription_id: "sub_pro".to_string(),
        status: crate::billing::SubscriptionStatus::Active,
        plan: crate::billing::Plan::Pro,
        current_period_end: "2099-01-01T00:00:00Z".to_string(),
        ts_ms: 2000,
    };
    app.clone()
        .oneshot(
            Request::builder()
                .uri("/admin/v1/billing/webhooks/mock")
                .method("POST")
                .header(header::CONTENT_TYPE, "application/json")
                .body(Body::from(serde_json::to_string(&event).unwrap()))
                .unwrap(),
        )
        .await
        .unwrap();
    // 3. Try to list docs again (Should fail with 503 if S3 not configured in tests, or 200/502 if it is)
    // In test_app(), docs is None by default.
    let res = app
        .clone()
        .oneshot(
            Request::builder()
                .uri(format!("/admin/v1/tenants/{tenant_id}/docs"))
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .header("x-tenant-id", tenant_id.to_string())
                .body(Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    // Since docs is None in test_app(), it returns SERVICE_UNAVAILABLE (503) AFTER passing the entitlement check.
    // If it was still PAYMENT_REQUIRED, it would return 402.
    assert_eq!(res.status(), StatusCode::SERVICE_UNAVAILABLE);
}
}

View File

@@ -1,6 +1,8 @@
use clap::Parser;
use metrics_exporter_prometheus::PrometheusBuilder;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;
use tracing_subscriber::EnvFilter;
#[derive(Parser, Debug)]
@@ -33,15 +35,31 @@ async fn main() {
.build()
.expect("failed to build http client");
let placement_path = std::env::var("CONTROL_PLACEMENT_PATH")
let placement_path: PathBuf = std::env::var("CONTROL_PLACEMENT_PATH")
.ok()
.unwrap_or_else(|| "placement/dev.json".to_string())
.unwrap_or_else(|| "config/placement/dev.json".to_string())
.into();
let swarm_path = std::env::var("CONTROL_SWARM_STATE_PATH")
let billing_path: PathBuf = std::env::var("CONTROL_BILLING_STATE_PATH")
.ok()
.unwrap_or_else(|| "billing/dev.json".to_string())
.into();
let routing_path: PathBuf = std::env::var("CONTROL_ROUTING_PATH")
.ok()
.unwrap_or_else(|| "config/routing/dev.json".to_string())
.into();
let swarm_mode = std::env::var("CONTROL_SWARM_MODE").ok();
let swarm = if swarm_mode.as_deref() == Some("docker") {
api::SwarmStore::new_docker_cli()
} else {
let swarm_path: PathBuf = std::env::var("CONTROL_SWARM_STATE_PATH")
.ok()
.unwrap_or_else(|| "swarm/dev.json".to_string())
.into();
api::SwarmStore::new(swarm_path)
};
let self_url = std::env::var("CONTROL_SELF_URL")
.ok()
@@ -55,7 +73,70 @@ async fn main() {
fleet_services.extend(parse_fleet_services(&spec));
}
let app = api::build_app(api::AppState {
let docs_cfg =
api::s3_docs::DocsConfig::from_env().expect("missing S3 document storage configuration");
let docs = api::s3_docs::DocsStore::new(docs_cfg)
.await
.expect("failed to initialize S3 document storage client");
let config = {
let routing = if let (Ok(nats_url), Ok(bucket), Ok(key)) = (
std::env::var("CONTROL_ROUTING_NATS_URL"),
std::env::var("CONTROL_ROUTING_NATS_BUCKET"),
std::env::var("CONTROL_ROUTING_NATS_KEY"),
) {
Some(Arc::new(
api::config_registry::NatsKvSource::connect(nats_url, bucket, key)
.await
.expect("failed to connect to routing config nats kv"),
) as Arc<dyn api::config_registry::ConfigSource>)
} else {
Some(
Arc::new(api::config_registry::FileSource::new(routing_path))
as Arc<dyn api::config_registry::ConfigSource>,
)
};
let placement = if let (Ok(nats_url), Ok(bucket), Ok(key)) = (
std::env::var("CONTROL_PLACEMENT_NATS_URL"),
std::env::var("CONTROL_PLACEMENT_NATS_BUCKET"),
std::env::var("CONTROL_PLACEMENT_NATS_KEY"),
) {
Some(Arc::new(
api::config_registry::NatsKvSource::connect(nats_url, bucket, key)
.await
.expect("failed to connect to placement config nats kv"),
) as Arc<dyn api::config_registry::ConfigSource>)
} else {
Some(Arc::new(api::config_registry::FileSource::new(
placement_path.clone(),
))
as Arc<dyn api::config_registry::ConfigSource>)
};
api::ConfigRegistry::new(routing, placement)
};
let billing_provider: Arc<dyn api::billing::BillingProvider> =
match std::env::var("CONTROL_BILLING_PROVIDER").as_deref() {
Ok("stripe") => {
let secret_key = std::env::var("CONTROL_STRIPE_SECRET_KEY")
.expect("CONTROL_STRIPE_SECRET_KEY required for stripe provider");
let price_pro = std::env::var("CONTROL_STRIPE_PRICE_ID_PRO")
.expect("CONTROL_STRIPE_PRICE_ID_PRO required for stripe provider");
let price_enterprise = std::env::var("CONTROL_STRIPE_PRICE_ID_ENTERPRISE")
.expect("CONTROL_STRIPE_PRICE_ID_ENTERPRISE required for stripe provider");
Arc::new(api::billing::StripeProvider {
secret_key,
price_pro,
price_enterprise,
})
}
_ => Arc::new(api::billing::MockProvider),
};
let state = api::AppState {
prometheus: recorder,
auth: api::AuthConfig {
hs256_secret: std::env::var("CONTROL_GATEWAY_JWT_HS256_SECRET")
@@ -65,11 +146,25 @@ async fn main() {
jobs: api::JobStore::default(),
audit: api::AuditStore::default(),
tenant_locks: api::TenantLocks::default(),
config_locks: api::ConfigLocks::default(),
http,
placement: api::PlacementStore::new(placement_path),
billing: api::billing::BillingStore::new(billing_path),
billing_provider,
billing_enforcement_enabled: std::env::var("CONTROL_BILLING_ENFORCEMENT_ENABLED")
.ok()
.and_then(|s| s.parse().ok())
.unwrap_or(false),
config,
fleet_services,
swarm: api::SwarmStore::new(swarm_path),
});
swarm,
docs: Some(docs),
};
// Spawn reconciliation loop
tokio::spawn(api::billing::run_reconciliation_loop(state.clone()));
let app = api::build_app(state);
let listener = tokio::net::TcpListener::bind(args.addr)
.await

View File

@@ -157,6 +157,7 @@ impl PlacementStore {
&self,
tenant_id: Uuid,
runner_target: String,
max_runners: usize,
) -> Result<String, String> {
let mut inner = self.inner.write().expect("placement lock poisoned");
inner.reload_if_changed();
@@ -178,8 +179,17 @@ impl PlacementStore {
.iter_mut()
.find(|p| p.tenant_id == tenant_id)
{
// If already at or above limit, and we are adding a NEW target (not replacing), it would fail.
// But here update_runner_target REPLACES the target list with a single target for now.
// If in the future we want to append, we check targets.len().
if 1 > max_runners {
return Err(format!("exceeds max_runners limit of {}", max_runners));
}
existing.targets = vec![runner_target];
} else {
if 1 > max_runners {
return Err(format!("exceeds max_runners limit of {}", max_runners));
}
runner.placements.push(TenantPlacement {
tenant_id,
targets: vec![runner_target],

508
control/api/src/s3_docs.rs Normal file
View File

@@ -0,0 +1,508 @@
use aws_config::Region;
use aws_credential_types::Credentials;
use aws_sdk_s3::presigning::PresigningConfig;
use aws_sdk_s3::types::BucketCannedAcl;
use aws_sdk_s3::{Client, config::Builder as S3ConfigBuilder};
use sha2::Digest;
use std::time::Duration;
/// Connection and layout settings for the tenant document store (S3-compatible).
#[derive(Clone, Debug)]
pub struct DocsConfig {
    /// Endpoint the control plane itself talks to (e.g. an in-cluster MinIO URL).
    pub endpoint: String,
    /// Optional externally reachable endpoint, used only when presigning URLs.
    pub public_endpoint: Option<String>,
    pub region: String,
    pub access_key_id: String,
    pub secret_access_key: String,
    /// Path-style addressing (bucket in the path, not the hostname) —
    /// commonly needed for MinIO-style endpoints.
    pub force_path_style: bool,
    /// Marks a plain-HTTP local setup; `from_env` rejects this flag for
    /// https:// endpoints (TLS verification is never disabled).
    pub insecure: bool,
    /// One or more buckets; tenants are deterministically sharded across them.
    pub buckets: Vec<String>,
    /// Object-key prefix, normalized by `from_env` to end with '/'.
    pub prefix: String,
}
impl DocsConfig {
    /// Builds the config from environment variables.
    ///
    /// Every setting accepts a `CONTROL_S3_*` variable with a bare `S3_*`
    /// fallback; the credentials additionally accept a `*_FILE` variant that
    /// names a file whose trimmed contents hold the secret. Returns a
    /// human-readable error string when a required setting is missing or the
    /// insecure/https combination is requested.
    pub fn from_env() -> Result<Self, String> {
        // Reads an env var, trimming and treating empty values as unset.
        fn get(name: &str) -> Option<String> {
            std::env::var(name)
                .ok()
                .map(|s| s.trim().to_string())
                .filter(|s| !s.is_empty())
        }
        // File-based secret takes precedence over the plain env var.
        fn get_secret(name: &str, file_name: &str) -> Result<Option<String>, String> {
            if let Some(path) = get(file_name) {
                let raw = std::fs::read_to_string(path).map_err(|e| e.to_string())?;
                let v = raw.trim().to_string();
                if v.is_empty() {
                    return Ok(None);
                }
                return Ok(Some(v));
            }
            Ok(get(name))
        }
        let endpoint = get("CONTROL_S3_ENDPOINT")
            .or_else(|| get("S3_ENDPOINT"))
            .ok_or_else(|| "Missing CONTROL_S3_ENDPOINT".to_string())?;
        let public_endpoint =
            get("CONTROL_S3_PUBLIC_ENDPOINT").or_else(|| get("S3_PUBLIC_ENDPOINT"));
        let region = get("CONTROL_S3_REGION")
            .or_else(|| get("S3_REGION"))
            .unwrap_or_else(|| "us-east-1".to_string());
        // NOTE(review): the S3_* fallback lookups use `.ok().flatten()`, so a
        // read error on S3_*_FILE is silently treated as "unset" while the
        // same error on the CONTROL_S3_* variant propagates — confirm this
        // asymmetry is intentional.
        let access_key_id =
            get_secret("CONTROL_S3_ACCESS_KEY_ID", "CONTROL_S3_ACCESS_KEY_ID_FILE")?
                .or_else(|| {
                    get_secret("S3_ACCESS_KEY_ID", "S3_ACCESS_KEY_ID_FILE")
                        .ok()
                        .flatten()
                })
                .ok_or_else(|| "Missing CONTROL_S3_ACCESS_KEY_ID".to_string())?;
        let secret_access_key = get_secret(
            "CONTROL_S3_SECRET_ACCESS_KEY",
            "CONTROL_S3_SECRET_ACCESS_KEY_FILE",
        )?
        .or_else(|| {
            get_secret("S3_SECRET_ACCESS_KEY", "S3_SECRET_ACCESS_KEY_FILE")
                .ok()
                .flatten()
        })
        .ok_or_else(|| "Missing CONTROL_S3_SECRET_ACCESS_KEY".to_string())?;
        // Only the literal strings "true" / "1" enable the boolean flags.
        let force_path_style = get("CONTROL_S3_FORCE_PATH_STYLE")
            .or_else(|| get("S3_FORCE_PATH_STYLE"))
            .as_deref()
            .map(|v| v == "true" || v == "1")
            .unwrap_or(true);
        let insecure = get("CONTROL_S3_INSECURE")
            .or_else(|| get("S3_INSECURE"))
            .as_deref()
            .map(|v| v == "true" || v == "1")
            .unwrap_or(false);
        // Comma-separated bucket list; blanks are dropped.
        let bucket_raw = get("CONTROL_S3_BUCKET_DOCS")
            .or_else(|| get("S3_BUCKET_DOCS"))
            .ok_or_else(|| "Missing CONTROL_S3_BUCKET_DOCS".to_string())?;
        let buckets: Vec<String> = bucket_raw
            .split(',')
            .map(|s| s.trim().to_string())
            .filter(|s| !s.is_empty())
            .collect();
        if buckets.is_empty() {
            return Err("Missing CONTROL_S3_BUCKET_DOCS".to_string());
        }
        // Normalize the key prefix so it always ends with '/'.
        let prefix = get("CONTROL_S3_PREFIX_DOCS")
            .or_else(|| get("S3_PREFIX_DOCS"))
            .unwrap_or_else(|| "docs/".to_string());
        let prefix = if prefix.ends_with('/') {
            prefix
        } else {
            format!("{prefix}/")
        };
        // SECURITY: `*_INSECURE=true` is intended for local MinIO setups that use plain HTTP.
        // We currently do not disable TLS certificate verification for HTTPS endpoints.
        if insecure && endpoint.trim_start().starts_with("https://") {
            return Err(
                "CONTROL_S3_INSECURE=true is not supported with https:// endpoints (TLS verification is not disabled). Use http:// for local MinIO, or set CONTROL_S3_INSECURE=false for production."
                    .to_string(),
            );
        }
        Ok(Self {
            endpoint,
            public_endpoint,
            region,
            access_key_id,
            secret_access_key,
            force_path_style,
            insecure,
            buckets,
            prefix,
        })
    }
}
/// S3 client wrapper scoped to the configured docs bucket set.
#[derive(Clone)]
pub struct DocsStore {
    cfg: DocsConfig,
    /// Client bound to the internal endpoint; used for all direct operations.
    client: Client,
    /// Client bound to the public endpoint (when configured); used only to
    /// presign URLs handed to external callers.
    presign_client: Client,
}
impl DocsStore {
    /// Builds the two S3 clients from static credentials: one against the
    /// internal endpoint for direct operations and one against the public
    /// endpoint (when set) used only for presigning, since a presigned URL
    /// embeds the endpoint it was signed against.
    pub async fn new(cfg: DocsConfig) -> Result<Self, String> {
        let creds = Credentials::new(
            cfg.access_key_id.clone(),
            cfg.secret_access_key.clone(),
            None,
            None,
            "static",
        );
        let shared = aws_config::from_env()
            .region(Region::new(cfg.region.clone()))
            .credentials_provider(creds.clone())
            .endpoint_url(cfg.endpoint.clone())
            .load()
            .await;
        let s3_conf = S3ConfigBuilder::from(&shared)
            .force_path_style(cfg.force_path_style)
            .build();
        let client = Client::from_conf(s3_conf);
        // Fall back to the internal endpoint when no public one is configured.
        let presign_endpoint = cfg
            .public_endpoint
            .clone()
            .unwrap_or_else(|| cfg.endpoint.clone());
        let presign_shared = aws_config::from_env()
            .region(Region::new(cfg.region.clone()))
            .credentials_provider(creds)
            .endpoint_url(presign_endpoint)
            .load()
            .await;
        let presign_conf = S3ConfigBuilder::from(&presign_shared)
            .force_path_style(cfg.force_path_style)
            .build();
        let presign_client = Client::from_conf(presign_conf);
        Ok(Self {
            cfg,
            client,
            presign_client,
        })
    }
    /// Builds the object key `{prefix}{tenant_id}/{doc_type}/{doc_id}/{filename}`
    /// after validating each caller-supplied segment against path traversal.
    pub fn key_for(
        &self,
        tenant_id: &str,
        doc_type: &str,
        doc_id: &str,
        filename: &str,
    ) -> Result<String, String> {
        validate_segment("tenant_id", tenant_id)?;
        validate_segment("doc_type", doc_type)?;
        validate_segment("doc_id", doc_id)?;
        validate_filename(filename)?;
        Ok(format!(
            "{}{}/{}/{}/{}",
            self.cfg.prefix, tenant_id, doc_type, doc_id, filename
        ))
    }
    /// Configured key prefix (always '/'-terminated).
    pub fn prefix(&self) -> &str {
        self.cfg.prefix.as_str()
    }
    /// Full configured bucket list.
    pub fn buckets(&self) -> &[String] {
        self.cfg.buckets.as_slice()
    }
    /// Picks the bucket for a tenant by hashing the tenant id (SHA-256, first
    /// 8 bytes as a big-endian u64, modulo bucket count).
    fn bucket_for_tenant(&self, tenant_id: &str) -> &str {
        // Deterministic sharding across buckets. Note: if the bucket list changes, the mapping changes.
        // For production, set the full planned bucket set up-front (e.g. `-0,-1,-2`) to keep mapping stable.
        let n = self.cfg.buckets.len();
        if n == 1 {
            return self.cfg.buckets[0].as_str();
        }
        let mut hasher = sha2::Sha256::new();
        hasher.update(tenant_id.as_bytes());
        let digest = hasher.finalize();
        let mut b = [0u8; 8];
        b.copy_from_slice(&digest[..8]);
        let v = u64::from_be_bytes(b);
        let idx = (v as usize) % n;
        self.cfg.buckets[idx].as_str()
    }
    /// Lowercase hex SHA-256 digest of `bytes`.
    pub fn content_hash_sha256_hex(bytes: &[u8]) -> String {
        let mut hasher = sha2::Sha256::new();
        hasher.update(bytes);
        let digest = hasher.finalize();
        let mut out = String::with_capacity(digest.len() * 2);
        for b in digest {
            use std::fmt::Write;
            let _ = write!(&mut out, "{:02x}", b);
        }
        out
    }
    /// Uploads `bytes` under `key` in the tenant's shard bucket, optionally
    /// tagging the object with a content type.
    pub async fn put_for_tenant(
        &self,
        tenant_id: &str,
        key: &str,
        bytes: Vec<u8>,
        content_type: Option<String>,
    ) -> Result<(), String> {
        let mut req = self
            .client
            .put_object()
            .bucket(self.bucket_for_tenant(tenant_id))
            .key(key)
            .body(aws_sdk_s3::primitives::ByteStream::from(bytes));
        if let Some(ct) = content_type {
            req = req.content_type(ct);
        }
        req.send().await.map_err(|e| e.to_string())?;
        Ok(())
    }
    /// Downloads an object, returning its bytes and stored content type.
    pub async fn get_bytes_for_tenant(
        &self,
        tenant_id: &str,
        key: &str,
    ) -> Result<(Vec<u8>, Option<String>), String> {
        let out = self
            .client
            .get_object()
            .bucket(self.bucket_for_tenant(tenant_id))
            .key(key)
            .send()
            .await
            .map_err(|e| e.to_string())?;
        let ct = out.content_type().map(|s| s.to_string());
        // Buffers the whole body in memory.
        let bytes = out
            .body
            .collect()
            .await
            .map_err(|e| e.to_string())?
            .into_bytes()
            .to_vec();
        Ok((bytes, ct))
    }
    /// Deletes one object from the tenant's shard bucket.
    pub async fn delete_for_tenant(&self, tenant_id: &str, key: &str) -> Result<(), String> {
        self.client
            .delete_object()
            .bucket(self.bucket_for_tenant(tenant_id))
            .key(key)
            .send()
            .await
            .map_err(|e| e.to_string())?;
        Ok(())
    }
    /// Lists objects under `prefix` in the tenant's shard bucket.
    ///
    /// NOTE(review): this issues a single ListObjectsV2 call and does not
    /// follow continuation tokens, so results are truncated at the service's
    /// per-page cap (1000 keys on S3) — confirm acceptable for expected
    /// per-tenant object counts.
    pub async fn list_for_tenant(
        &self,
        tenant_id: &str,
        prefix: &str,
    ) -> Result<Vec<DocObject>, String> {
        let out = self
            .client
            .list_objects_v2()
            .bucket(self.bucket_for_tenant(tenant_id))
            .prefix(prefix)
            .send()
            .await
            .map_err(|e| e.to_string())?;
        let mut items = Vec::new();
        for o in out.contents() {
            if let Some(key) = o.key() {
                items.push(DocObject {
                    key: key.to_string(),
                    size: o.size().unwrap_or(0),
                    last_modified: o.last_modified().map(|d| d.to_string()),
                });
            }
        }
        Ok(items)
    }
    /// Creates any configured bucket that does not already exist (private ACL).
    ///
    /// NOTE(review): any head_bucket failure (including auth errors) is
    /// treated as "bucket missing" and triggers create_bucket; the create
    /// error is what surfaces in that case — confirm acceptable.
    pub async fn ensure_buckets_exist(&self) -> Result<(), String> {
        for bucket in &self.cfg.buckets {
            let head = self.client.head_bucket().bucket(bucket).send().await;
            if head.is_ok() {
                continue;
            }
            self.client
                .create_bucket()
                .bucket(bucket)
                .acl(BucketCannedAcl::Private)
                .send()
                .await
                .map_err(|e| e.to_string())?;
        }
        Ok(())
    }
    /// Presigns an upload URL (PUT) against the public endpoint, valid for
    /// `expires`.
    pub async fn presign_put_for_tenant(
        &self,
        tenant_id: &str,
        key: &str,
        content_type: Option<String>,
        expires: Duration,
    ) -> Result<String, String> {
        let mut req = self
            .presign_client
            .put_object()
            .bucket(self.bucket_for_tenant(tenant_id))
            .key(key);
        if let Some(ct) = content_type {
            // The content type becomes part of the signature; the uploader
            // must send the same header.
            req = req.content_type(ct);
        }
        let presigned = req
            .presigned(PresigningConfig::expires_in(expires).map_err(|e| e.to_string())?)
            .await
            .map_err(|e| e.to_string())?;
        Ok(presigned.uri().to_string())
    }
    /// Presigns a download URL (GET) against the public endpoint, valid for
    /// `expires`.
    pub async fn presign_get_for_tenant(
        &self,
        tenant_id: &str,
        key: &str,
        expires: Duration,
    ) -> Result<String, String> {
        let req = self
            .presign_client
            .get_object()
            .bucket(self.bucket_for_tenant(tenant_id))
            .key(key);
        let presigned = req
            .presigned(PresigningConfig::expires_in(expires).map_err(|e| e.to_string())?)
            .await
            .map_err(|e| e.to_string())?;
        Ok(presigned.uri().to_string())
    }
}
/// One listed object: key, size in bytes, and last-modified timestamp as
/// reported by the S3 service.
#[derive(Clone, Debug, serde::Serialize)]
pub struct DocObject {
    pub key: String,
    pub size: i64,
    pub last_modified: Option<String>,
}
/// Checks that a single key-path segment is non-empty, at most 128 bytes,
/// and free of path-traversal characters ('/', '\\', "..").
/// `name` is only used to label the error message.
fn validate_segment(name: &str, value: &str) -> Result<(), String> {
    if value.is_empty() {
        return Err(format!("{name} is required"));
    }
    if value.len() > 128 {
        return Err(format!("{name} too long"));
    }
    let has_separator = value.contains('/') || value.contains('\\');
    if has_separator || value.contains("..") {
        return Err(format!("{name} contains invalid characters"));
    }
    Ok(())
}
/// Filenames obey the same rules as any other key segment.
fn validate_filename(value: &str) -> Result<(), String> {
    validate_segment("filename", value)
}
#[cfg(test)]
mod tests {
    use super::*;
    // Serializes env-mutating tests: the process environment is global state.
    fn env_lock() -> std::sync::MutexGuard<'static, ()> {
        static LOCK: std::sync::OnceLock<std::sync::Mutex<()>> = std::sync::OnceLock::new();
        LOCK.get_or_init(|| std::sync::Mutex::new(()))
            .lock()
            .unwrap()
    }
    // from_env() reads CONTROL_S3_* variables and fills the struct fields.
    // set_var/remove_var are `unsafe` (they race with concurrent env reads);
    // env_lock() serializes these tests against each other.
    #[test]
    fn config_from_env_parses_expected_fields() {
        let _guard = env_lock();
        unsafe {
            std::env::set_var("CONTROL_S3_ENDPOINT", "http://minio:9000");
            std::env::set_var("CONTROL_S3_REGION", "us-east-1");
            std::env::set_var("CONTROL_S3_ACCESS_KEY_ID", "minioadmin");
            std::env::set_var("CONTROL_S3_SECRET_ACCESS_KEY", "minioadmin");
            std::env::set_var("CONTROL_S3_BUCKET_DOCS", "cloudlysis-docs");
            std::env::set_var("CONTROL_S3_PREFIX_DOCS", "docs/");
            std::env::set_var("CONTROL_S3_FORCE_PATH_STYLE", "true");
            std::env::set_var("CONTROL_S3_INSECURE", "true");
        }
        let cfg = DocsConfig::from_env().unwrap();
        assert_eq!(cfg.endpoint, "http://minio:9000");
        assert_eq!(cfg.buckets, vec!["cloudlysis-docs".to_string()]);
        assert_eq!(cfg.prefix, "docs/");
        assert!(cfg.force_path_style);
        assert!(cfg.insecure);
        // Clean up so later tests see an unpolluted environment.
        unsafe {
            std::env::remove_var("CONTROL_S3_ENDPOINT");
            std::env::remove_var("CONTROL_S3_REGION");
            std::env::remove_var("CONTROL_S3_ACCESS_KEY_ID");
            std::env::remove_var("CONTROL_S3_SECRET_ACCESS_KEY");
            std::env::remove_var("CONTROL_S3_BUCKET_DOCS");
            std::env::remove_var("CONTROL_S3_PREFIX_DOCS");
            std::env::remove_var("CONTROL_S3_FORCE_PATH_STYLE");
            std::env::remove_var("CONTROL_S3_INSECURE");
        }
    }
    // insecure=true with an https:// endpoint must be rejected outright.
    #[test]
    fn config_rejects_insecure_with_https_endpoint() {
        let _guard = env_lock();
        unsafe {
            std::env::set_var("CONTROL_S3_ENDPOINT", "https://s3.example.com");
            std::env::set_var("CONTROL_S3_ACCESS_KEY_ID", "a");
            std::env::set_var("CONTROL_S3_SECRET_ACCESS_KEY", "b");
            std::env::set_var(
                "CONTROL_S3_BUCKET_DOCS",
                "cloudlysis-docs-0,cloudlysis-docs-1",
            );
            std::env::set_var("CONTROL_S3_INSECURE", "true");
        }
        let err = DocsConfig::from_env().unwrap_err();
        assert!(
            err.contains("CONTROL_S3_INSECURE=true") && err.contains("https://"),
            "unexpected error: {err}"
        );
        unsafe {
            std::env::remove_var("CONTROL_S3_ENDPOINT");
            std::env::remove_var("CONTROL_S3_ACCESS_KEY_ID");
            std::env::remove_var("CONTROL_S3_SECRET_ACCESS_KEY");
            std::env::remove_var("CONTROL_S3_BUCKET_DOCS");
            std::env::remove_var("CONTROL_S3_INSECURE");
        }
    }
    // The key layout {prefix}{tenant}/{type}/{id}/{filename} is a stable
    // contract; changing it would orphan previously stored objects.
    #[tokio::test]
    async fn key_scheme_is_stable() {
        let cfg = DocsConfig {
            endpoint: "http://minio:9000".to_string(),
            public_endpoint: None,
            region: "us-east-1".to_string(),
            access_key_id: "x".to_string(),
            secret_access_key: "y".to_string(),
            force_path_style: true,
            insecure: true,
            buckets: vec![
                "cloudlysis-docs-0".to_string(),
                "cloudlysis-docs-1".to_string(),
            ],
            prefix: "docs/".to_string(),
        };
        let store = DocsStore::new(cfg).await.unwrap();
        let key = store
            .key_for("tenant-a", "deployments", "v1", "bundle.tar.gz")
            .unwrap();
        assert_eq!(key, "docs/tenant-a/deployments/v1/bundle.tar.gz");
    }
    // Path-traversal characters in any segment are rejected by key_for.
    #[tokio::test]
    async fn key_scheme_rejects_invalid_segments() {
        let cfg = DocsConfig {
            endpoint: "http://minio:9000".to_string(),
            public_endpoint: None,
            region: "us-east-1".to_string(),
            access_key_id: "x".to_string(),
            secret_access_key: "y".to_string(),
            force_path_style: true,
            insecure: true,
            buckets: vec!["cloudlysis-docs".to_string()],
            prefix: "docs/".to_string(),
        };
        let store = DocsStore::new(cfg).await.unwrap();
        assert!(store.key_for("t/a", "x", "y", "z").is_err());
        assert!(store.key_for("t", "x", "../y", "z").is_err());
        assert!(store.key_for("t", "x", "y", "a/b").is_err());
    }
}

View File

@@ -28,31 +28,49 @@ pub struct SwarmStateFile {
#[derive(Clone)]
pub struct SwarmStore {
path: std::path::PathBuf,
inner: SwarmStoreInner,
}
#[derive(Clone)]
enum SwarmStoreInner {
File { path: std::path::PathBuf },
DockerCli,
}
impl SwarmStore {
pub fn new(path: std::path::PathBuf) -> Self {
Self { path }
Self {
inner: SwarmStoreInner::File { path },
}
}
pub fn new_docker_cli() -> Self {
Self {
inner: SwarmStoreInner::DockerCli,
}
}
pub fn list_services(&self) -> Vec<SwarmService> {
self.load().map(|s| s.services).unwrap_or_default()
match &self.inner {
SwarmStoreInner::File { path } => {
load_state(path).map(|s| s.services).unwrap_or_default()
}
SwarmStoreInner::DockerCli => list_services_docker_cli().unwrap_or_default(),
}
}
pub fn list_tasks(&self, service_name: &str) -> Vec<SwarmTask> {
self.load()
match &self.inner {
SwarmStoreInner::File { path } => load_state(path)
.map(|s| {
s.tasks
.into_iter()
.filter(|t| t.service == service_name)
.collect()
})
.unwrap_or_default()
.unwrap_or_default(),
SwarmStoreInner::DockerCli => list_tasks_docker_cli(service_name).unwrap_or_default(),
}
fn load(&self) -> Option<SwarmStateFile> {
load_state(&self.path)
}
}
@@ -60,3 +78,120 @@ fn load_state(path: &Path) -> Option<SwarmStateFile> {
let raw = fs::read_to_string(path).ok()?;
serde_json::from_str(&raw).ok()
}
/// Shells out to `docker service ls` and parses its JSON-lines output into
/// `SwarmService` records. Any docker failure is surfaced as an error string.
fn list_services_docker_cli() -> Result<Vec<SwarmService>, String> {
    let output = std::process::Command::new("docker")
        .args(["service", "ls", "--format", "{{json .}}"])
        .output()
        .map_err(|e| format!("docker exec failed: {e}"))?;
    if !output.status.success() {
        return Err(format!(
            "docker service ls failed: {}",
            String::from_utf8_lossy(&output.stderr)
        ));
    }
    // Shape of one `--format {{json .}}` row emitted by `docker service ls`.
    #[derive(Deserialize)]
    struct ServiceRow {
        #[serde(rename = "Name")]
        name: String,
        #[serde(rename = "Image")]
        image: Option<String>,
        #[serde(rename = "Mode")]
        mode: Option<String>,
        #[serde(rename = "Replicas")]
        replicas: Option<String>,
        #[serde(rename = "UpdatedAt")]
        updated_at: Option<String>,
    }
    let stdout = String::from_utf8_lossy(&output.stdout);
    let mut services = Vec::new();
    for raw in stdout.lines().map(str::trim).filter(|l| !l.is_empty()) {
        let row: ServiceRow =
            serde_json::from_str(raw).map_err(|e| format!("invalid json row: {e}"))?;
        services.push(SwarmService {
            name: row.name,
            image: row.image,
            mode: row.mode,
            replicas: row.replicas,
            updated_at: row.updated_at,
        });
    }
    Ok(services)
}
/// Shells out to `docker service ps <service>` and parses its JSON-lines
/// output into `SwarmTask` records. Any docker failure is surfaced as an
/// error string.
fn list_tasks_docker_cli(service_name: &str) -> Result<Vec<SwarmTask>, String> {
    let output = std::process::Command::new("docker")
        .args([
            "service",
            "ps",
            service_name,
            "--no-trunc",
            "--format",
            "{{json .}}",
        ])
        .output()
        .map_err(|e| format!("docker exec failed: {e}"))?;
    if !output.status.success() {
        return Err(format!(
            "docker service ps failed: {}",
            String::from_utf8_lossy(&output.stderr)
        ));
    }
    // Shape of one `--format {{json .}}` row emitted by `docker service ps`.
    #[derive(Deserialize)]
    struct TaskRow {
        #[serde(rename = "ID")]
        id: String,
        #[serde(rename = "Name")]
        name: Option<String>,
        #[serde(rename = "Node")]
        node: Option<String>,
        #[serde(rename = "DesiredState")]
        desired_state: Option<String>,
        #[serde(rename = "CurrentState")]
        current_state: Option<String>,
        #[serde(rename = "Error")]
        error: Option<String>,
    }
    let stdout = String::from_utf8_lossy(&output.stdout);
    let mut tasks = Vec::new();
    for raw in stdout.lines().map(str::trim).filter(|l| !l.is_empty()) {
        let row: TaskRow =
            serde_json::from_str(raw).map_err(|e| format!("invalid json row: {e}"))?;
        // Task names look like "<service>.<slot>"; fall back to the queried
        // service when the name is missing or has no dot.
        let service = match row.name.as_deref().and_then(|n| n.split_once('.')) {
            Some((svc, _)) => svc.to_string(),
            None => service_name.to_string(),
        };
        tasks.push(SwarmTask {
            id: row.id,
            service,
            node: row.node,
            desired_state: row.desired_state,
            current_state: row.current_state,
            error: row.error,
        });
    }
    Ok(tasks)
}
#[cfg(test)]
mod tests {
    use super::*;
    // The file-backed store's on-disk JSON format round-trips through serde.
    #[test]
    fn state_file_parses() {
        let raw = r#"{"services":[{"name":"a","image":null,"mode":null,"replicas":null,"updated_at":null}],"tasks":[]}"#;
        let parsed: SwarmStateFile = serde_json::from_str(raw).unwrap();
        assert_eq!(parsed.services.len(), 1);
    }
}

View File

@@ -0,0 +1,174 @@
use api::{
AppState, AuditStore, AuthConfig, ConfigLocks, JobStore, PlacementStore, SwarmStore,
TenantLocks, billing::BillingStore, config_registry::ConfigRegistry,
};
use axum::{
Router,
body::Body,
http::{Request, StatusCode, header},
};
use jsonwebtoken::{EncodingKey, Header, encode};
use metrics_exporter_prometheus::PrometheusBuilder;
use serde::Serialize;
use std::{
path::PathBuf,
sync::{Arc, OnceLock},
};
use tower::ServiceExt;
use uuid::Uuid;
/// Gate for the production billing smoke test: enabled only when the
/// CONTROL_TEST_BILLING_PROD environment variable is exactly "1".
fn prod_enabled() -> bool {
    matches!(
        std::env::var("CONTROL_TEST_BILLING_PROD").as_deref(),
        Ok("1")
    )
}
static HANDLE: OnceLock<metrics_exporter_prometheus::PrometheusHandle> = OnceLock::new();
/// Resolves the repository root from this crate's manifest directory by
/// walking up two `parent()` steps (the api crate presumably lives two
/// levels below the root, e.g. `crates/api` — confirm against layout).
/// Panics if the directory structure does not match.
fn repo_root() -> PathBuf {
    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .and_then(|p| p.parent())
        .expect("api crate should live under repo root")
        .to_path_buf()
}
/// JWT claims used to mint test tokens: subject, session id, a permission
/// list, and the standard `exp` expiry (seconds since the Unix epoch).
#[derive(Serialize)]
struct TestClaims {
    sub: String,
    session_id: String,
    permissions: Vec<String>,
    // Expiry as seconds since the Unix epoch (standard JWT `exp`).
    exp: usize,
}
/// Mints a short-lived (60 s) HS256 JWT carrying the given permissions,
/// signed with `secret`, for authenticating test requests.
fn make_token(secret: &[u8], perms: &[&str]) -> String {
    let now = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_secs();
    let claims = TestClaims {
        sub: "user_1".to_string(),
        session_id: "sess_1".to_string(),
        permissions: perms.iter().map(|p| (*p).to_string()).collect(),
        exp: (now + 60) as usize,
    };
    encode(&Header::default(), &claims, &EncodingKey::from_secret(secret)).unwrap()
}
/// Builds an in-process control-api app wired for the billing smoke test.
///
/// The billing provider is chosen via CONTROL_BILLING_PROVIDER: "stripe"
/// builds a StripeProvider from the CONTROL_STRIPE_* env vars (falling back
/// to empty strings when unset); any other value uses the mock provider.
/// Billing enforcement is enabled so entitlement checks actually run.
fn test_app() -> Router {
    // Prometheus recorders are process-global, so install once and reuse.
    let handle = HANDLE
        .get_or_init(|| {
            PrometheusBuilder::new()
                .install_recorder()
                .expect("failed to install prometheus recorder")
        })
        .clone();
    let provider_type =
        std::env::var("CONTROL_BILLING_PROVIDER").unwrap_or_else(|_| "mock".to_string());
    let billing_provider: Arc<dyn api::billing::BillingProvider> = match provider_type.as_str() {
        "stripe" => Arc::new(api::billing::StripeProvider {
            secret_key: std::env::var("CONTROL_STRIPE_SECRET_KEY").unwrap_or_default(),
            price_pro: std::env::var("CONTROL_STRIPE_PRICE_ID_PRO").unwrap_or_default(),
            price_enterprise: std::env::var("CONTROL_STRIPE_PRICE_ID_ENTERPRISE")
                .unwrap_or_default(),
        }),
        _ => Arc::new(api::billing::MockProvider),
    };
    api::build_app(AppState {
        prometheus: handle,
        auth: AuthConfig {
            // Token minted by make_token must use this same secret.
            hs256_secret: Some(b"test_secret".to_vec()),
        },
        jobs: JobStore::default(),
        audit: AuditStore::default(),
        tenant_locks: TenantLocks::default(),
        config_locks: ConfigLocks::default(),
        http: reqwest::Client::new(),
        placement: PlacementStore::new(repo_root().join("config/placement/dev.json")),
        // File-backed billing store under the OS temp dir.
        billing: BillingStore::new(std::env::temp_dir().join("billing-prod-smoke.json")),
        billing_provider,
        billing_enforcement_enabled: true,
        config: ConfigRegistry::new(None, None),
        fleet_services: vec![],
        swarm: SwarmStore::new(repo_root().join("swarm/dev.json")),
        docs: None,
    })
}
/// Production smoke test for the billing endpoints: read billing state,
/// create a checkout session, then request a portal session. Opt-in via
/// CONTROL_TEST_BILLING_PROD=1 so it never runs in normal CI.
#[tokio::test]
async fn billing_production_smoke_test() {
    if !prod_enabled() {
        eprintln!("skipping: set CONTROL_TEST_BILLING_PROD=1 to enable production smoke tests");
        return;
    }
    let app = test_app();
    // Must match the hs256_secret configured by test_app().
    let token = make_token(b"test_secret", &["control:read", "control:write"]);
    let tenant_id = Uuid::new_v4();
    // 1. Verify GET billing works (empty initially)
    let res = app
        .clone()
        .oneshot(
            Request::builder()
                .uri(format!("/admin/v1/tenants/{tenant_id}/billing"))
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .header("x-tenant-id", tenant_id.to_string())
                .body(Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(res.status(), StatusCode::OK);
    // 2. Verify Checkout session generation
    let res = app
        .clone()
        .oneshot(
            Request::builder()
                .uri(format!("/admin/v1/tenants/{tenant_id}/billing/checkout"))
                .method("POST")
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .header("x-tenant-id", tenant_id.to_string())
                .header(header::CONTENT_TYPE, "application/json")
                .body(Body::from(
                    serde_json::json!({
                        "plan": "pro",
                        "return_path": "/billing"
                    })
                    .to_string(),
                ))
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(res.status(), StatusCode::OK);
    let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
        .await
        .unwrap();
    let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
    // The checkout response must carry a redirect URL for the customer.
    assert!(v.get("url").and_then(|u| u.as_str()).is_some());
    // 3. Verify Portal session generation (may fail if tenant has no stripe customer id yet, which is expected for fresh tenant)
    let res = app
        .clone()
        .oneshot(
            Request::builder()
                .uri(format!("/admin/v1/tenants/{tenant_id}/billing/portal"))
                .method("POST")
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .header("x-tenant-id", tenant_id.to_string())
                .body(Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    // For smoke test, we just want to see it reached the provider and didn't crash
    assert!(res.status() == StatusCode::OK || res.status() == StatusCode::INTERNAL_SERVER_ERROR);
}

View File

@@ -0,0 +1,250 @@
use api::{
AppState, AuditStore, AuthConfig, ConfigLocks, ConfigRegistry, JobStore, PlacementStore,
SwarmStore, TenantLocks, config_registry::NatsKvSource,
};
use axum::{
Router,
body::Body,
http::{Request, StatusCode, header},
};
use jsonwebtoken::{EncodingKey, Header, encode};
use metrics_exporter_prometheus::PrometheusBuilder;
use serde::Serialize;
use std::{path::PathBuf, sync::OnceLock, time::Duration};
use tower::ServiceExt;
use uuid::Uuid;
/// NATS-backed config tests run only when CONTROL_TEST_NATS is exactly
/// "1" AND a CONTROL_TEST_NATS_URL endpoint is present in the environment.
fn enabled() -> bool {
    let gate_on = matches!(std::env::var("CONTROL_TEST_NATS").as_deref(), Ok("1"));
    gate_on && std::env::var("CONTROL_TEST_NATS_URL").is_ok()
}
#[derive(Serialize)]
struct TestClaims {
sub: String,
session_id: String,
permissions: Vec<String>,
exp: usize,
}
fn make_token(secret: &[u8], perms: &[&str]) -> String {
let exp = (std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_secs()
+ 60) as usize;
encode(
&Header::default(),
&TestClaims {
sub: "user_1".to_string(),
session_id: "sess_1".to_string(),
permissions: perms.iter().map(|p| (*p).to_string()).collect(),
exp,
},
&EncodingKey::from_secret(secret),
)
.unwrap()
}
static HANDLE: OnceLock<metrics_exporter_prometheus::PrometheusHandle> = OnceLock::new();
fn repo_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.and_then(|p| p.parent())
.expect("api crate should live under repo root")
.to_path_buf()
}
/// Polls GET /admin/v1/jobs/{job_id} until the job reaches a terminal
/// state (anything other than "pending"/"running") or ~2 s have elapsed,
/// returning the last job document seen either way.
async fn wait_done(app: Router, job_id: Uuid, token: &str) -> serde_json::Value {
    let start = tokio::time::Instant::now();
    loop {
        let res = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri(format!("/admin/v1/jobs/{job_id}"))
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res.status(), StatusCode::OK);
        let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let job: serde_json::Value = serde_json::from_slice(&body).unwrap();
        let status = job
            .get("status")
            .and_then(|v| v.as_str())
            .unwrap_or("unknown");
        if status != "pending" && status != "running" {
            return job;
        }
        // Give up after ~2 s and return the last snapshot so the caller's
        // own status assertion produces a useful failure message.
        if start.elapsed() > Duration::from_secs(2) {
            return job;
        }
        tokio::time::sleep(Duration::from_millis(25)).await;
    }
}
/// Exercises config apply/rollback jobs backed by NATS KV end to end.
/// Opt-in via CONTROL_TEST_NATS=1 plus CONTROL_TEST_NATS_URL pointing at
/// a live NATS server.
#[tokio::test]
async fn config_jobs_with_nats_kv_are_env_gated() {
    if !enabled() {
        eprintln!(
            "skipping: set CONTROL_TEST_NATS=1 and CONTROL_TEST_NATS_URL=nats://... to enable nats config tests"
        );
        return;
    }
    let nats_url = std::env::var("CONTROL_TEST_NATS_URL").unwrap();
    // set_var is unsafe in recent Rust because it can race other threads
    // reading the environment; done here at test start, before spawning work.
    unsafe {
        std::env::set_var("CONTROL_CONFIG_NATS_URL", &nats_url);
    }
    // Unique bucket and keys per run so concurrent test runs cannot collide.
    let bucket = format!("cloudlysis-test-config-{}", Uuid::new_v4());
    let routing_key = format!("routing/{}", Uuid::new_v4());
    let placement_key = format!("placement/{}", Uuid::new_v4());
    let routing_src = NatsKvSource::connect(nats_url.clone(), bucket.clone(), routing_key)
        .await
        .expect("connect routing kv");
    let placement_src = NatsKvSource::connect(nats_url.clone(), bucket.clone(), placement_key)
        .await
        .expect("connect placement kv");
    let config = ConfigRegistry::new(
        Some(std::sync::Arc::new(routing_src)),
        Some(std::sync::Arc::new(placement_src)),
    );
    let secret = b"test_secret".to_vec();
    let token = make_token(&secret, &["control:write", "control:read"]);
    // Prometheus recorders are process-global, so install once and reuse.
    let handle = HANDLE
        .get_or_init(|| {
            PrometheusBuilder::new()
                .install_recorder()
                .expect("failed to install prometheus recorder")
        })
        .clone();
    let app = api::build_app(AppState {
        prometheus: handle,
        auth: AuthConfig {
            hs256_secret: Some(secret),
        },
        jobs: JobStore::default(),
        audit: AuditStore::default(),
        tenant_locks: TenantLocks::default(),
        config_locks: ConfigLocks::default(),
        http: reqwest::Client::new(),
        placement: PlacementStore::new(repo_root().join("config/placement/dev.json")),
        billing: api::billing::BillingStore::new(std::env::temp_dir().join("billing-test.json")),
        billing_provider: std::sync::Arc::new(api::billing::MockProvider),
        billing_enforcement_enabled: false,
        config,
        fleet_services: vec![],
        swarm: SwarmStore::new(repo_root().join("swarm/dev.json")),
        docs: None,
    });
    // Minimal single-shard routing document used as the apply payload.
    let routing_value = serde_json::json!({
        "revision": 1,
        "aggregate_placement": { "t1": "local" },
        "projection_placement": { "t1": "local" },
        "runner_placement": { "t1": "local" },
        "aggregate_shards": { "local": ["http://aggregate:50051"] },
        "projection_shards": { "local": ["http://projection:8080"] },
        "runner_shards": { "local": ["http://runner:8080"] }
    });
    // Submit an apply job for the routing domain (idempotency key required).
    let apply = app
        .clone()
        .oneshot(
            Request::builder()
                .uri("/admin/v1/jobs/config/apply")
                .method("POST")
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .header("idempotency-key", format!("k-{}", Uuid::new_v4()))
                .header(header::CONTENT_TYPE, "application/json")
                .body(Body::from(
                    serde_json::json!({
                        "domain": "routing",
                        "expected_revision": null,
                        "reason": "test apply",
                        "value": routing_value
                    })
                    .to_string(),
                ))
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(apply.status(), StatusCode::OK);
    let body = axum::body::to_bytes(apply.into_body(), 1024 * 1024)
        .await
        .unwrap();
    let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
    let job_id = Uuid::parse_str(v.get("job_id").unwrap().as_str().unwrap()).unwrap();
    // The apply job must finish successfully.
    let job = wait_done(app.clone(), job_id, &token).await;
    assert_eq!(
        job.get("status").and_then(|v| v.as_str()),
        Some("succeeded")
    );
    // The applied config must now be readable with a bumped revision.
    let get = app
        .clone()
        .oneshot(
            Request::builder()
                .uri("/admin/v1/config/routing")
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .body(Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(get.status(), StatusCode::OK);
    let body = axum::body::to_bytes(get.into_body(), 1024 * 1024)
        .await
        .unwrap();
    let got: serde_json::Value = serde_json::from_slice(&body).unwrap();
    assert_eq!(got.get("domain").unwrap().as_str().unwrap(), "routing");
    assert!(got.get("revision").unwrap().as_u64().unwrap_or(0) > 0);
    // Submit a rollback job for the same domain and wait for success.
    let rollback = app
        .clone()
        .oneshot(
            Request::builder()
                .uri("/admin/v1/jobs/config/rollback")
                .method("POST")
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .header("idempotency-key", format!("k-{}", Uuid::new_v4()))
                .header(header::CONTENT_TYPE, "application/json")
                .body(Body::from(
                    serde_json::json!({
                        "domain": "routing",
                        "reason": "test rollback"
                    })
                    .to_string(),
                ))
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(rollback.status(), StatusCode::OK);
    let body = axum::body::to_bytes(rollback.into_body(), 1024 * 1024)
        .await
        .unwrap();
    let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
    let rb_id = Uuid::parse_str(v.get("job_id").unwrap().as_str().unwrap()).unwrap();
    let rb_job = wait_done(app.clone(), rb_id, &token).await;
    assert_eq!(
        rb_job.get("status").and_then(|v| v.as_str()),
        Some("succeeded")
    );
}

View File

@@ -0,0 +1,157 @@
use jsonwebtoken::{EncodingKey, Header, encode};
use reqwest::StatusCode;
use serde::Serialize;
use serde_json::json;
use std::time::Duration;
use uuid::Uuid;
#[derive(Serialize)]
struct TestClaims {
sub: String,
session_id: String,
permissions: Vec<String>,
exp: usize,
}
fn make_token(secret: &[u8], perms: &[&str]) -> String {
let exp = (std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_secs()
+ 300) as usize;
encode(
&Header::default(),
&TestClaims {
sub: "smoke".to_string(),
session_id: "smoke".to_string(),
permissions: perms.iter().map(|p| (*p).to_string()).collect(),
exp,
},
&EncodingKey::from_secret(secret),
)
.unwrap()
}
/// Smoke-tests the deployed docs endpoints end to end: health, presign
/// upload, direct S3 PUT, list, presign download, S3 GET. Opt-in via
/// CONTROL_TEST_SMOKE=1 with CONTROL_TEST_BASE_URL pointing at the API.
#[tokio::test]
async fn control_api_docs_smoke_is_env_gated() {
    let enabled = std::env::var("CONTROL_TEST_SMOKE").ok();
    if enabled.as_deref() != Some("1") {
        eprintln!("skipping: set CONTROL_TEST_SMOKE=1 to enable env smoke tests");
        return;
    }
    let base_url =
        std::env::var("CONTROL_TEST_BASE_URL").expect("CONTROL_TEST_BASE_URL is required");
    let base_url = base_url.trim_end_matches('/').to_string();
    // Either provide a token directly, or provide secret+perms to mint one.
    let token = if let Ok(t) = std::env::var("CONTROL_TEST_TOKEN") {
        t
    } else {
        let secret = std::env::var("CONTROL_TEST_JWT_SECRET")
            .expect("CONTROL_TEST_TOKEN or CONTROL_TEST_JWT_SECRET is required");
        make_token(secret.as_bytes(), &["control:read", "control:write"])
    };
    // Random tenant unless the target environment pins one.
    let tenant_id = std::env::var("CONTROL_TEST_TENANT_ID")
        .ok()
        .unwrap_or_else(|| Uuid::new_v4().to_string());
    let http = reqwest::Client::builder()
        .timeout(Duration::from_secs(15))
        .build()
        .unwrap();
    // Health.
    let health = http
        .get(format!("{base_url}/health"))
        .send()
        .await
        .expect("health request failed");
    assert!(health.status().is_success(), "health not ok");
    // Presign upload.
    let doc_id = Uuid::new_v4().to_string();
    let filename = "smoke.txt";
    let presign_up = http
        .post(format!(
            "{base_url}/admin/v1/tenants/{tenant_id}/docs/presign/upload"
        ))
        .header("authorization", format!("Bearer {token}"))
        .header("x-tenant-id", &tenant_id)
        .json(&json!({
            "doc_type": "deployments",
            "doc_id": doc_id,
            "filename": filename,
            "content_type": "text/plain",
        }))
        .send()
        .await
        .expect("presign upload failed");
    assert!(
        presign_up.status().is_success(),
        "presign upload not ok: {}",
        presign_up.status()
    );
    let up_json: serde_json::Value = presign_up.json().await.unwrap();
    let put_url = up_json.get("url").and_then(|v| v.as_str()).unwrap();
    let key = up_json
        .get("key")
        .and_then(|v| v.as_str())
        .unwrap()
        .to_string();
    // PUT bytes to S3 directly.
    let payload = b"hello-smoke".to_vec();
    let put = http
        .put(put_url)
        .header("content-type", "text/plain")
        .body(payload.clone())
        .send()
        .await
        .expect("s3 put failed");
    assert!(put.status().is_success(), "s3 put not ok: {}", put.status());
    // List should include key.
    let list = http
        .get(format!(
            "{base_url}/admin/v1/tenants/{tenant_id}/docs?prefix=deployments/"
        ))
        .header("authorization", format!("Bearer {token}"))
        .header("x-tenant-id", &tenant_id)
        .send()
        .await
        .expect("list failed");
    assert!(list.status().is_success(), "list not ok");
    let list_json: serde_json::Value = list.json().await.unwrap();
    let objects = list_json.get("objects").and_then(|v| v.as_array()).unwrap();
    assert!(
        objects
            .iter()
            .any(|o| o.get("key").and_then(|k| k.as_str()) == Some(key.as_str())),
        "expected list to include presigned upload key"
    );
    // Presign download and fetch bytes.
    let presign_down = http
        .post(format!(
            "{base_url}/admin/v1/tenants/{tenant_id}/docs/presign/download"
        ))
        .header("authorization", format!("Bearer {token}"))
        .header("x-tenant-id", &tenant_id)
        .json(&json!({ "key": key }))
        .send()
        .await
        .expect("presign download failed");
    assert!(
        presign_down.status().is_success(),
        "presign download not ok"
    );
    let down_json: serde_json::Value = presign_down.json().await.unwrap();
    let get_url = down_json.get("url").and_then(|v| v.as_str()).unwrap();
    let got = http.get(get_url).send().await.expect("s3 get failed");
    assert_eq!(got.status(), StatusCode::OK);
    // Downloaded bytes must match what we uploaded.
    let got_bytes = got.bytes().await.unwrap().to_vec();
    assert_eq!(got_bytes, payload);
}

View File

@@ -11,7 +11,7 @@ fn repo_root() -> PathBuf {
#[test]
fn docker_compose_files_parse_and_include_required_services() {
let root = repo_root();
let compose = fs::read_to_string(root.join("observability/docker-compose.yml")).unwrap();
let compose = fs::read_to_string(root.join("docker-compose.yml")).unwrap();
let v: serde_yaml::Value = serde_yaml::from_str(&compose).unwrap();
let services = v
@@ -19,7 +19,15 @@ fn docker_compose_files_parse_and_include_required_services() {
.and_then(|x| x.as_mapping())
.expect("missing services");
for required in ["grafana", "victoria-metrics", "vmagent", "loki", "tempo"] {
// Core + optional observability services are all declared in one compose file.
for required in [
"grafana",
"victoria-metrics",
"vmagent",
"loki",
"tempo",
"mailhog",
] {
assert!(
services.contains_key(serde_yaml::Value::String(required.to_string())),
"missing service {required}"
@@ -28,17 +36,19 @@ fn docker_compose_files_parse_and_include_required_services() {
}
#[tokio::test]
#[ignore]
async fn docker_compose_config_validation_is_gated_and_fast() {
let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
assert_eq!(enabled.as_deref(), Some("1"));
if enabled.as_deref() != Some("1") {
eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker compose validation");
return;
}
let root = repo_root();
let compose = root.join("observability/docker-compose.yml");
let compose = root.join("docker-compose.yml");
let cmd = tokio::process::Command::new("docker")
.args(["compose", "-f"])
.arg(compose)
.arg(&compose)
.args(["config"])
.output();
@@ -52,4 +62,22 @@ async fn docker_compose_config_validation_is_gated_and_fast() {
"docker compose config failed: {}",
String::from_utf8_lossy(&out.stderr)
);
// Validate full-stack profile wiring too.
let cmd = tokio::process::Command::new("docker")
.args(["compose", "-f"])
.arg(&compose)
.args(["--profile", "observability", "config"])
.output();
let out = tokio::time::timeout(Duration::from_secs(10), cmd)
.await
.expect("docker compose config (observability profile) timed out")
.expect("failed to run docker compose config (observability profile)");
assert!(
out.status.success(),
"docker compose config (observability profile) failed: {}",
String::from_utf8_lossy(&out.stderr)
);
}

View File

@@ -1,6 +1,9 @@
#[test]
#[ignore]
fn docker_integration_tests_are_gated() {
let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
if enabled.as_deref() != Some("1") {
eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker integration tests");
return;
}
assert_eq!(enabled.as_deref(), Some("1"));
}

View File

@@ -0,0 +1,169 @@
use jsonwebtoken::{EncodingKey, Header, encode};
use reqwest::header::{HeaderMap, HeaderValue};
use serde::Serialize;
use std::{path::PathBuf, process::Command, time::Duration};
use uuid::Uuid;
fn repo_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.and_then(|p| p.parent())
.expect("api crate should live under repo root")
.to_path_buf()
}
/// Docker-backed tests run only when CONTROL_TEST_DOCKER is set to "1"
/// (surrounding whitespace tolerated).
fn docker_enabled() -> bool {
    match std::env::var("CONTROL_TEST_DOCKER") {
        Ok(v) => v.trim() == "1",
        Err(_) => false,
    }
}
/// Path to the repo's top-level docker-compose.yml.
fn compose_file() -> PathBuf {
    repo_root().join("docker-compose.yml")
}
#[derive(Serialize)]
struct TestClaims {
sub: String,
session_id: String,
permissions: Vec<String>,
exp: usize,
}
fn make_token(secret: &[u8], perms: &[&str]) -> String {
let exp = (std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_secs()
+ 300) as usize;
encode(
&Header::default(),
&TestClaims {
sub: "user_1".to_string(),
session_id: "sess_1".to_string(),
permissions: perms.iter().map(|p| (*p).to_string()).collect(),
exp,
},
&EncodingKey::from_secret(secret),
)
.unwrap()
}
/// End-to-end docs roundtrip against the composed control-api:
/// upload via the proxy endpoint, confirm the key appears in the listing,
/// then download it back and compare bytes. Gated on CONTROL_TEST_DOCKER=1.
#[tokio::test]
async fn documents_upload_list_download_roundtrip_via_control_api_compose() {
    if !docker_enabled() {
        eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker compose tests");
        return;
    }
    // Must match docker-compose.yml CONTROL_GATEWAY_JWT_HS256_SECRET.
    let jwt_secret = b"dev_secret";
    let token = make_token(jwt_secret, &["control:read", "control:write"]);
    let compose = compose_file();
    let up = Command::new("docker")
        .args(["compose", "-f"])
        .arg(&compose)
        .args(["up", "-d", "control-api"])
        .status()
        .expect("failed to run docker compose up control-api");
    assert!(up.success(), "docker compose up control-api failed");
    // Wait for control-api to be reachable (port publish is in compose).
    let http = reqwest::Client::builder()
        .timeout(Duration::from_secs(10))
        .build()
        .unwrap();
    let base = "http://127.0.0.1:38080";
    let health_deadline = tokio::time::Instant::now() + Duration::from_secs(30);
    loop {
        if tokio::time::Instant::now() > health_deadline {
            panic!("control-api did not become healthy in time");
        }
        match http.get(format!("{base}/health")).send().await {
            Ok(res) if res.status().is_success() => break,
            _ => tokio::time::sleep(Duration::from_millis(250)).await,
        }
    }
    let tenant_id = Uuid::new_v4().to_string();
    let doc_type = "deployments";
    let doc_id = Uuid::new_v4().to_string();
    let filename = "hello.txt";
    let bytes = b"hello-docs".to_vec();
    let mut headers = HeaderMap::new();
    headers.insert(
        "authorization",
        HeaderValue::from_str(&format!("Bearer {token}")).unwrap(),
    );
    headers.insert("x-tenant-id", HeaderValue::from_str(&tenant_id).unwrap());
    // Upload (proxy endpoint).
    // Fix: interpolate the declared `filename` instead of the literal
    // "(unknown)" the path previously ended with; `filename` was otherwise
    // unused and the object key never reflected the real file name.
    let put_url =
        format!("{base}/admin/v1/tenants/{tenant_id}/docs/{doc_type}/{doc_id}/{filename}");
    let put = http
        .put(&put_url)
        .headers(headers.clone())
        .header("content-type", "text/plain")
        .body(bytes.clone())
        .send()
        .await
        .expect("upload request failed");
    assert!(
        put.status().is_success(),
        "upload failed: {}",
        put.text().await.unwrap_or_default()
    );
    let put_json: serde_json::Value = put.json().await.expect("invalid upload json");
    let key = put_json
        .get("key")
        .and_then(|v| v.as_str())
        .expect("missing key")
        .to_string();
    // List should include the key.
    let list_url = format!("{base}/admin/v1/tenants/{tenant_id}/docs?prefix={doc_type}/");
    let list = http
        .get(&list_url)
        .headers(headers.clone())
        .send()
        .await
        .expect("list request failed");
    assert!(list.status().is_success(), "list failed");
    let list_json: serde_json::Value = list.json().await.expect("invalid list json");
    let objects = list_json
        .get("objects")
        .and_then(|v| v.as_array())
        .expect("missing objects");
    assert!(
        objects
            .iter()
            .any(|o| o.get("key").and_then(|k| k.as_str()) == Some(key.as_str())),
        "expected list to include uploaded key"
    );
    // Download (proxy endpoint) returns same bytes.
    let get_url = format!(
        "{base}/admin/v1/tenants/{tenant_id}/docs/object/{}",
        urlencoding::encode(&key)
    );
    let got = http
        .get(&get_url)
        .headers(headers.clone())
        .send()
        .await
        .expect("download request failed");
    assert!(got.status().is_success(), "download failed");
    let got_bytes = got.bytes().await.expect("download bytes failed").to_vec();
    assert_eq!(got_bytes, bytes);
    // Best-effort cleanup.
    let _ = Command::new("docker")
        .args(["compose", "-f"])
        .arg(&compose)
        .args(["down", "-v"])
        .status();
}

View File

@@ -0,0 +1,123 @@
use api::{
AppState, AuditStore, AuthConfig, ConfigLocks, ConfigRegistry, JobStore, PlacementStore,
SwarmStore, TenantLocks,
};
use axum::{
Router,
body::Body,
http::{Request, StatusCode, header},
};
use jsonwebtoken::{EncodingKey, Header, encode};
use metrics_exporter_prometheus::PrometheusBuilder;
use serde::Serialize;
use std::{fs, path::PathBuf, sync::OnceLock};
use tower::ServiceExt;
static HANDLE: OnceLock<metrics_exporter_prometheus::PrometheusHandle> = OnceLock::new();
fn repo_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.and_then(|p| p.parent())
.expect("api crate should live under repo root")
.to_path_buf()
}
#[derive(Serialize)]
struct TestClaims {
sub: String,
session_id: String,
permissions: Vec<String>,
exp: usize,
}
fn make_token(perms: &[&str]) -> String {
let exp = (std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_secs()
+ 60) as usize;
encode(
&Header::default(),
&TestClaims {
sub: "user_1".to_string(),
session_id: "sess_1".to_string(),
permissions: perms.iter().map(|p| (*p).to_string()).collect(),
exp,
},
&EncodingKey::from_secret(b"test_secret"),
)
.unwrap()
}
/// Writes `raw` to a uniquely named JSON file in the OS temp dir and
/// returns its path. Uniqueness comes from the pid plus a fresh UUID.
fn temp_swarm_file(raw: &str) -> PathBuf {
    let file_name = format!(
        "cloudlysis-control-swarm-{}-{}.json",
        std::process::id(),
        uuid::Uuid::new_v4()
    );
    let path = std::env::temp_dir().join(file_name);
    fs::write(&path, raw).expect("failed to write temp swarm file");
    path
}
/// Builds an in-process control-api app whose SwarmStore reads the given
/// state file; billing uses the mock provider with enforcement disabled.
fn test_app_with_swarm(swarm_path: PathBuf) -> Router {
    // Prometheus recorders are process-global, so install once and reuse.
    let handle = HANDLE
        .get_or_init(|| {
            PrometheusBuilder::new()
                .install_recorder()
                .expect("failed to install prometheus recorder")
        })
        .clone();
    api::build_app(AppState {
        prometheus: handle,
        auth: AuthConfig {
            // Token minted by make_token must use this same secret.
            hs256_secret: Some(b"test_secret".to_vec()),
        },
        jobs: JobStore::default(),
        audit: AuditStore::default(),
        tenant_locks: TenantLocks::default(),
        config_locks: ConfigLocks::default(),
        http: reqwest::Client::new(),
        placement: PlacementStore::new(repo_root().join("config/placement/dev.json")),
        billing: api::billing::BillingStore::new(
            std::env::temp_dir().join("billing-drift-test.json"),
        ),
        billing_provider: std::sync::Arc::new(api::billing::MockProvider),
        billing_enforcement_enabled: false,
        config: ConfigRegistry::new(None, None),
        fleet_services: vec![],
        swarm: SwarmStore::new(swarm_path),
        docs: None,
    })
}
/// A swarm service present in the observed state file but absent from the
/// desired set must be reported by the drift endpoint with kind "extra".
#[tokio::test]
async fn drift_marks_extra_services_vs_desired_observation_set() {
    let swarm = temp_swarm_file(
        r#"{ "services": [{"name":"extra-1","image":null,"mode":null,"replicas":null,"updated_at":null}], "tasks": [] }"#,
    );
    let app = test_app_with_swarm(swarm);
    let token = make_token(&["control:read"]);
    let res = app
        .oneshot(
            Request::builder()
                .uri("/admin/v1/platform/drift")
                .header(header::AUTHORIZATION, format!("Bearer {token}"))
                .body(Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(res.status(), StatusCode::OK);
    let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
        .await
        .unwrap();
    let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
    let items = v.get("items").and_then(|x| x.as_array()).unwrap();
    // "extra-1" exists only in the observed state, so drift must flag it.
    assert!(items.iter().any(|i| {
        i.get("kind").and_then(|k| k.as_str()) == Some("extra")
            && i.get("service").and_then(|s| s.as_str()) == Some("extra-1")
    }));
}

View File

@@ -0,0 +1,137 @@
/// Creates a throwaway Swarm service and asserts the drift endpoint
/// (backed by docker-cli swarm observation) reports it as "extra".
/// Gated on CONTROL_TEST_DOCKER=1 and an active local Swarm.
#[tokio::test]
async fn platform_drift_docker_test_is_gated() {
    use tower::ServiceExt;
    let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
    if enabled.as_deref() != Some("1") {
        eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker drift tests");
        return;
    }
    // We only run the "real" drift check when Swarm is available locally.
    // If Swarm isn't active, we skip to keep CI/dev machines happy.
    let info = std::process::Command::new("docker")
        .args(["info", "--format", "{{.Swarm.LocalNodeState}}"])
        .output();
    let Ok(info) = info else {
        eprintln!("skipping: docker not available");
        return;
    };
    if !info.status.success() {
        eprintln!("skipping: docker info failed");
        return;
    }
    let state = String::from_utf8_lossy(&info.stdout).trim().to_string();
    if state != "active" {
        eprintln!("skipping: docker swarm not active (LocalNodeState={state})");
        return;
    }
    // Create a short-lived service so drift can see an "extra" observed service.
    let name = format!("cloudlysis-drift-extra-{}", uuid::Uuid::new_v4());
    let create = std::process::Command::new("docker")
        .args([
            "service",
            "create",
            "--name",
            &name,
            "--restart-condition",
            "none",
            "busybox:1.36",
            "sh",
            "-c",
            "sleep 60",
        ])
        .output()
        .expect("docker service create");
    if !create.status.success() {
        eprintln!("skipping: failed to create swarm service (maybe permissions?)");
        return;
    }
    // Ensure cleanup even if assertion fails.
    struct Cleanup(String);
    impl Drop for Cleanup {
        fn drop(&mut self) {
            // Best-effort removal of the throwaway service.
            let _ = std::process::Command::new("docker")
                .args(["service", "rm", &self.0])
                .output();
        }
    }
    let _cleanup = Cleanup(name.clone());
    // Now call drift via a minimal in-process app configured for docker-cli swarm observation.
    let handle = metrics_exporter_prometheus::PrometheusBuilder::new()
        .install_recorder()
        .expect("failed to install prometheus recorder");
    let app = api::build_app(api::AppState {
        prometheus: handle,
        auth: api::AuthConfig {
            hs256_secret: Some(b"test_secret".to_vec()),
        },
        jobs: api::JobStore::default(),
        audit: api::AuditStore::default(),
        tenant_locks: api::TenantLocks::default(),
        config_locks: api::ConfigLocks::default(),
        http: reqwest::Client::new(),
        placement: api::PlacementStore::new(
            std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
                .parent()
                .and_then(|p| p.parent())
                .unwrap()
                .join("config/placement/dev.json"),
        ),
        billing: api::billing::BillingStore::new(
            std::env::temp_dir().join("billing-drift-test.json"),
        ),
        billing_provider: std::sync::Arc::new(api::billing::MockProvider),
        billing_enforcement_enabled: false,
        config: api::ConfigRegistry::new(None, None),
        fleet_services: vec![],
        swarm: api::SwarmStore::new_docker_cli(),
        docs: None,
    });
    // Auth token (control:read).
    let exp = (std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_secs()
        + 60) as usize;
    let token = jsonwebtoken::encode(
        &jsonwebtoken::Header::default(),
        &serde_json::json!({
            "sub": "user_1",
            "session_id": "sess_1",
            "permissions": ["control:read"],
            "exp": exp
        }),
        &jsonwebtoken::EncodingKey::from_secret(b"test_secret"),
    )
    .unwrap();
    let res = app
        .oneshot(
            axum::http::Request::builder()
                .uri("/admin/v1/platform/drift")
                .header(axum::http::header::AUTHORIZATION, format!("Bearer {token}"))
                .body(axum::body::Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(res.status(), axum::http::StatusCode::OK);
    let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
        .await
        .unwrap();
    let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
    let items = v.get("items").and_then(|x| x.as_array()).unwrap();
    // The freshly created service is not in the desired set, so it must
    // surface as an "extra" drift item.
    assert!(
        items.iter().any(|i| {
            i.get("kind").and_then(|k| k.as_str()) == Some("extra")
                && i.get("service").and_then(|s| s.as_str()) == Some(name.as_str())
        }),
        "expected drift to include extra service {name}, got: {v}"
    );
}

View File

@@ -0,0 +1,77 @@
use std::{path::PathBuf, process::Command, time::Duration};
fn repo_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.and_then(|p| p.parent())
.expect("api crate should live under repo root")
.to_path_buf()
}
/// Docker-backed tests run only when CONTROL_TEST_DOCKER is set to "1"
/// (surrounding whitespace tolerated).
fn docker_enabled() -> bool {
    std::env::var("CONTROL_TEST_DOCKER")
        .ok()
        .is_some_and(|v| v.trim() == "1")
}
/// Path to the repo's top-level docker-compose.yml.
fn compose_file() -> PathBuf {
    repo_root().join("docker-compose.yml")
}
/// Brings up MinIO via compose and verifies the three docs buckets exist
/// and the default credentials work, by running `mc` through the
/// `minio-init` service inside the compose network.
/// Gated on CONTROL_TEST_DOCKER=1.
#[test]
fn minio_docs_bucket_exists_and_credentials_work_in_compose_network() {
    if !docker_enabled() {
        eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker compose tests");
        return;
    }
    let compose = compose_file();
    let up = Command::new("docker")
        .args(["compose", "-f"])
        .arg(&compose)
        .args(["up", "-d", "minio"])
        .status()
        .expect("failed to run docker compose up minio");
    assert!(up.success(), "docker compose up minio failed");
    // The `minio-init` service runs `mc` inside the compose network.
    let out = Command::new("docker")
        .args(["compose", "-f"])
        .arg(&compose)
        .args([
            "run",
            "--rm",
            "minio-init",
            "/bin/sh",
            "-lc",
            "mc alias set local http://minio:9000 minioadmin minioadmin && mc ls local/cloudlysis-docs-0 && mc ls local/cloudlysis-docs-1 && mc ls local/cloudlysis-docs-2",
        ])
        .output()
        .expect("failed to run docker compose run minio-init");
    // Best-effort cleanup (keep it short; other docker tests may reuse this env).
    let _ = Command::new("docker")
        .args(["compose", "-f"])
        .arg(&compose)
        .args(["down", "-v"])
        .status();
    assert!(
        out.status.success(),
        "minio-init bucket check failed: {}",
        String::from_utf8_lossy(&out.stderr)
    );
    // `mc ls` prints at least one line when the bucket exists (even if empty it prints the bucket line).
    let stdout = String::from_utf8_lossy(&out.stdout);
    assert!(
        stdout.contains("cloudlysis-docs-0")
            && stdout.contains("cloudlysis-docs-1")
            && stdout.contains("cloudlysis-docs-2"),
        "expected mc ls output to mention bucket: {stdout}"
    );
    // Avoid tests hanging due to docker flakiness.
    std::thread::sleep(Duration::from_millis(10));
}

View File

@@ -8,6 +8,20 @@ fn repo_root() -> PathBuf {
.to_path_buf()
}
/// The S3-mode Loki/Tempo config variants must at least parse as YAML.
#[test]
fn loki_and_tempo_s3_config_variants_are_syntactically_valid() {
    let root = repo_root();
    for file in [
        root.join("observability/loki/config.s3.yml"),
        root.join("observability/tempo/config.s3.yml"),
    ] {
        let raw = fs::read_to_string(&file).unwrap_or_else(|e| panic!("{file:?}: {e}"));
        // Syntactic check only; semantic validation is left to Loki/Tempo.
        let _: serde_yaml::Value =
            serde_yaml::from_str(&raw).unwrap_or_else(|e| panic!("{file:?}: {e}"));
    }
}
#[test]
fn grafana_provisioning_files_are_syntactically_valid() {
let root = repo_root();

View File

@@ -0,0 +1,218 @@
use reqwest::StatusCode;
use serde_json::json;
use std::{
net::TcpStream,
path::PathBuf,
process::Command,
time::{Duration, Instant},
};
/// Resolve the repository root: two directories above this crate's manifest.
fn repo_root() -> PathBuf {
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let root = manifest_dir
        .parent()
        .and_then(|crates_dir| crates_dir.parent())
        .expect("api crate should live under repo root");
    root.to_path_buf()
}
/// Docker-backed tests are opt-in: enabled only when `CONTROL_TEST_DOCKER`
/// is set and trims to exactly `"1"`.
fn docker_enabled() -> bool {
    matches!(
        std::env::var("CONTROL_TEST_DOCKER"),
        Ok(value) if value.trim() == "1"
    )
}
/// Poll `addr` until a TCP connect succeeds or `timeout` elapses.
///
/// Each attempt uses a 1s connect timeout and failed attempts are retried
/// every 250ms. Returns `true` on the first successful connection.
/// Panics if `addr` is not a valid socket address (only once polling starts).
fn wait_for_tcp(addr: &str, timeout: Duration) -> bool {
    let started = Instant::now();
    while started.elapsed() < timeout {
        let socket = addr.parse().expect("invalid socket addr");
        let attempt = TcpStream::connect_timeout(&socket, Duration::from_secs(1));
        if attempt.is_ok() {
            return true;
        }
        std::thread::sleep(Duration::from_millis(250));
    }
    false
}
/// Recursively list `bucket` with `mc`, running inside the compose network
/// via the `minio-init` service (the host cannot resolve `minio:9000`).
fn mc_ls_bucket(compose: &PathBuf, bucket: &str) -> std::process::Output {
    let shell_cmd = format!(
        "mc alias set local http://minio:9000 minioadmin minioadmin >/dev/null && mc ls --recursive local/{bucket}"
    );
    let mut cmd = Command::new("docker");
    cmd.args(["compose", "-f"])
        .arg(compose)
        .args(["run", "--rm", "minio-init", "/bin/sh", "-lc", &shell_cmd]);
    cmd.output().expect("failed to run mc ls")
}
/// End-to-end smoke test of the S3-backed observability stack.
///
/// Brings up docker compose (base + observability + the S3 overlay), pushes
/// one log line into Loki and one span into Tempo (via Zipkin v2), reads both
/// back through the query APIs, and finally verifies that each backend wrote
/// at least one object into its MinIO bucket. Gated by CONTROL_TEST_DOCKER=1.
#[tokio::test]
async fn loki_and_tempo_write_objects_to_minio_in_s3_mode() {
    if !docker_enabled() {
        eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker tests");
        return;
    }
    let root = repo_root();
    // Layer the S3 overlay on top of the base and observability compose files.
    let base = root.join("docker-compose.yml");
    let obs = root.join("observability/docker-compose.yml");
    let obs_s3 = root.join("observability/docker-compose.s3.yml");
    let up = Command::new("docker")
        .args(["compose", "-f"])
        .arg(&base)
        .args(["-f"])
        .arg(&obs)
        .args(["-f"])
        .arg(&obs_s3)
        .args(["up", "-d"])
        .status()
        .expect("failed to run docker compose up");
    assert!(up.success(), "docker compose up failed");
    // Loki :3100, Tempo :3200, Zipkin ingest :9411, MinIO :9000.
    let reachable = wait_for_tcp("127.0.0.1:3100", Duration::from_secs(45))
        && wait_for_tcp("127.0.0.1:3200", Duration::from_secs(45))
        && wait_for_tcp("127.0.0.1:9411", Duration::from_secs(45))
        && wait_for_tcp("127.0.0.1:9000", Duration::from_secs(45));
    assert!(reachable, "loki/tempo/minio ports not reachable in time");
    let http = reqwest::Client::builder()
        .timeout(Duration::from_secs(10))
        .build()
        .unwrap();
    // Push one log line into Loki.
    // Loki expects the timestamp as a nanosecond-precision string.
    let ts_ns = (std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_nanos())
    .to_string();
    let push = http
        .post("http://127.0.0.1:3100/loki/api/v1/push")
        .json(&json!({
            "streams": [{
                "stream": { "app": "cloudlysis-test" },
                "values": [[ts_ns, "hello from test"]]
            }]
        }))
        .send()
        .await
        .expect("loki push request failed");
    // A successful Loki push returns 204 No Content.
    assert!(
        push.status() == StatusCode::NO_CONTENT,
        "unexpected loki push status: {}",
        push.status()
    );
    // Emit one trace span via Zipkin v2.
    let zipkin = http
        .post("http://127.0.0.1:9411/api/v2/spans")
        .json(&json!([{
            "traceId": "463ac35c9f6413ad48485a3953bb6124",
            "id": "a2fb4a1d1a96d312",
            "name": "test-span",
            "timestamp": 1700000000000000u64,
            "duration": 1000u64,
            "localEndpoint": { "serviceName": "cloudlysis-test" }
        }]))
        .send()
        .await
        .expect("zipkin post failed");
    assert!(
        zipkin.status().is_success(),
        "zipkin ingest failed: {}",
        zipkin.status()
    );
    // Query Loki back to ensure the line is retrievable (not just accepted).
    // Loki may need a short delay to index.
    let loki_deadline = Instant::now() + Duration::from_secs(30);
    let mut loki_ok = false;
    while Instant::now() < loki_deadline && !loki_ok {
        let q = http
            .get("http://127.0.0.1:3100/loki/api/v1/query")
            .query(&[("query", r#"{app="cloudlysis-test"}"#)])
            .send()
            .await
            .expect("loki query failed");
        if q.status().is_success() {
            let v: serde_json::Value = q.json().await.expect("invalid loki query json");
            // We only need to see any non-empty result.
            let has = v
                .get("data")
                .and_then(|d| d.get("result"))
                .and_then(|r| r.as_array())
                .is_some_and(|a| !a.is_empty());
            if has {
                loki_ok = true;
                break;
            }
        }
        tokio::time::sleep(Duration::from_millis(500)).await;
    }
    // Query Tempo back by trace id (Zipkin traceId used above).
    let tempo_deadline = Instant::now() + Duration::from_secs(30);
    let mut tempo_ok = false;
    while Instant::now() < tempo_deadline && !tempo_ok {
        let res = http
            .get("http://127.0.0.1:3200/api/traces/463ac35c9f6413ad48485a3953bb6124")
            .send()
            .await
            .expect("tempo get trace failed");
        if res.status().is_success() {
            tempo_ok = true;
            break;
        }
        tokio::time::sleep(Duration::from_millis(500)).await;
    }
    // Poll buckets until at least one object appears.
    // Backends flush to object storage asynchronously, hence the 45s window.
    let deadline = Instant::now() + Duration::from_secs(45);
    let mut loki_has_objects = false;
    let mut tempo_has_objects = false;
    while Instant::now() < deadline && (!loki_has_objects || !tempo_has_objects) {
        let loki_out = mc_ls_bucket(&base, "cloudlysis-loki");
        if loki_out.status.success() && !loki_out.stdout.is_empty() {
            loki_has_objects = true;
        }
        let tempo_out = mc_ls_bucket(&base, "cloudlysis-tempo");
        if tempo_out.status.success() && !tempo_out.stdout.is_empty() {
            tempo_has_objects = true;
        }
        if !loki_has_objects || !tempo_has_objects {
            tokio::time::sleep(Duration::from_millis(500)).await;
        }
    }
    // Tear the stack down before asserting so a failed assertion does not
    // leak running containers; `down` is best-effort.
    let _ = Command::new("docker")
        .args(["compose", "-f"])
        .arg(&base)
        .args(["-f"])
        .arg(&obs)
        .args(["-f"])
        .arg(&obs_s3)
        .args(["down", "-v"])
        .status();
    assert!(loki_has_objects, "expected Loki to write objects to MinIO");
    assert!(
        tempo_has_objects,
        "expected Tempo to write objects to MinIO"
    );
    assert!(loki_ok, "expected Loki query to return a result");
    assert!(tempo_ok, "expected Tempo to return the ingested trace");
}

View File

@@ -30,10 +30,12 @@ fn wait_for_tcp(addr: &str, timeout: Duration) -> bool {
}
#[test]
#[ignore]
fn observability_stack_reaches_healthy_state_fast() {
let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
assert_eq!(enabled.as_deref(), Some("1"));
if enabled.as_deref() != Some("1") {
eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker observability smoke test");
return;
}
let root = repo_root();
let compose = root.join("observability/docker-compose.yml");

View File

@@ -0,0 +1,116 @@
use api::s3_docs::{DocsConfig, DocsStore};
use uuid::Uuid;
/// True when every S3 connection variable these integration tests need is
/// set and non-blank. Used to gate the tests without requiring `--ignored`;
/// CI/local runs opt in by exporting the CONTROL_S3_* env vars.
fn s3_env_ready() -> bool {
    [
        "CONTROL_S3_ENDPOINT",
        "CONTROL_S3_ACCESS_KEY_ID",
        "CONTROL_S3_SECRET_ACCESS_KEY",
        "CONTROL_S3_BUCKET_DOCS",
    ]
    .iter()
    .all(|name| match std::env::var(name) {
        Ok(value) => !value.trim().is_empty(),
        Err(_) => false,
    })
}
/// Round-trips one document through the docs store: put → get → list → delete.
/// Skips unless the CONTROL_S3_* env vars are present (see S3_PLAN.md).
#[tokio::test]
async fn s3_docs_roundtrip_put_get_list_delete() {
    if !s3_env_ready() {
        eprintln!("skipping: missing S3 env (see S3_PLAN.md)");
        return;
    }
    let cfg = DocsConfig::from_env().expect("missing S3 env (see S3_PLAN.md)");
    let store = DocsStore::new(cfg)
        .await
        .expect("failed to init docs store");
    // Fresh UUIDs so concurrent/parallel test runs cannot collide on keys.
    let tenant_id = Uuid::new_v4().to_string();
    let doc_type = "test";
    let doc_id = Uuid::new_v4().to_string();
    let filename = "hello.txt";
    let key = store
        .key_for(&tenant_id, doc_type, &doc_id, filename)
        .expect("invalid key");
    store
        .put_for_tenant(
            &tenant_id,
            &key,
            b"hello".to_vec(),
            Some("text/plain".to_string()),
        )
        .await
        .expect("put failed");
    // Read it back and confirm the bytes survived unchanged.
    let (bytes, _ct) = store
        .get_bytes_for_tenant(&tenant_id, &key)
        .await
        .expect("get failed");
    assert_eq!(bytes, b"hello");
    // A listing under the tenant's prefix must include the object just written.
    let prefix = format!("{}{}", store.prefix(), tenant_id);
    let objects = store
        .list_for_tenant(&tenant_id, &format!("{prefix}/"))
        .await
        .expect("list failed");
    assert!(objects.iter().any(|o| o.key == key));
    // Clean up so repeated runs leave no residue in the shared bucket.
    store
        .delete_for_tenant(&tenant_id, &key)
        .await
        .expect("delete failed");
}
/// Verifies tenant isolation at the key-prefix level: an object written for
/// tenant A must appear under A's prefix and must NOT appear under tenant
/// B's prefix. Skips unless the CONTROL_S3_* env vars are present.
#[tokio::test]
async fn s3_docs_tenant_prefix_isolation() {
    if !s3_env_ready() {
        eprintln!("skipping: missing S3 env (see S3_PLAN.md)");
        return;
    }
    let cfg = DocsConfig::from_env().expect("missing S3 env (see S3_PLAN.md)");
    let store = DocsStore::new(cfg)
        .await
        .expect("failed to init docs store");
    // Two distinct random tenants; only tenant A receives an object.
    let tenant_a = Uuid::new_v4().to_string();
    let tenant_b = Uuid::new_v4().to_string();
    let doc_type = "test";
    let doc_id = Uuid::new_v4().to_string();
    let filename = "hello.txt";
    let key_a = store
        .key_for(&tenant_a, doc_type, &doc_id, filename)
        .expect("invalid key");
    store
        .put_for_tenant(
            &tenant_a,
            &key_a,
            b"hello-a".to_vec(),
            Some("text/plain".to_string()),
        )
        .await
        .expect("put failed");
    // List both tenant prefixes and compare membership of A's key.
    let prefix_a = format!("{}{tenant_a}/", store.prefix());
    let prefix_b = format!("{}{tenant_b}/", store.prefix());
    let objects_a = store
        .list_for_tenant(&tenant_a, &prefix_a)
        .await
        .expect("list a failed");
    let objects_b = store
        .list_for_tenant(&tenant_b, &prefix_b)
        .await
        .expect("list b failed");
    assert!(objects_a.iter().any(|o| o.key == key_a));
    assert!(!objects_b.iter().any(|o| o.key == key_a));
    // Clean up tenant A's object so repeated runs leave no residue.
    store
        .delete_for_tenant(&tenant_a, &key_a)
        .await
        .expect("delete failed");
}

View File

@@ -0,0 +1,36 @@
use std::{path::PathBuf, process::Command};
/// The repository root is two levels above this crate's manifest directory.
fn repo_root() -> PathBuf {
    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .ancestors()
        .nth(2)
        .expect("api crate should live under repo root")
        .to_path_buf()
}
/// aws-cli-backed checks are opt-in: enabled only when `CONTROL_TEST_AWSCLI`
/// is set and trims to exactly `"1"`.
fn is_enabled() -> bool {
    std::env::var("CONTROL_TEST_AWSCLI")
        .map(|value| value.trim() == "1")
        .unwrap_or(false)
}
/// Shells out to the repo's S3 verification script. Requires the aws cli
/// plus the S3_* environment variables; opt-in via CONTROL_TEST_AWSCLI=1.
#[test]
fn s3_docs_permissions_can_be_verified_with_aws_cli() {
    if !is_enabled() {
        eprintln!("skipping: set CONTROL_TEST_AWSCLI=1 to enable aws-cli S3 permission checks");
        return;
    }
    let script_path = repo_root().join("docker/scripts/s3_verify_docs.sh");
    let result = Command::new("sh")
        .arg(script_path)
        .output()
        .expect("failed to run s3_verify_docs.sh (requires aws cli and S3_* env)");
    let stdout = String::from_utf8_lossy(&result.stdout);
    let stderr = String::from_utf8_lossy(&result.stderr);
    assert!(
        result.status.success(),
        "s3 verify script failed: {stdout}\n{stderr}"
    );
}

View File

@@ -13,6 +13,7 @@ fn stack_files_parse_as_yaml() {
let root = repo_root();
for file in [
root.join("swarm/stacks/control-plane.yml"),
root.join("swarm/stacks/control-plane-prod.yml"),
root.join("swarm/stacks/observability.yml"),
] {
let raw = fs::read_to_string(&file).unwrap();
@@ -38,3 +39,36 @@ fn control_plane_stack_has_required_services() {
);
}
}
/// The prod stack must ship control-api and control-ui, must NOT bundle a
/// MinIO service, and must declare both S3 credential secrets as
/// `external: true` (provisioned out-of-band in the swarm).
#[test]
fn control_plane_prod_stack_has_control_api_and_external_s3_secrets() {
    let root = repo_root();
    let raw = fs::read_to_string(root.join("swarm/stacks/control-plane-prod.yml")).unwrap();
    let doc: serde_yaml::Value = serde_yaml::from_str(&raw).unwrap();
    let yaml_key = |name: &str| serde_yaml::Value::String(name.to_string());
    let services = doc
        .get("services")
        .and_then(|x| x.as_mapping())
        .expect("missing services");
    assert!(services.contains_key(yaml_key("control-api")));
    assert!(services.contains_key(yaml_key("control-ui")));
    assert!(
        !services.contains_key(yaml_key("minio")),
        "prod stack must not bundle MinIO"
    );
    let secrets = doc
        .get("secrets")
        .and_then(|x| x.as_mapping())
        .expect("missing secrets");
    for name in ["control_s3_access_key_id", "control_s3_secret_access_key"] {
        let entry = secrets
            .get(yaml_key(name))
            .unwrap_or_else(|| panic!("missing secret {name}"));
        let is_external = entry
            .get(yaml_key("external"))
            .and_then(|x| x.as_bool())
            .unwrap_or(false);
        assert!(is_external, "secret {name} must be external: true");
    }
}

View File

@@ -26,6 +26,48 @@ async function apiJson<T>(path: string): Promise<T> {
}
}
// GET JSON from the control API with extra request headers (e.g. x-tenant-id).
// The request is aborted after 5s; a bearer token is attached when one is stored.
async function apiJsonWithHeaders<T>(path: string, extra: HeadersInit): Promise<T> {
  const abort = new AbortController()
  const timer = window.setTimeout(() => abort.abort(), 5000)
  const bearer = getAccessToken()
  const authHeaders: HeadersInit = bearer ? { Authorization: `Bearer ${bearer}` } : {}
  const headers: HeadersInit = { ...authHeaders, ...extra }
  try {
    const res = await apiFetch(`${baseUrl()}${path}`, {
      headers,
      signal: abort.signal,
      useLastCorrelationId: true,
      useLastTraceparent: true,
    })
    return (await res.json()) as T
  } finally {
    window.clearTimeout(timer)
  }
}
// Wrap apiFetch with a 15s abort timeout, bearer auth (when a token is
// stored), and caller-supplied headers layered on top of `init.headers`.
async function apiFetchWithHeaders(path: string, init: RequestInit, extra: Record<string, string>) {
  const abort = new AbortController()
  const timer = window.setTimeout(() => abort.abort(), 15000)
  const merged = new Headers(init.headers)
  const bearer = getAccessToken()
  if (bearer) merged.set('authorization', `Bearer ${bearer}`)
  Object.entries(extra).forEach(([name, value]) => merged.set(name, value))
  try {
    return await apiFetch(`${baseUrl()}${path}`, {
      ...init,
      headers: merged,
      signal: abort.signal,
      useLastCorrelationId: true,
      useLastTraceparent: true,
    })
  } finally {
    window.clearTimeout(timer)
  }
}
async function apiPostJson<T>(path: string, body: unknown, idempotencyKey?: string): Promise<T> {
const controller = new AbortController()
const t = window.setTimeout(() => controller.abort(), 2000)
@@ -100,6 +142,65 @@ export function getFleetSnapshot(): Promise<FleetSnapshot> {
return apiJson('/admin/v1/fleet/snapshot')
}
// Categories of drift between desired and observed platform state.
export type DriftKind = 'missing' | 'extra' | 'unhealthy' | 'version_mismatch'
// Drift report: per-kind counts plus the individual findings.
export type DriftResponse = {
  summary: Record<string, number>
  items: Array<{ kind: DriftKind; service: string; details: unknown }>
}
// Fetch the current platform drift report.
export function getPlatformDrift(): Promise<DriftResponse> {
  return apiJson('/admin/v1/platform/drift')
}
// Config domains the control API currently exposes.
export type ConfigDomain = 'routing' | 'placement'
// One versioned config snapshot as returned by the API.
export type ConfigGetResponse = {
  domain: ConfigDomain
  revision: number
  source: unknown
  value: unknown
}
// List the config domains available on the server.
export function listConfigDomains(): Promise<{ domains: ConfigDomain[] }> {
  return apiJson('/admin/v1/config')
}
// Fetch the current value and revision for one config domain.
export function getConfig(domain: ConfigDomain): Promise<ConfigGetResponse> {
  return apiJson(`/admin/v1/config/${domain}`)
}
// Start an async job that validates a config draft; resolves to the job id.
export function startConfigValidateJob(args: {
  domain: ConfigDomain
  reason: string
  value: unknown
  idempotencyKey: string
}): Promise<{ job_id: string }> {
  const body = { domain: args.domain, reason: args.reason, value: args.value }
  return apiPostJson('/admin/v1/jobs/config/validate', body, args.idempotencyKey)
}
// Start an async job applying a config draft. `expectedRevision` enables
// server-side optimistic locking against concurrent edits.
export function startConfigApplyJob(args: {
  domain: ConfigDomain
  reason: string
  expectedRevision?: number
  value: unknown
  idempotencyKey: string
}): Promise<{ job_id: string }> {
  const body = {
    domain: args.domain,
    reason: args.reason,
    expected_revision: args.expectedRevision,
    value: args.value,
  }
  return apiPostJson('/admin/v1/jobs/config/apply', body, args.idempotencyKey)
}
// Start an async job rolling a config domain back to its last backup.
export function startConfigRollbackJob(args: {
  domain: ConfigDomain
  reason: string
  idempotencyKey: string
}): Promise<{ job_id: string }> {
  const body = { domain: args.domain, reason: args.reason }
  return apiPostJson('/admin/v1/jobs/config/rollback', body, args.idempotencyKey)
}
// Fetch the placement table for one workload kind.
export function getPlacement(kind: 'aggregate' | 'projection' | 'runner'): Promise<PlacementResponse> {
  return apiJson(`/admin/v1/placement/${kind}`)
}
@@ -177,3 +278,111 @@ export function getSwarmServices(): Promise<{ services: SwarmService[] }> {
// List the swarm tasks belonging to one service.
export function getSwarmTasks(serviceName: string): Promise<{ service: string; tasks: SwarmTask[] }> {
  return apiJson(`/admin/v1/swarm/services/${encodeURIComponent(serviceName)}/tasks`)
}
// One stored document object as returned by the docs listing endpoint.
export type DocumentObject = {
  key: string
  size: number
  last_modified?: string | null
}
// List a tenant's document objects, optionally filtered by key prefix.
export function listDocuments(args: { tenantId: string; prefix?: string }): Promise<{ objects: DocumentObject[] }> {
  const tenant = encodeURIComponent(args.tenantId)
  const query = args.prefix ? `?prefix=${encodeURIComponent(args.prefix)}` : ''
  return apiJsonWithHeaders(`/admin/v1/tenants/${tenant}/docs${query}`, { 'x-tenant-id': args.tenantId })
}
// Upload a document body through the API proxy (PUT); resolves to the
// stored key and the server-computed sha256 of the content.
export async function uploadDocument(args: {
  tenantId: string
  docType: string
  docId: string
  filename: string
  file: File
}): Promise<{ key: string; sha256: string }> {
  const segments = [
    encodeURIComponent(args.tenantId),
    'docs',
    encodeURIComponent(args.docType),
    encodeURIComponent(args.docId),
    encodeURIComponent(args.filename),
  ]
  const path = `/admin/v1/tenants/${segments.join('/')}`
  const res = await apiFetchWithHeaders(
    path,
    {
      method: 'PUT',
      headers: { 'content-type': args.file.type || 'application/octet-stream' },
      body: args.file,
    },
    { 'x-tenant-id': args.tenantId },
  )
  return (await res.json()) as { key: string; sha256: string }
}
// Download one document object as a Blob via the API proxy.
export async function downloadDocument(args: { tenantId: string; key: string }): Promise<Blob> {
  const path = `/admin/v1/tenants/${encodeURIComponent(args.tenantId)}/docs/object/${encodeURIComponent(args.key)}`
  const res = await apiFetchWithHeaders(path, { method: 'GET' }, { 'x-tenant-id': args.tenantId })
  return await res.blob()
}
// Permanently delete one document object via the API proxy.
export async function deleteDocument(args: { tenantId: string; key: string }): Promise<void> {
  const path = `/admin/v1/tenants/${encodeURIComponent(args.tenantId)}/docs/object/${encodeURIComponent(args.key)}`
  await apiFetchWithHeaders(path, { method: 'DELETE' }, { 'x-tenant-id': args.tenantId })
}
// A presigned S3 request handed back by the control API.
export type PresignResponse = {
  method: 'PUT' | 'GET'
  url: string
  key: string
}
// Ask the API for a presigned upload URL; the client then PUTs directly to S3.
export function presignUpload(args: {
  tenantId: string
  docType: string
  docId?: string
  filename: string
  contentType?: string
}): Promise<PresignResponse> {
  const path = `/admin/v1/tenants/${encodeURIComponent(args.tenantId)}/docs/presign/upload`
  const body = {
    doc_type: args.docType,
    doc_id: args.docId,
    filename: args.filename,
    content_type: args.contentType,
  }
  return apiPostJsonWithTenant(path, args.tenantId, body)
}
// Ask the API for a presigned download URL for an existing object key.
export function presignDownload(args: { tenantId: string; key: string }): Promise<PresignResponse> {
  const path = `/admin/v1/tenants/${encodeURIComponent(args.tenantId)}/docs/presign/download`
  return apiPostJsonWithTenant(path, args.tenantId, { key: args.key })
}
// POST JSON on behalf of a tenant: bearer auth plus x-tenant-id header,
// with a 5s abort timeout. Resolves to the parsed JSON response body.
async function apiPostJsonWithTenant<T>(path: string, tenantId: string, body: unknown): Promise<T> {
  const abort = new AbortController()
  const timer = window.setTimeout(() => abort.abort(), 5000)
  const bearer = getAccessToken()
  const authHeaders: HeadersInit = bearer ? { Authorization: `Bearer ${bearer}` } : {}
  const headers: HeadersInit = {
    'content-type': 'application/json',
    ...authHeaders,
    'x-tenant-id': tenantId,
  }
  try {
    const res = await apiFetch(`${baseUrl()}${path}`, {
      method: 'POST',
      headers,
      body: JSON.stringify(body),
      signal: abort.signal,
      useLastCorrelationId: true,
      useLastTraceparent: true,
    })
    return (await res.json()) as T
  } finally {
    window.clearTimeout(timer)
  }
}

View File

@@ -16,9 +16,11 @@ const navItems: NavItem[] = [
{ label: 'Roles & Permissions', to: '/roles-permissions' },
{ label: 'Config', to: '/config' },
{ label: 'Definitions', to: '/definitions' },
{ label: 'Documents', to: '/documents' },
{ label: 'Scale & Placement', to: '/scale-placement' },
{ label: 'Deployments', to: '/deployments' },
{ label: 'Observability', to: '/observability' },
{ label: 'Platform Drift', to: '/drift' },
{ label: 'Audit Log', to: '/audit-log' },
{ label: 'Settings', to: '/settings' },
]

View File

@@ -15,9 +15,11 @@ const paths = [
'/roles-permissions',
'/config',
'/definitions',
'/documents',
'/scale-placement',
'/deployments',
'/observability',
'/drift',
'/audit-log',
'/settings',
]

View File

@@ -6,10 +6,12 @@ import {
DefinitionsPage,
DeploymentDetailPage,
DeploymentsPage,
DocumentsPage,
JobPage,
NotFoundPage,
ObservabilityPage,
OverviewPage,
PlatformDriftPage,
RolesPermissionsPage,
ScalePlacementPage,
SessionsPage,
@@ -30,10 +32,12 @@ export const routes: RouteObject[] = [
{ path: 'roles-permissions', element: <RolesPermissionsPage /> },
{ path: 'config', element: <ConfigPage /> },
{ path: 'definitions', element: <DefinitionsPage /> },
{ path: 'documents', element: <DocumentsPage /> },
{ path: 'scale-placement', element: <ScalePlacementPage /> },
{ path: 'deployments', element: <DeploymentsPage /> },
{ path: 'deployments/:serviceName', element: <DeploymentDetailPage /> },
{ path: 'observability', element: <ObservabilityPage /> },
{ path: 'drift', element: <PlatformDriftPage /> },
{ path: 'audit-log', element: <AuditLogPage /> },
{ path: 'jobs/:jobId', element: <JobPage /> },
{ path: 'settings', element: <SettingsPage /> },

View File

@@ -9,6 +9,18 @@ import {
listAudit,
getSwarmServices,
getSwarmTasks,
listConfigDomains,
getConfig,
startConfigValidateJob,
startConfigApplyJob,
startConfigRollbackJob,
getPlatformDrift,
listDocuments,
uploadDocument,
downloadDocument,
deleteDocument,
presignUpload,
presignDownload,
startTenantDrainJob,
startTenantMigrateJob,
type FleetSnapshot,
@@ -18,6 +30,10 @@ import {
type AuditEvent,
type SwarmService,
type SwarmTask,
type DocumentObject,
type ConfigDomain,
type ConfigGetResponse,
type DriftResponse,
} from './api/control'
import { getAccessToken, setAccessToken } from './auth/token'
import { Button, Code, ErrorText, Modal, MutedText, Table, TextInput } from './components/primitives'
@@ -226,13 +242,443 @@ export function RolesPermissionsPage() {
}
// Admin UI for versioned platform config (routing/placement): loads the
// current revision, lets the operator edit the raw JSON, and launches
// validate/apply/rollback jobs. Apply passes the loaded revision as
// expected_revision so concurrent edits are rejected server-side.
//
// Fix: removed an unconditional `return <PageShell title="Config" />` at the
// top of the function that made the entire component body (all hooks and the
// editor UI) unreachable dead code.
export function ConfigPage() {
  const [domains, setDomains] = useState<ConfigDomain[] | undefined>(undefined)
  const [selected, setSelected] = useState<ConfigDomain>('routing')
  const [cfg, setCfg] = useState<ConfigGetResponse | undefined>(undefined)
  const [draft, setDraft] = useState('')
  const [reason, setReason] = useState('')
  const [error, setError] = useState<string | undefined>(undefined)
  const [busy, setBusy] = useState(false)
  const navigate = useNavigate()
  // Fresh idempotency key per job submission; falls back when
  // crypto.randomUUID is unavailable (older browsers / insecure contexts).
  function newIdempotencyKey() {
    if (typeof crypto !== 'undefined' && 'randomUUID' in crypto) return crypto.randomUUID()
    return `${Date.now()}-${Math.random().toString(16).slice(2)}`
  }
  // Load the available domains once on mount; default the selection to the
  // first server-provided domain when the current one is not offered.
  useEffect(() => {
    let cancelled = false
    listConfigDomains()
      .then((d) => {
        if (cancelled) return
        setDomains(d.domains)
        // NOTE(review): `selected` is read from the mount-time closure (deps
        // are []); fine for a once-on-mount default, but confirm if domains
        // can change at runtime.
        if (d.domains.length > 0 && !d.domains.includes(selected)) {
          setSelected(d.domains[0] ?? 'routing')
        }
      })
      .catch((e: unknown) => {
        if (cancelled) return
        setError(e instanceof Error ? e.message : 'failed to load domains')
      })
    return () => {
      cancelled = true
    }
  }, [])
  // Reload the selected domain's config and reset the JSON draft to it.
  async function refresh(domain: ConfigDomain) {
    setBusy(true)
    try {
      const c = await getConfig(domain)
      setCfg(c)
      setDraft(JSON.stringify(c.value ?? null, null, 2))
      setError(undefined)
    } catch (e: unknown) {
      setError(e instanceof Error ? e.message : 'failed to load config')
    } finally {
      setBusy(false)
    }
  }
  useEffect(() => {
    void refresh(selected)
  }, [selected])
  return (
    <PageShell title="Config">
      {error ? <ErrorText>{error}</ErrorText> : null}
      <div style={{ display: 'flex', flexDirection: 'column', gap: 12, maxWidth: 980 }}>
        <div style={{ display: 'flex', gap: 10, flexWrap: 'wrap', alignItems: 'flex-end' }}>
          <div style={{ display: 'flex', flexDirection: 'column', gap: 6 }}>
            <label htmlFor="domain" style={{ fontSize: 12, color: '#666' }}>
              Domain
            </label>
            <select
              id="domain"
              value={selected}
              onChange={(e) => setSelected(e.target.value as ConfigDomain)}
              style={{ padding: '8px 10px', borderRadius: 10, border: '1px solid #ddd' }}
              disabled={!domains || domains.length === 0}
            >
              {(domains ?? ['routing', 'placement']).map((d) => (
                <option key={d} value={d}>
                  {d}
                </option>
              ))}
            </select>
          </div>
          <Button onClick={() => void refresh(selected)} disabled={busy}>
            Refresh
          </Button>
          <div style={{ display: 'flex', flexDirection: 'column', gap: 6, minWidth: 320 }}>
            <label htmlFor="reason" style={{ fontSize: 12, color: '#666' }}>
              Reason (required for jobs)
            </label>
            <TextInput id="reason" value={reason} onChange={setReason} placeholder="why are you changing this?" />
          </div>
        </div>
        <MutedText>
          Current revision: <Code>{String(cfg?.revision ?? '')}</Code>
        </MutedText>
        <div style={{ display: 'flex', flexDirection: 'column', gap: 6 }}>
          <label htmlFor="json" style={{ fontSize: 12, color: '#666' }}>
            JSON
          </label>
          <textarea
            id="json"
            value={draft}
            onChange={(e) => setDraft(e.target.value)}
            spellCheck={false}
            style={{
              width: '100%',
              minHeight: 340,
              fontFamily: 'ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace',
              fontSize: 12,
              borderRadius: 12,
              border: '1px solid #ddd',
              padding: 12,
            }}
          />
        </div>
        {/* All three actions require a non-empty reason and navigate to the job page. */}
        <div style={{ display: 'flex', gap: 10, flexWrap: 'wrap' }}>
          <Button
            disabled={busy || reason.trim().length === 0}
            onClick={async () => {
              setBusy(true)
              try {
                const value = JSON.parse(draft || 'null') as unknown
                const job = await startConfigValidateJob({
                  domain: selected,
                  reason,
                  value,
                  idempotencyKey: newIdempotencyKey(),
                })
                navigate(`/jobs/${job.job_id}`)
              } catch (e: unknown) {
                setError(e instanceof Error ? e.message : 'validate failed')
              } finally {
                setBusy(false)
              }
            }}
          >
            Validate
          </Button>
          <Button
            disabled={busy || reason.trim().length === 0}
            onClick={async () => {
              setBusy(true)
              try {
                const value = JSON.parse(draft || 'null') as unknown
                const job = await startConfigApplyJob({
                  domain: selected,
                  reason,
                  expectedRevision: cfg?.revision,
                  value,
                  idempotencyKey: newIdempotencyKey(),
                })
                navigate(`/jobs/${job.job_id}`)
              } catch (e: unknown) {
                setError(e instanceof Error ? e.message : 'apply failed')
              } finally {
                setBusy(false)
              }
            }}
          >
            Apply
          </Button>
          <Button
            disabled={busy || reason.trim().length === 0}
            onClick={async () => {
              setBusy(true)
              try {
                const job = await startConfigRollbackJob({
                  domain: selected,
                  reason,
                  idempotencyKey: newIdempotencyKey(),
                })
                navigate(`/jobs/${job.job_id}`)
              } catch (e: unknown) {
                setError(e instanceof Error ? e.message : 'rollback failed')
              } finally {
                setBusy(false)
              }
            }}
          >
            Rollback (to last backup)
          </Button>
        </div>
      </div>
    </PageShell>
  )
}
export function DefinitionsPage() {
return <PageShell title="Definitions" />
}
// Tenant document browser: list a tenant's objects (optionally by prefix),
// upload new documents, download, and delete. Uploads/downloads can go
// through the API proxy or — when "Use presigned URLs" is checked — directly
// against S3 via presigned requests.
export function DocumentsPage() {
  const [tenantId, setTenantId] = useState('')
  const [docType, setDocType] = useState('deployments')
  const [docId, setDocId] = useState('')
  const [prefix, setPrefix] = useState('')
  const [file, setFile] = useState<File | undefined>(undefined)
  const [objects, setObjects] = useState<DocumentObject[] | undefined>(undefined)
  const [error, setError] = useState<string | undefined>(undefined)
  const [busy, setBusy] = useState(false)
  // Holds the key pending deletion while the confirmation modal is open.
  const [confirmDelete, setConfirmDelete] = useState<{ key: string } | undefined>(undefined)
  const [usePresign, setUsePresign] = useState(false)
  // Random doc id; falls back when crypto.randomUUID is unavailable.
  function newId(): string {
    if (typeof crypto !== 'undefined' && 'randomUUID' in crypto) return crypto.randomUUID()
    return `${Date.now()}-${Math.random().toString(16).slice(2)}`
  }
  // Re-list the tenant's objects; no-op when no tenant id is entered.
  async function refresh() {
    const tid = tenantId.trim()
    if (!tid) return
    setBusy(true)
    try {
      const d = await listDocuments({ tenantId: tid, prefix: prefix.trim() || undefined })
      setObjects(d.objects)
      setError(undefined)
    } catch (e: unknown) {
      setError(e instanceof Error ? e.message : 'failed to load')
    } finally {
      setBusy(false)
    }
  }
  return (
    <PageShell title="Documents">
      {error ? <ErrorText>{error}</ErrorText> : null}
      <div style={{ display: 'flex', flexDirection: 'column', gap: 14, maxWidth: 880 }}>
        {/* Tenant / prefix selection row. */}
        <div style={{ display: 'flex', gap: 12, flexWrap: 'wrap', alignItems: 'flex-end' }}>
          <div style={{ display: 'flex', flexDirection: 'column', gap: 6 }}>
            <label htmlFor="tenantId" style={{ fontSize: 12, color: '#666' }}>
              Tenant ID
            </label>
            <TextInput id="tenantId" value={tenantId} onChange={setTenantId} placeholder="uuid" />
          </div>
          <div style={{ display: 'flex', flexDirection: 'column', gap: 6 }}>
            <label htmlFor="prefix" style={{ fontSize: 12, color: '#666' }}>
              Prefix (optional)
            </label>
            <TextInput
              id="prefix"
              value={prefix}
              onChange={setPrefix}
              placeholder="e.g. deployments/"
            />
          </div>
          <Button onClick={refresh} disabled={busy || !tenantId.trim()}>
            Refresh
          </Button>
        </div>
        <div style={{ borderTop: '1px solid #eee', paddingTop: 12 }} />
        {/* Upload form: proxy upload by default, presigned PUT when enabled. */}
        <div style={{ display: 'flex', flexDirection: 'column', gap: 10 }}>
          <div style={{ display: 'flex', gap: 12, flexWrap: 'wrap', alignItems: 'flex-end' }}>
            <div style={{ display: 'flex', flexDirection: 'column', gap: 6 }}>
              <label htmlFor="docType" style={{ fontSize: 12, color: '#666' }}>
                Doc type
              </label>
              <TextInput id="docType" value={docType} onChange={setDocType} />
            </div>
            <div style={{ display: 'flex', flexDirection: 'column', gap: 6 }}>
              <label htmlFor="docId" style={{ fontSize: 12, color: '#666' }}>
                Doc id
              </label>
              <TextInput id="docId" value={docId} onChange={setDocId} placeholder="auto" />
            </div>
            <div style={{ display: 'flex', flexDirection: 'column', gap: 6 }}>
              <label htmlFor="file" style={{ fontSize: 12, color: '#666' }}>
                File
              </label>
              <input
                id="file"
                type="file"
                onChange={(e) => {
                  const f = e.target.files?.item(0) ?? undefined
                  setFile(f)
                }}
              />
            </div>
            <Button
              disabled={busy || !tenantId.trim() || !docType.trim() || !file}
              onClick={async () => {
                const tid = tenantId.trim()
                if (!tid || !file) return
                setBusy(true)
                try {
                  // An empty Doc id field means "generate one".
                  const id = docId.trim() || newId()
                  if (!usePresign) {
                    await uploadDocument({
                      tenantId: tid,
                      docType: docType.trim(),
                      docId: id,
                      filename: file.name,
                      file,
                    })
                  } else {
                    const p = await presignUpload({
                      tenantId: tid,
                      docType: docType.trim(),
                      docId: id,
                      filename: file.name,
                      contentType: file.type || 'application/octet-stream',
                    })
                    // NOTE(review): this direct-to-S3 PUT does not check
                    // res.ok, so a failed presigned upload is silent — confirm
                    // whether that is intended.
                    await fetch(p.url, {
                      method: 'PUT',
                      headers: { 'content-type': file.type || 'application/octet-stream' },
                      body: file,
                    })
                  }
                  setDocId(id)
                  await refresh()
                } catch (e: unknown) {
                  setError(e instanceof Error ? e.message : 'upload failed')
                } finally {
                  setBusy(false)
                }
              }}
            >
              {usePresign ? 'Upload (presigned)' : 'Upload'}
            </Button>
          </div>
          <div style={{ display: 'flex', gap: 8, alignItems: 'center' }}>
            <input
              id="usePresign"
              type="checkbox"
              checked={usePresign}
              onChange={(e) => setUsePresign(e.target.checked)}
            />
            <label htmlFor="usePresign" style={{ fontSize: 12, color: '#666' }}>
              Use presigned URLs (recommended for large files)
            </label>
          </div>
          <MutedText>
            Documents are stored under <Code>docs/&lt;tenant&gt;/&lt;type&gt;/&lt;id&gt;/&lt;filename&gt;</Code>.
          </MutedText>
        </div>
        <div style={{ borderTop: '1px solid #eee', paddingTop: 12 }} />
        {/* Results table; delete is deferred to the confirmation modal below. */}
        {!objects ? <div>{tenantId.trim() ? 'No data loaded.' : 'Enter a tenant id to list documents.'}</div> : null}
        {objects ? (
          <Table
            columns={['Key', 'Size', 'Last Modified', 'Actions']}
            rows={objects.map((o) => [
              <Code key="k">{o.key}</Code>,
              String(o.size ?? 0),
              o.last_modified ?? '',
              <div key="a" style={{ display: 'flex', gap: 8 }}>
                <Button
                  onClick={async () => {
                    const tid = tenantId.trim()
                    if (!tid) return
                    setBusy(true)
                    try {
                      const blob = !usePresign
                        ? await downloadDocument({ tenantId: tid, key: o.key })
                        : await (async () => {
                            const p = await presignDownload({ tenantId: tid, key: o.key })
                            const res = await fetch(p.url, { method: 'GET' })
                            return await res.blob()
                          })()
                      // Trigger a browser download via a temporary object URL.
                      const url = URL.createObjectURL(blob)
                      const a = document.createElement('a')
                      a.href = url
                      const name = o.key.split('/').slice(-1)[0] ?? 'download'
                      a.download = name
                      document.body.appendChild(a)
                      a.click()
                      a.remove()
                      URL.revokeObjectURL(url)
                    } finally {
                      setBusy(false)
                    }
                  }}
                  disabled={busy}
                >
                  {usePresign ? 'Download (presigned)' : 'Download'}
                </Button>
                <Button
                  disabled={busy}
                  onClick={() => {
                    setConfirmDelete({ key: o.key })
                  }}
                >
                  Delete
                </Button>
              </div>,
            ])}
          />
        ) : null}
      </div>
      {/* Delete confirmation modal; deletion only proceeds from here. */}
      <Modal
        open={!!confirmDelete}
        title="Confirm delete"
        onClose={() => setConfirmDelete(undefined)}
        footer={
          <div style={{ display: 'flex', gap: 10, justifyContent: 'flex-end' }}>
            <Button onClick={() => setConfirmDelete(undefined)} disabled={busy}>
              Cancel
            </Button>
            <Button
              disabled={busy || !tenantId.trim() || !confirmDelete}
              onClick={async () => {
                const tid = tenantId.trim()
                const k = confirmDelete?.key
                if (!tid || !k) return
                setBusy(true)
                try {
                  await deleteDocument({ tenantId: tid, key: k })
                  setConfirmDelete(undefined)
                  await refresh()
                } catch (e: unknown) {
                  setError(e instanceof Error ? e.message : 'delete failed')
                } finally {
                  setBusy(false)
                }
              }}
            >
              Delete permanently
            </Button>
          </div>
        }
      >
        <div style={{ display: 'flex', flexDirection: 'column', gap: 10 }}>
          <MutedText>
            Tenant: <Code>{tenantId.trim() || '(unset)'}</Code>
          </MutedText>
          <MutedText>
            Key: <Code>{confirmDelete?.key}</Code>
          </MutedText>
        </div>
      </Modal>
    </PageShell>
  )
}
export function ScalePlacementPage() {
const [aggregate, setAggregate] = useState<PlacementResponse | undefined>(undefined)
const [projection, setProjection] = useState<PlacementResponse | undefined>(undefined)
@@ -332,6 +778,53 @@ export function ObservabilityPage() {
return <PageShell title="Observability" />
}
// Read-only view of the platform drift report: a per-kind summary table plus
// one row per individual finding. Loads once on mount.
export function PlatformDriftPage() {
  const [report, setReport] = useState<DriftResponse | undefined>(undefined)
  const [loadError, setLoadError] = useState<string | undefined>(undefined)
  useEffect(() => {
    let cancelled = false
    getPlatformDrift()
      .then((d) => {
        if (!cancelled) {
          setLoadError(undefined)
          setReport(d)
        }
      })
      .catch((e: unknown) => {
        if (!cancelled) setLoadError(e instanceof Error ? e.message : 'failed to load')
      })
    return () => {
      cancelled = true
    }
  }, [])
  return (
    <PageShell title="Platform Drift">
      {loadError ? <ErrorText>{loadError}</ErrorText> : null}
      {!report ? <div>Loading</div> : null}
      {report ? (
        <div style={{ display: 'flex', flexDirection: 'column', gap: 16 }}>
          <Table
            columns={['Kind', 'Count']}
            rows={Object.entries(report.summary ?? {}).map(([kind, count]) => [kind, String(count)])}
          />
          <Table
            columns={['Kind', 'Service', 'Details']}
            rows={report.items.map((item, idx) => [
              item.kind,
              <Code key={`svc-${idx}`}>{item.service}</Code>,
              <pre key={`d-${idx}`} style={{ margin: 0, fontSize: 12, overflowX: 'auto' }}>
                {JSON.stringify(item.details, null, 2)}
              </pre>,
            ])}
          />
        </div>
      ) : null}
    </PageShell>
  )
}
export function AuditLogPage() {
const [data, setData] = useState<AuditEvent[] | undefined>(undefined)
const [error, setError] = useState<string | undefined>(undefined)

View File

@@ -6,6 +6,44 @@ services:
- "4222:4222"
- "8222:8222"
mailhog:
image: mailhog/mailhog:v1.0.1
ports:
- "1025:1025" # SMTP
- "8025:8025" # Web UI
minio:
image: minio/minio:RELEASE.2025-02-28T09-55-16Z
command: ["server", "/data", "--console-address", ":9001"]
environment:
MINIO_ROOT_USER: minioadmin
MINIO_ROOT_PASSWORD: minioadmin
ports:
- "9000:9000"
- "9001:9001"
volumes:
- minio_data:/data
minio-init:
image: minio/mc:RELEASE.2025-02-21T16-00-46Z
depends_on:
- minio
entrypoint:
- /bin/sh
- -c
command:
- |
set -euo pipefail
mc alias set local http://minio:9000 minioadmin minioadmin
mc mb -p local/cloudlysis-docs || true
mc mb -p local/cloudlysis-loki || true
mc mb -p local/cloudlysis-tempo || true
mc mb -p local/cloudlysis-docs-0 || true
mc mb -p local/cloudlysis-docs-1 || true
mc mb -p local/cloudlysis-docs-2 || true
mc anonymous set download local/cloudlysis-docs || true
echo "minio init done"
gateway:
build:
context: .
@@ -22,7 +60,7 @@ services:
GATEWAY_ROUTING_FILE: /config/routing.json
volumes:
- gateway_data:/data
- ./routing/dev.json:/config/routing.json:ro
- ./config/routing/dev.json:/config/routing.json:ro
ports:
- "8080:8080"
- "8081:8081"
@@ -86,6 +124,7 @@ services:
RUNNER_HTTP_ADDR: 0.0.0.0:8080
RUNNER_SAGA_MANIFEST_PATH: /config/sagas.yaml
RUNNER_EFFECTS_MANIFEST_PATH: /config/effects.yaml
RUNNER_SMTP_URL: smtp://mailhog:1025
volumes:
- runner_data:/data
- ./runner/config:/config:ro
@@ -99,13 +138,25 @@ services:
args:
PACKAGE: api
BIN: api
depends_on:
- minio-init
environment:
CONTROL_API_ADDR: 0.0.0.0:8080
CONTROL_GATEWAY_JWT_HS256_SECRET: dev_secret
CONTROL_PLACEMENT_PATH: /etc/control/placement.json
CONTROL_SWARM_STATE_PATH: /etc/control/swarm_state.json
CONTROL_SELF_URL: http://control-api:8080
CONTROL_S3_ENDPOINT: http://minio:9000
CONTROL_S3_PUBLIC_ENDPOINT: http://localhost:9000
CONTROL_S3_REGION: us-east-1
CONTROL_S3_ACCESS_KEY_ID: minioadmin
CONTROL_S3_SECRET_ACCESS_KEY: minioadmin
CONTROL_S3_FORCE_PATH_STYLE: "true"
CONTROL_S3_INSECURE: "true"
CONTROL_S3_BUCKET_DOCS: cloudlysis-docs-0,cloudlysis-docs-1,cloudlysis-docs-2
CONTROL_S3_PREFIX_DOCS: docs/
volumes:
- ./placement/dev.json:/etc/control/placement.json:ro
- ./config/placement/dev.json:/etc/control/placement.json:ro
- ./swarm/dev.json:/etc/control/swarm_state.json:ro
ports:
- "38080:8080"
@@ -119,8 +170,75 @@ services:
ports:
- "8082:80"
victoria-metrics:
image: victoriametrics/victoria-metrics:v1.120.0
profiles: ["observability"]
ports:
- "8428:8428"
command:
- "-retentionPeriod=30d"
volumes:
- victoria_metrics_data:/victoria-metrics-data
vmagent:
image: victoriametrics/vmagent:v1.120.0
profiles: ["observability"]
depends_on:
- victoria-metrics
ports:
- "8429:8429"
command:
- "-promscrape.config=/etc/vmagent/scrape.yml"
- "-remoteWrite.url=http://victoria-metrics:8428/api/v1/write"
volumes:
- ./observability/vmagent/scrape.yml:/etc/vmagent/scrape.yml:ro
loki:
image: grafana/loki:3.5.5
profiles: ["observability"]
ports:
- "3100:3100"
command:
- "-config.file=/etc/loki/config.yml"
volumes:
- ./observability/loki/config.yml:/etc/loki/config.yml:ro
- loki_data:/loki
tempo:
image: grafana/tempo:2.8.2
profiles: ["observability"]
ports:
- "3200:3200"
- "4317:4317"
- "4318:4318"
- "9411:9411"
command:
- "-config.file=/etc/tempo/config.yml"
volumes:
- ./observability/tempo/config.yml:/etc/tempo/config.yml:ro
- tempo_data:/var/tempo
grafana:
image: grafana/grafana:12.1.1
profiles: ["observability"]
depends_on:
- victoria-metrics
- loki
- tempo
ports:
- "3000:3000"
volumes:
- grafana_data:/var/lib/grafana
- ./observability/grafana/provisioning:/etc/grafana/provisioning:ro
- ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
volumes:
aggregate_data:
gateway_data:
projection_data:
runner_data:
minio_data:
grafana_data:
loki_data:
tempo_data:
victoria_metrics_data:

View File

@@ -0,0 +1,56 @@
#!/bin/sh
set -eu

# Applies an S3 lifecycle configuration to the docs bucket.
#
# This is an operator tool (it has side effects). It is still automatable and scriptable.
#
# Required env:
#   - S3_ENDPOINT
#   - S3_REGION
#   - S3_BUCKET_DOCS
#
# Optional env:
#   - S3_LIFECYCLE_JSON (path to JSON file; default: docs/usage/s3_lifecycle_docs_default.json)
#
# Usage:
#   export S3_ENDPOINT=...
#   export S3_REGION=...
#   export S3_BUCKET_DOCS=...
#   sh docker/scripts/s3_apply_lifecycle_docs.sh

# Exit with a clear message when a required variable is unset or empty.
need() {
  if [ -z "$(printenv "$1" 2>/dev/null || true)" ]; then
    echo "missing env: $1" >&2
    exit 2
  fi
}

for required in S3_ENDPOINT S3_REGION S3_BUCKET_DOCS; do
  need "$required"
done

if ! command -v aws >/dev/null 2>&1; then
  echo "missing dependency: aws (AWS CLI v2 recommended)" >&2
  exit 2
fi

# Skip IMDS lookups in containers/CI and pin the region for the CLI.
export AWS_EC2_METADATA_DISABLED=true
export AWS_DEFAULT_REGION="$S3_REGION"
export AWS_REGION="$S3_REGION"

S3_LIFECYCLE_JSON="${S3_LIFECYCLE_JSON:-docs/usage/s3_lifecycle_docs_default.json}"
if [ ! -f "$S3_LIFECYCLE_JSON" ]; then
  echo "missing lifecycle config file: $S3_LIFECYCLE_JSON" >&2
  exit 2
fi

aws s3api put-bucket-lifecycle-configuration \
  --endpoint-url "$S3_ENDPOINT" \
  --bucket "$S3_BUCKET_DOCS" \
  --lifecycle-configuration "file://$S3_LIFECYCLE_JSON" >/dev/null

echo "ok: applied lifecycle config to bucket $S3_BUCKET_DOCS"

View File

@@ -0,0 +1,89 @@
#!/bin/sh
set -eu
# Idempotently provisions the S3 docs bucket with sane defaults.
#
# This script is intended for CI/CD (Gitea Actions) or operator usage.
# It is safe to run repeatedly:
# - If the bucket exists, it will NOT recreate it.
# - It will (re)apply public-access-block and optional versioning/lifecycle.
#
# Required env:
# - S3_ENDPOINT
# - S3_REGION
# - S3_BUCKET_DOCS
#
# Optional env:
# - S3_ENABLE_VERSIONING (true/false; default false)
# - S3_LIFECYCLE_JSON (path; default docs/usage/s3_lifecycle_docs_default.json)
#
# Credentials:
# - AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY (or AWS_PROFILE)
#
# Notes:
# - Some S3-compatible providers ignore LocationConstraint; this script tries to be compatible.
need() {
name="$1"
val="$(printenv "$name" 2>/dev/null || true)"
if [ -z "$val" ]; then
echo "missing env: $name" >&2
exit 2
fi
}
need S3_ENDPOINT
need S3_REGION
need S3_BUCKET_DOCS
if ! command -v aws >/dev/null 2>&1; then
echo "missing dependency: aws (AWS CLI v2 recommended)" >&2
exit 2
fi
export AWS_EC2_METADATA_DISABLED=true
export AWS_DEFAULT_REGION="$S3_REGION"
export AWS_REGION="$S3_REGION"
endpoint_args="--endpoint-url=$S3_ENDPOINT"
bucket="$S3_BUCKET_DOCS"
echo "== ensure bucket exists =="
if aws s3api head-bucket $endpoint_args --bucket "$bucket" >/dev/null 2>&1; then
echo "bucket exists: $bucket"
else
# Try create-bucket without LocationConstraint first (works for many S3-compatible providers).
if aws s3api create-bucket $endpoint_args --bucket "$bucket" >/dev/null 2>&1; then
echo "created bucket: $bucket"
else
# Fallback for AWS-style regions.
aws s3api create-bucket $endpoint_args --bucket "$bucket" \
--create-bucket-configuration "LocationConstraint=$S3_REGION" >/dev/null
echo "created bucket (with location constraint): $bucket"
fi
fi
echo "== apply public access block =="
aws s3api put-public-access-block $endpoint_args --bucket "$bucket" --public-access-block-configuration \
"BlockPublicAcls=true,IgnorePublicAcls=true,BlockPublicPolicy=true,RestrictPublicBuckets=true" >/dev/null
S3_ENABLE_VERSIONING="${S3_ENABLE_VERSIONING:-false}"
if [ "$S3_ENABLE_VERSIONING" = "true" ] || [ "$S3_ENABLE_VERSIONING" = "1" ]; then
echo "== enable versioning =="
aws s3api put-bucket-versioning $endpoint_args --bucket "$bucket" --versioning-configuration Status=Enabled >/dev/null
fi
echo "== apply lifecycle (optional) =="
S3_LIFECYCLE_JSON="${S3_LIFECYCLE_JSON:-docs/usage/s3_lifecycle_docs_default.json}"
if [ -f "$S3_LIFECYCLE_JSON" ]; then
aws s3api put-bucket-lifecycle-configuration \
--endpoint-url "$S3_ENDPOINT" \
--bucket "$bucket" \
--lifecycle-configuration "file://$S3_LIFECYCLE_JSON" >/dev/null
else
echo "lifecycle file missing, skipping: $S3_LIFECYCLE_JSON" >&2
fi
echo "ok: provisioned bucket $bucket"

View File

@@ -0,0 +1,77 @@
#!/bin/sh
set -eu
# Verifies Control API S3 document storage permissions using `aws` CLI.
#
# This script is intentionally parameterized so it can run against Hetzner or any S3-compatible backend.
# It does NOT require Control API to be running; it validates the underlying bucket/prefix permissions.
#
# Required env:
# - S3_ENDPOINT (e.g. https://<hetzner-endpoint>)
# - S3_REGION
# - S3_BUCKET_DOCS
# Optional env:
# - S3_PREFIX_DOCS (default docs/)
# - S3_FORCE_PATH_STYLE (true/false; default false)
# - AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY (or AWS_PROFILE)
#
# Notes:
# - For S3-compatible providers, prefer `aws s3api` with `--endpoint-url`.
# - We set `AWS_EC2_METADATA_DISABLED=true` to avoid IMDS delays in containers/CI.
need() {
name="$1"
val="$(printenv "$name" 2>/dev/null || true)"
if [ -z "$val" ]; then
echo "missing env: $name" >&2
exit 2
fi
}
need S3_ENDPOINT
need S3_REGION
need S3_BUCKET_DOCS
S3_PREFIX_DOCS="${S3_PREFIX_DOCS:-docs/}"
case "$S3_PREFIX_DOCS" in
*/) ;;
*) S3_PREFIX_DOCS="${S3_PREFIX_DOCS}/" ;;
esac
S3_FORCE_PATH_STYLE="${S3_FORCE_PATH_STYLE:-false}"
if ! command -v aws >/dev/null 2>&1; then
echo "missing dependency: aws (AWS CLI v2 recommended)" >&2
exit 2
fi
export AWS_EC2_METADATA_DISABLED=true
export AWS_DEFAULT_REGION="$S3_REGION"
export AWS_REGION="$S3_REGION"
endpoint_args="--endpoint-url=$S3_ENDPOINT"
path_style_args=""
if [ "$S3_FORCE_PATH_STYLE" = "true" ] || [ "$S3_FORCE_PATH_STYLE" = "1" ]; then
path_style_args="--no-verify-ssl --cli-connect-timeout 10 --cli-read-timeout 30"
# NOTE: AWS CLI doesn't have a universal "force path style" flag for all s3api calls.
# For S3-compatible endpoints it generally works as long as the endpoint expects path-style.
# If your provider requires it and aws CLI fails, consider setting AWS_S3_FORCE_PATH_STYLE=1
# in newer CLIs or using s3cmd/minio client for validation.
fi
key="${S3_PREFIX_DOCS}smoke/$(date +%s)-$$.txt"
tmp="$(mktemp)"
trap 'rm -f "$tmp" >/dev/null 2>&1 || true' EXIT
printf "cloudlysis s3 verify\n" >"$tmp"
echo "== docs bucket head/list prefix =="
aws s3api head-bucket $endpoint_args --bucket "$S3_BUCKET_DOCS" >/dev/null
aws s3api list-objects-v2 $endpoint_args --bucket "$S3_BUCKET_DOCS" --prefix "$S3_PREFIX_DOCS" --max-items 1 >/dev/null
echo "== put/get/delete object under prefix =="
aws s3api put-object $endpoint_args --bucket "$S3_BUCKET_DOCS" --key "$key" --body "$tmp" >/dev/null
aws s3api get-object $endpoint_args --bucket "$S3_BUCKET_DOCS" --key "$key" /dev/null >/dev/null
aws s3api delete-object $endpoint_args --bucket "$S3_BUCKET_DOCS" --key "$key" >/dev/null
echo "ok: verified S3 docs permissions for s3://$S3_BUCKET_DOCS/$S3_PREFIX_DOCS"

View File

@@ -11,3 +11,7 @@ ensure_secret() {
}
ensure_secret grafana_admin_password "${GRAFANA_ADMIN_PASSWORD:-admin}"
# Control plane S3 document storage (dev defaults: MinIO in swarm/stacks/control-plane.yml).
ensure_secret control_s3_access_key_id "${CONTROL_S3_ACCESS_KEY_ID:-minioadmin}"
ensure_secret control_s3_secret_access_key "${CONTROL_S3_SECRET_ACCESS_KEY:-minioadmin}"

View File

@@ -36,3 +36,64 @@ curl -sS -X POST \
-H "authorization: Bearer <token>" \
http://localhost:8080/admin/runner/drain?wait_ms=0
```
## Document Storage via Control API (S3-backed)
List documents for a tenant (Control API uses UUID tenant ids):
```bash
curl -sS \
-H "authorization: Bearer <token>" \
-H "x-tenant-id: <tenant-uuid>" \
"http://localhost:38080/admin/v1/tenants/<tenant-uuid>/docs"
```
Upload a document (stores at `docs/<tenant>/<type>/<id>/<filename>`):
```bash
curl -sS -X PUT \
-H "authorization: Bearer <token>" \
-H "x-tenant-id: <tenant-uuid>" \
-H "content-type: application/octet-stream" \
--data-binary @./bundle.tar.gz \
"http://localhost:38080/admin/v1/tenants/<tenant-uuid>/docs/deployments/<doc-id>/bundle.tar.gz"
```
Download by object key (streamed proxy; key must belong to the tenant prefix):
```bash
curl -sS -o ./out.tar.gz \
-H "authorization: Bearer <token>" \
-H "x-tenant-id: <tenant-uuid>" \
"http://localhost:38080/admin/v1/tenants/<tenant-uuid>/docs/object/<url-encoded-key>"
```
Delete by object key (requires `control:write`):
```bash
curl -sS -X DELETE \
-H "authorization: Bearer <token>" \
-H "x-tenant-id: <tenant-uuid>" \
"http://localhost:38080/admin/v1/tenants/<tenant-uuid>/docs/object/<url-encoded-key>"
```
Presign upload (JSON body; returns `PUT` URL and `key`):
```bash
curl -sS -X POST \
-H "authorization: Bearer <token>" \
-H "content-type: application/json" \
-H "x-tenant-id: <tenant-uuid>" \
-d '{"doc_type":"deployments","doc_id":"<doc-id>","filename":"bundle.tar.gz","content_type":"application/gzip"}' \
"http://localhost:38080/admin/v1/tenants/<tenant-uuid>/docs/presign/upload"
```
Presign download (JSON body with full `key` under that tenant):
```bash
curl -sS -X POST \
-H "authorization: Bearer <token>" \
-H "content-type: application/json" \
-H "x-tenant-id: <tenant-uuid>" \
-d '{"key":"docs/<tenant-uuid>/deployments/<doc-id>/bundle.tar.gz"}' \
"http://localhost:38080/admin/v1/tenants/<tenant-uuid>/docs/presign/download"
```
Environment variables for the Control API (also accept `S3_*` names without the `CONTROL_` prefix; see `S3_PLAN.md`):
- `CONTROL_S3_ENDPOINT` — S3 API base URL used by the server client
- `CONTROL_S3_PUBLIC_ENDPOINT` — optional; host used in presigned URLs when browsers must reach a different host than the API (e.g. `localhost:9000` vs `minio:9000` in compose)
- `CONTROL_S3_REGION`, `CONTROL_S3_BUCKET_DOCS`, `CONTROL_S3_PREFIX_DOCS`, `CONTROL_S3_FORCE_PATH_STYLE`, `CONTROL_S3_INSECURE`
- Secrets may be mounted as files: `CONTROL_S3_ACCESS_KEY_ID_FILE`, `CONTROL_S3_SECRET_ACCESS_KEY_FILE`

View File

@@ -0,0 +1,11 @@
{
"Rules": [
{
"ID": "AbortIncompleteMultipartUploads",
"Status": "Enabled",
"AbortIncompleteMultipartUpload": {
"DaysAfterInitiation": 7
}
}
]
}

View File

@@ -0,0 +1,22 @@
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "DocsListPrefixOnly",
"Effect": "Allow",
"Action": ["s3:ListBucket"],
"Resource": "arn:aws:s3:::${S3_BUCKET_DOCS}",
"Condition": {
"StringLike": {
"s3:prefix": ["${S3_PREFIX_DOCS}*", "${S3_PREFIX_DOCS}"]
}
}
},
{
"Sid": "DocsObjectRWUnderPrefix",
"Effect": "Allow",
"Action": ["s3:GetObject", "s3:PutObject", "s3:DeleteObject"],
"Resource": "arn:aws:s3:::${S3_BUCKET_DOCS}/${S3_PREFIX_DOCS}*"
}
]
}

View File

@@ -0,0 +1,28 @@
services:
loki:
command:
- "-config.file=/etc/loki/config.s3.yml"
- "-config.expand-env=true"
environment:
# Loki expects a single URL for S3-compatible endpoints:
# s3://<access_key>:<secret_key>@<host>:<port>/<bucket>
LOKI_S3_URL: "s3://minioadmin:minioadmin@minio:9000/cloudlysis-loki"
volumes:
- ./loki/config.s3.yml:/etc/loki/config.s3.yml:ro
depends_on:
- minio-init
tempo:
command:
- "-config.file=/etc/tempo/config.s3.yml"
- "-config.expand-env=true"
environment:
TEMPO_S3_ENDPOINT: "minio:9000"
TEMPO_S3_BUCKET: "cloudlysis-tempo"
TEMPO_S3_ACCESS_KEY: "minioadmin"
TEMPO_S3_SECRET_KEY: "minioadmin"
volumes:
- ./tempo/config.s3.yml:/etc/tempo/config.s3.yml:ro
depends_on:
- minio-init

View File

@@ -36,6 +36,7 @@ services:
- "3200:3200"
- "4317:4317"
- "4318:4318"
- "9411:9411"
command:
- "-config.file=/etc/tempo/config.yml"
volumes:

View File

@@ -0,0 +1,30 @@
auth_enabled: false
server:
http_listen_port: 3100
common:
path_prefix: /loki
replication_factor: 1
ring:
kvstore:
store: inmemory
schema_config:
configs:
- from: 2025-01-01
store: tsdb
object_store: s3
schema: v13
index:
prefix: index_
period: 24h
storage_config:
aws:
# MinIO (docker compose). Use `-config.expand-env=true`.
s3: ${LOKI_S3_URL}
s3forcepathstyle: true
limits_config:
allow_structured_metadata: true

View File

@@ -0,0 +1,37 @@
server:
http_listen_port: 3200
distributor:
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
zipkin:
endpoint: 0.0.0.0:9411
ingester:
max_block_bytes: 1000000
trace_idle_period: 10s
compactor:
compaction:
block_retention: 24h
storage:
trace:
backend: s3
s3:
# MinIO (docker compose). Use `-config.expand-env=true`.
endpoint: ${TEMPO_S3_ENDPOINT}
bucket: ${TEMPO_S3_BUCKET}
access_key: ${TEMPO_S3_ACCESS_KEY}
secret_key: ${TEMPO_S3_SECRET_KEY}
insecure: true
overrides:
defaults:
metrics_generator:
processors: []

View File

@@ -9,6 +9,8 @@ distributor:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
zipkin:
endpoint: 0.0.0.0:9411
ingester:
max_block_bytes: 1000000

399
plans/SUBSCRIPTIONS_PLAN.md Normal file
View File

@@ -0,0 +1,399 @@
# Tenant Subscriptions Plan (1 Tenant = 1 Subscription)
## Principles
- Tenant-based billing is built-in and enforced consistently:
- Exactly one “primary” subscription per tenant.
- Subscription state is authoritative for entitlements.
- Provider-agnostic core with a single “billing provider” adapter:
- Stripe or Polar can be plugged in without rewriting the rest of the platform.
- Tasks are prioritized by ordering:
- Within each milestone, tasks are listed top-to-bottom in priority order.
- Each milestone is stop-the-line gated:
- All tasks completed
- All milestone tests pass
- Workspace verification commands pass
- Webhooks are treated as untrusted input:
- Verified signatures
- Idempotent processing
- No secrets are ever committed or logged
- Fluent development progression:
- Start with local-only, file-backed state + mocked provider
- Add real provider sandbox integration behind env-gated tests
- Add UI self-service once the state machine is stable
- Enforce entitlements only after billing state is reliable
## Goals
- Allow a tenant admin to self-serve billing:
- Start a subscription (checkout)
- Manage subscription and payment method (customer portal)
- View current plan and billing status
- Support Stripe or Polar as the billing backend.
- Provide a strict, test-gated integration that is safe to deploy incrementally.
- Keep API routes consistent with existing Control API conventions:
- Tenant-scoped routes are under `/admin/v1/tenants/{tenant_id}/...` and require auth + tenant header.
- Provider webhooks are unauthenticated but signature-verified.
## Non-Goals (Initial)
- Multiple subscriptions per tenant.
- Per-seat billing.
- Multiple concurrent plans per tenant.
- Usage-based metered billing (can be added later as a separate plan).
## Definitions
### Tenant
A logical customer boundary identified by `tenant_id` (UUID) and carried via the tenant header already used by Control API endpoints.
### Tenant Admin (Actor)
An authenticated principal with permission to manage billing for a tenant:
- Read: requires `control:read`
- Mutate (checkout/portal): requires `control:write`
### Subscription
The provider subscription object mapped 1:1 to a tenant, with a local cached state:
- `status`: `trialing | active | past_due | paused | canceled | incomplete`
- `plan`: internal plan identifier (maps to provider price/product)
- `current_period_end` / `cancel_at_period_end`
### Entitlements
An internal set of feature gates derived from the subscription plan and status:
- Examples: max deployments, max runners, S3 docs enabled, support tier, etc.
### Billing Provider
An adapter that supplies:
- Checkout session creation
- Portal session creation
- Webhook event verification + parsing
- Optional reconciliation reads (fetch subscription/customer state)
## Configuration Contract (Control API)
### Common Settings
- `CONTROL_BILLING_PROVIDER` = `stripe | polar`
- `CONTROL_BILLING_STATE_PATH` (default `billing/dev.json`)
- `CONTROL_BILLING_SELF_URL` (default `CONTROL_SELF_URL`, used for return URLs)
- `CONTROL_BILLING_ENFORCEMENT` = `0 | 1` (default `0`, gates entitlement enforcement)
- `CONTROL_BILLING_WEBHOOK_PUBLIC_URL` (optional; if unset, derive from `CONTROL_BILLING_SELF_URL`)
- `CONTROL_BILLING_ALLOWED_RETURN_ORIGINS` (comma-separated; optional safety check for return URLs)
### Stripe Settings (if provider = stripe)
- `CONTROL_STRIPE_SECRET_KEY` (secret)
- `CONTROL_STRIPE_WEBHOOK_SECRET` (secret)
- `CONTROL_STRIPE_PRICE_ID_<PLAN>` (e.g. `CONTROL_STRIPE_PRICE_ID_PRO`, env mapping per plan)
- Optional:
- `CONTROL_STRIPE_CUSTOMER_PORTAL_CONFIGURATION_ID`
### Polar Settings (if provider = polar)
- `CONTROL_POLAR_ACCESS_TOKEN` (secret)
- `CONTROL_POLAR_WEBHOOK_SECRET` (secret, if Polar provides webhook signing secret)
- `CONTROL_POLAR_PRODUCT_ID_<PLAN>` or equivalent plan mapping
## Data Model (MVP: File-Backed, Tenant-Scoped)
Persist subscription mappings in a JSON file, similar to `PlacementStore`'s atomic write pattern, to support:
- Local development without requiring a database
- Deterministic integration tests
- Simple operational inspection
*Note: For production, this should eventually adopt the `ConfigRegistry` pattern (e.g. backed by NATS KV) to avoid reliance on persistent file storage in Docker Swarm.*
Suggested persisted structure:
- `BillingStateFile`:
- `revision` (uuid-based)
- `tenants: { <tenant_id>: TenantBillingState }`
- `TenantBillingState`:
- `provider: stripe | polar`
- `provider_customer_id`
- `provider_subscription_id`
- `provider_checkout_session_id` (last initiated; optional)
- `status`
- `plan`
- `current_period_end`
- `cancel_at_period_end`
- `processed_webhook_event_ids` (bounded set; for idempotency)
- `updated_at`
Idempotency constraints:
- Webhook event IDs are stored per tenant, capped to a fixed size (e.g. last 256 IDs) to prevent unbounded growth.
- Updates are monotonic:
- prefer provider event timestamps to ignore out-of-order “older” state transitions.
## Target Architecture
### Control API (Rust)
- New billing routes:
- `GET /admin/v1/tenants/{tenant_id}/billing` (read current billing + entitlements)
- `POST /admin/v1/tenants/{tenant_id}/billing/checkout` (create checkout session URL)
- `POST /admin/v1/tenants/{tenant_id}/billing/portal` (create portal session URL)
- `POST /billing/v1/webhooks/{provider}` (provider webhook ingress; does not require auth)
- Billing policy enforcement:
- Entitlements derived server-side
- Per-endpoint enforcement can be introduced gradually behind a feature flag
### Control UI (Vite + React)
- New “Billing” page scoped to a tenant:
- Current plan + status
- “Upgrade / Subscribe” (checkout)
- “Manage billing” (portal)
- Clear error states when billing is not configured
## Provider Contract (Adapter Surface)
Define a small provider interface so the platform remains stable even if switching providers:
- `create_checkout_session(tenant_id, plan, return_url) -> url`
- `create_portal_session(tenant_id, return_url) -> url`
- `verify_and_parse_webhook(headers, body) -> BillingEvent`
- `apply_event(event) -> TenantBillingState mutation`
- Optional: `reconcile(tenant_id) -> TenantBillingState` (periodic correction)
Provider mapping requirements:
- Persist tenant identity at the provider level:
- Prefer setting `tenant_id` as provider customer metadata.
- If customer metadata is not available, store an internal mapping from `provider_customer_id -> tenant_id`.
- Ensure subscription creation is single-flight per tenant:
- Prevent duplicate active subscriptions by checking local state before creating new sessions.
- Use provider idempotency keys where supported (or internal idempotency per tenant+plan).
## Security & Abuse Controls
- AuthZ:
- Tenant routes require the existing tenant header to match the path tenant ID.
- `control:read` required for viewing billing status.
- `control:write` required for checkout and portal actions.
- Return URL safety:
- Only allow return URLs whose origin is in `CONTROL_BILLING_ALLOWED_RETURN_ORIGINS`.
- Default return URL points to Control UI, derived from `CONTROL_BILLING_SELF_URL`.
- Webhook safety & observability:
- Verify signatures before parsing payloads.
- Enforce JSON size limits on webhook bodies.
- Always return `2xx` for already-processed events (idempotency).
- Never log full webhook payloads.
- Propagate provider event IDs as `x-correlation-id` in logs and spans to integrate seamlessly with the platform's VictoriaMetrics/Loki/Tempo observability stack (as standard in `DEVELOPMENT_PLAN.md`).
## API Contract (MVP)
### GET /admin/v1/tenants/{tenant_id}/billing
Returns a stable shape whether billing is configured or not:
- `configured: bool`
- `provider: stripe | polar | null`
- `plan: string | null`
- `status: string | null`
- `current_period_end: string | null`
- `cancel_at_period_end: bool | null`
- `entitlements: { ... }`
### POST /admin/v1/tenants/{tenant_id}/billing/checkout
Request:
- `plan: string`
- `return_path: string` (optional; appended to `CONTROL_BILLING_SELF_URL`)
Response:
- `url: string`
### POST /admin/v1/tenants/{tenant_id}/billing/portal
Request:
- `return_path: string` (optional)
Response:
- `url: string`
### POST /billing/v1/webhooks/{provider}
Provider-defined payload; must:
- verify signature
- map to internal events
- update local billing state atomically
## Development Plan (Milestones by Dependency)
## Milestone 0: Billing Domain + Storage + Read API
### Dependencies
- None
### Goal
Ship a provider-agnostic billing domain model and a safe persistence mechanism without contacting Stripe/Polar yet.
### Tasks
- [x] Add billing domain types in Control API:
- [x] `Plan`, `SubscriptionStatus`, `Entitlements`
- [x] provider-agnostic `BillingEvent` enum for webhook mapping
- [x] Add `BillingStore` patterned after `PlacementStore`/`ConfigRegistry`:
- [x] atomic write (tmp + rename) for dev file fallback
- [x] in-process locking
- [x] stable JSON schema + `revision`
- [x] Add `GET /admin/v1/tenants/{tenant_id}/billing`:
- [x] permission gate: requires `control:read`
- [x] tenant header enforcement consistent with existing routes
- [x] returns “not configured” when no subscription exists
- [x] Add a mock billing provider for tests:
- [x] deterministic checkout/portal URLs
- [x] deterministic webhook events without real signatures
### Required Tests (Gate)
- [x] Workspace verification commands
- [x] Unit tests (Control API):
- [x] billing state read/write roundtrip (atomic update)
- [x] entitlement derivation from `status + plan`
- [x] tenant isolation checks for billing routes (header vs path mismatch)
- [x] permission gates: `control:read` vs `control:write`
## Milestone 1: Checkout Flow (Create Subscription)
### Dependencies
- Milestone 0
### Goal
Allow tenant admins to initiate a subscription via the provider's hosted checkout.
### Tasks
- [x] Add provider configuration parsing and validation:
- [x] strict env parsing with actionable errors
- [x] plan-to-price/product mapping via env
- [x] Add `POST /admin/v1/tenants/{tenant_id}/billing/checkout`:
- [x] permission gate: requires `control:write`
- [x] create or reuse provider customer for the tenant
- [x] create checkout session and return redirect URL
- [x] include tenant identifier in provider metadata (for webhook routing)
- [x] internal idempotency: do not create a new checkout if tenant already has an active/trialing subscription
- [x] Define return URL contract:
- [x] checkout success/cancel landing routes in Control UI
- [x] validate `return_path` against `CONTROL_BILLING_ALLOWED_RETURN_ORIGINS`
### Required Tests (Gate)
- [x] Workspace verification commands
- [x] Unit tests (Control API):
- [x] config validation (missing keys, invalid mapping)
- [x] provider request construction (return URLs, metadata)
- [x] checkout idempotency rules per tenant
- [x] Env-gated integration tests (sandbox; auto-skip unless env vars are set):
- [x] `CONTROL_TEST_STRIPE=1` or `CONTROL_TEST_POLAR=1` starts checkout and returns a valid URL
- [x] tenant metadata roundtrips through the provider (where supported)
## Milestone 2: Webhook Ingestion + Subscription State Sync
### Dependencies
- Milestone 1
### Goal
Make subscription state reliable and idempotent by processing provider webhooks.
### Tasks
- [x] Add `POST /billing/v1/webhooks/{provider}` endpoint:
- [x] signature verification
- [x] event parsing to `BillingEvent`
- [x] idempotency by provider event ID
- [x] tenant mapping via provider metadata or stored `provider_customer_id`
- [x] Map provider statuses to internal `SubscriptionStatus`:
- [x] `trialing`, `active`, `past_due`, `canceled`, etc.
- [x] Store updates in `BillingStore` and expose via `GET /tenants/{tenant_id}/billing`
- [x] ensure updates are monotonic (ignore older provider event timestamps)
### Required Tests (Gate)
- [x] Workspace verification commands
- [x] Unit tests (Control API):
- [x] webhook signature verification (good/bad signatures)
- [x] idempotency behavior (same event twice does not double-apply)
- [x] status mapping tables are stable
- [x] out-of-order events do not regress state
- [x] Docker/local integration (optional, if a provider CLI is used; env-gated):
- [x] `CONTROL_TEST_STRIPE_CLI=1` runs a local webhook-forward flow and verifies state update
## Milestone 3: Customer Portal (Self-Management)
### Dependencies
- Milestone 2
### Goal
Provide a “Manage billing” path for tenants to self-serve changes without operator involvement.
### Tasks
- [x] Add `POST /admin/v1/tenants/{tenant_id}/billing/portal`:
- [x] create provider portal session and return URL
- [x] ensure tenant ownership checks (header vs path)
- [x] permission gate: requires `control:write`
- [ ] Add Control UI billing page:
- [ ] show plan/status + renewal date
- [ ] “Subscribe / Upgrade” and “Manage billing” actions
- [ ] show “Billing not configured” when provider is disabled
### Required Tests (Gate)
- [x] Workspace verification commands
- [ ] UI unit tests (Vitest):
- [ ] billing page renders from mocked API state
- [ ] action buttons call the expected API endpoints
- [x] Env-gated integration tests:
- [x] portal session URL is generated and is HTTPS
## Milestone 4: Entitlements + Enforcement (Controlled Rollout)
### Dependencies
- Milestone 2 (Milestone 3 recommended for admin UX)
### Goal
Gate selected platform capabilities by tenant subscription state while maintaining a safe rollout path.
### Tasks
- [x] Define initial entitlement set and defaults:
- [x] choose “free/trial” behavior (read-only vs limited capability)
- [x] define grace period behavior for `past_due`
- [x] Add enforcement points in Control API:
- [x] middleware/helper to require entitlement per route
- [x] first enforcement target: a low-risk, tenant-scoped “write” capability
- [x] feature flag to disable enforcement globally during rollout
- [x] Add audit log entries for billing enforcement denials (no PII, no secrets)
### Required Tests (Gate)
- [x] Workspace verification commands
- [x] Unit tests (Control API):
- [x] entitlement checks per route return correct HTTP status
- [x] grace period handling
- [x] Integration tests:
- [x] a tenant without active subscription cannot perform the gated operation
- [x] an active tenant can perform the same operation
## Milestone 5: Reconciliation + Operational Hardening
### Dependencies
- Milestone 2
### Goal
Make billing state resilient against missed webhooks and operational drift.
### Tasks
- [x] Add a reconciliation job:
- [x] periodically fetch subscription state from provider for tenants
- [x] correct local state and emit audit entries
- [x] Add metrics:
- [x] webhook processing latency, verification failures, idempotency hits
- [x] tenant count by subscription status
- [x] Add robust error handling:
- [x] structured errors with safe messages
- [x] no provider payloads logged verbatim
- [x] Add provider API timeout/retry policy:
- [x] short timeouts with bounded retries
- [x] no retries on webhook signature failures
### Required Tests (Gate)
- [x] Workspace verification commands
- [x] Unit tests:
- [x] reconciliation updates state correctly
- [x] provider errors do not corrupt local state
## Milestone 6: Production Rollout
### Dependencies
- Milestone 3 (recommended), Milestone 4 (if enforcing)
### Goal
Deploy billing in production with safe secret handling and verifiable smoke checks.
### Tasks
- [x] Provision provider configuration (operator):
- [x] create products/prices (Stripe) or products/plans (Polar)
- [x] configure webhook endpoint + secret
- [x] set up customer portal settings (Stripe) if used
- [x] Configure Swarm secrets and stack env:
- [x] provider API keys and webhook secret stored as Swarm secrets
- [x] `CONTROL_BILLING_PROVIDER`, `CONTROL_BILLING_STATE_PATH`
- [x] `CONTROL_BILLING_ALLOWED_RETURN_ORIGINS` set to production UI origins
- [x] Define rollback plan:
- [x] disable enforcement feature flag
- [x] keep billing read-only operational
### Required Tests (Gate)
- [x] Workspace verification commands
- [x] Production smoke (env-gated):
- [x] create checkout session for a test tenant
- [x] process a webhook event and verify tenant state updates
- [x] generate a portal session URL
## Workspace Verification Commands
- `cargo fmt --check`
- `cargo clippy --workspace --all-targets -- -D warnings`
- `cargo test --workspace`
- `cd control/ui && npm ci && npm run lint && npm run typecheck && npm run test && npm run build`

View File

@@ -0,0 +1,27 @@
# Production effects manifest: maps named effects to provider backends
# consumed by the runner (RUNNER_EFFECTS_MANIFEST_PATH).
effects:
  # No-op effect: placeholder target, useful for smoke tests.
  - name: noop
    provider: noop
    config: {}
  - name: send_email
    provider: email
    config:
      # Choose ONE backend for production.
      #
      # Option A) SMTP (recommended when you have an SMTP relay):
      backend: smtp
      # url_env names the environment variable that holds the connection URL
      # (the value itself is injected via the stack environment, not stored here).
      url_env: RUNNER_SMTP_URL
      #
      # Option B) Resend:
      # backend: resend
      # api_key_env: RESEND_API_KEY
      # from: "no-reply@example.com"
      #
      # Option C) Postmark:
      # backend: postmark
      # server_token_env: POSTMARK_SERVER_TOKEN
      # from: "no-reply@example.com"
      #
      # Option D) AWS SES:
      # backend: ses
      # region: "eu-central-1"
      # from: "no-reply@example.com"

View File

@@ -2,3 +2,8 @@ effects:
- name: noop
provider: noop
config: {}
- name: send_email
provider: email
config:
backend: smtp
url_env: RUNNER_SMTP_URL

View File

@@ -0,0 +1,42 @@
# Rollback Plan: Billing Enforced Gating
This document outlines the emergency procedure for disabling subscription-based entitlement gating in the Cloudlysis Control Plane.
## Symptoms
- Tenants receiving `402 Payment Required` errors even with valid active subscriptions.
- `JobEngine` refusing valid configuration updates due to incorrect resource limit enforcement.
- S3 Document Storage being inaccessible for authorized Pro/Enterprise tenants.
## Emergency Rollback Steps
### 1. Disable Global Enforcement
The quickest way to restore service is to disable enforcement via the environment variable toggle. This preserves all billing data and synchronization logic but bypasses the "Payment Required" blocks.
```bash
# In your Swarm stack file (e.g. control-plane.yml):
services:
control-api:
environment:
- CONTROL_BILLING_ENFORCEMENT_ENABLED=false
```
### 2. Deploy the Update
Deploy the stack to apply the change:
```bash
docker stack deploy -c control-plane.yml control
```
### 3. Verify System State
Confirm that tenants can now perform previously blocked operations (e.g., uploading documents or updating deployment configurations).
## Forensic Analysis
Once the system is stable, perform the following:
1. **Check Reconciliation Logs**: Look for `failed to fetch subscription` or `failed to apply reconciled billing event`.
2. **Verify Metrics**: Check `billing_webhook_requests_total{status="error"}` in Prometheus.
3. **Audit Drift**: Compare the `CONTROL_BILLING_STATE_PATH` file content against the Stripe Dashboard for the affected `tenant_id`.
## Recovery
To re-enable gating (after the root cause is resolved):
1. Set `CONTROL_BILLING_ENFORCEMENT_ENABLED=true`.
2. Redeploy the stack.
3. Monitor logs and metrics for 30 minutes.

View File

@@ -0,0 +1,36 @@
#!/bin/bash
# sample-secrets.sh
#
# Provisions the Docker Swarm secrets required by the billing system.
#
# Usage:
#   echo "sk_test_..." | ./sample-secrets.sh
#
# 1. Stripe Secret Key (from Stripe Dashboard -> Developers -> API keys)
#    is read from stdin and stored as the 'control_stripe_secret_key' secret.
set -euo pipefail

# Refuse to run interactively: the key must be piped in, so it is never
# typed into a terminal or left in shell history by this script.
if [ -t 0 ]; then
  echo "Error: Please pipe the Stripe Secret Key into this script."
  echo "Example: echo \"sk_test_...\" | $0"
  exit 1
fi

# Command substitution strips trailing newlines from the piped input.
STRIPE_SK=$(cat -)

# Guard against an empty pipe (e.g. a misfired `echo "" | ...`).
if [ -z "$STRIPE_SK" ]; then
  echo "Error: received an empty Stripe Secret Key on stdin."
  exit 1
fi

echo "Creating 'control_stripe_secret_key' secret..."
# Use printf, not echo: echo would append a trailing newline, and that
# newline would become part of the stored secret value.
printf '%s' "$STRIPE_SK" | docker secret create control_stripe_secret_key -

# 2. Stripe Webhook Secret (from Stripe Dashboard -> Developers -> Webhooks -> [Endpoint])
#    Note: You get this after configuring the endpoint in the dashboard.
echo "NOTE: Remember to also create 'control_stripe_webhook_secret' once you have it."
# printf '%s' "whsec_..." | docker secret create control_stripe_webhook_secret -

echo "Done. Update your stack file to reference these secrets:"
echo "
services:
  control-api:
    secrets:
      - control_stripe_secret_key
      - control_stripe_webhook_secret
    environment:
      - CONTROL_STRIPE_SECRET_KEY_FILE=/run/secrets/control_stripe_secret_key
      - CONTROL_STRIPE_WEBHOOK_SECRET_FILE=/run/secrets/control_stripe_webhook_secret
"

View File

@@ -0,0 +1,79 @@
# Production Swarm stack for the control plane (API + UI).
# Deploy with: docker stack deploy -c <this file> control
version: "3.9"
services:
  control-api:
    image: ${IMAGE_PREFIX:-cloudlysis}/control-api:${IMAGE_TAG:-dev}
    environment:
      CONTROL_API_ADDR: "0.0.0.0:8080"
      CONTROL_PLACEMENT_PATH: "/etc/control/placement.json"
      CONTROL_SWARM_STATE_PATH: "/etc/control/swarm_state.json"
      CONTROL_SELF_URL: "${CONTROL_SELF_URL:-http://control-api:8080}"
      # S3 document storage (Hetzner Object Storage in production).
      # "${VAR:?missing}" makes deployment fail fast when a required
      # variable is unset, instead of starting with a broken config.
      CONTROL_S3_ENDPOINT: "${CONTROL_S3_ENDPOINT:?missing}"
      CONTROL_S3_PUBLIC_ENDPOINT: "${CONTROL_S3_PUBLIC_ENDPOINT:-}"
      CONTROL_S3_REGION: "${CONTROL_S3_REGION:?missing}"
      # Credentials are read from files mounted by Swarm secrets below;
      # the key material never appears in environment values.
      CONTROL_S3_ACCESS_KEY_ID_FILE: "/run/secrets/control_s3_access_key_id"
      CONTROL_S3_SECRET_ACCESS_KEY_FILE: "/run/secrets/control_s3_secret_access_key"
      CONTROL_S3_FORCE_PATH_STYLE: "${CONTROL_S3_FORCE_PATH_STYLE:-false}"
      CONTROL_S3_INSECURE: "${CONTROL_S3_INSECURE:-false}"
      CONTROL_S3_BUCKET_DOCS: "${CONTROL_S3_BUCKET_DOCS:?missing}"
      CONTROL_S3_PREFIX_DOCS: "${CONTROL_S3_PREFIX_DOCS:-docs/}"
    secrets:
      - control_s3_access_key_id
      - control_s3_secret_access_key
    configs:
      - source: control_placement
        target: /etc/control/placement.json
      - source: control_swarm_state
        target: /etc/control/swarm_state.json
    networks:
      - internal
    ports:
      - target: 8080
        published: 8080
        protocol: tcp
        mode: ingress
    deploy:
      replicas: 2
      restart_policy:
        condition: on-failure
      update_config:
        parallelism: 1
        # start-first brings a new replica up before stopping the old one,
        # so rolling updates never drop below capacity; failures roll back.
        order: start-first
        failure_action: rollback
  control-ui:
    image: ${IMAGE_PREFIX:-cloudlysis}/control-ui:${IMAGE_TAG:-dev}
    environment:
      # NOTE(review): VITE_* variables are typically baked in at image build
      # time by Vite — confirm the UI image actually reads this at runtime.
      VITE_CONTROL_API_URL: "${VITE_CONTROL_API_URL:-http://control-api:8080}"
    networks:
      - public
      - internal
    ports:
      - target: 80
        published: 8081
        protocol: tcp
        mode: ingress
    deploy:
      replicas: 2
      restart_policy:
        condition: on-failure
configs:
  # Paths are resolved relative to this stack file's directory.
  control_placement:
    file: ../../config/placement/dev.json
  control_swarm_state:
    file: ../../swarm/dev.json
secrets:
  # external: true — must be created out-of-band (docker secret create)
  # before deploying this stack.
  control_s3_access_key_id:
    external: true
  control_s3_secret_access_key:
    external: true
networks:
  public:
    driver: overlay
  internal:
    driver: overlay

View File

@@ -1,6 +1,37 @@
version: "3.9"
services:
  # Local S3-compatible object store for development stacks only.
  minio:
    image: minio/minio:RELEASE.2025-02-28T09-55-16Z
    command: ["server", "/data", "--console-address", ":9001"]
    environment:
      # Dev-only credentials; never reuse these outside a local stack.
      MINIO_ROOT_USER: minioadmin
      MINIO_ROOT_PASSWORD: minioadmin
    volumes:
      - minio_data:/data
    networks:
      - internal
    deploy:
      replicas: 1
  # One-shot init job: creates the docs bucket and enables anonymous reads.
  minio-init:
    image: minio/mc:RELEASE.2025-02-21T16-00-46Z
    networks:
      - internal
    # NOTE(review): "set -o pipefail" is not POSIX sh and fails on dash/ash —
    # confirm /bin/sh in the mc image supports it.
    # NOTE(review): "anonymous set download" makes the bucket world-readable
    # to anything on the network — presumably intended for dev only; verify.
    command:
      - /bin/sh
      - -c
      - |
        set -euo pipefail
        mc alias set local http://minio:9000 minioadmin minioadmin
        mc mb -p local/cloudlysis-docs || true
        mc anonymous set download local/cloudlysis-docs || true
        echo "minio init done"
    deploy:
      replicas: 1
      # Run once; do not restart the init container after it exits.
      restart_policy:
        condition: none
control-api:
image: ${IMAGE_PREFIX:-cloudlysis}/control-api:${IMAGE_TAG:-dev}
environment:
@@ -8,6 +39,18 @@ services:
CONTROL_PLACEMENT_PATH: "/etc/control/placement.json"
CONTROL_SWARM_STATE_PATH: "/etc/control/swarm_state.json"
CONTROL_SELF_URL: "http://control-api:8080"
CONTROL_S3_ENDPOINT: "${CONTROL_S3_ENDPOINT:-http://minio:9000}"
CONTROL_S3_PUBLIC_ENDPOINT: "${CONTROL_S3_PUBLIC_ENDPOINT:-}"
CONTROL_S3_REGION: "${CONTROL_S3_REGION:-us-east-1}"
CONTROL_S3_ACCESS_KEY_ID_FILE: "/run/secrets/control_s3_access_key_id"
CONTROL_S3_SECRET_ACCESS_KEY_FILE: "/run/secrets/control_s3_secret_access_key"
CONTROL_S3_FORCE_PATH_STYLE: "${CONTROL_S3_FORCE_PATH_STYLE:-true}"
CONTROL_S3_INSECURE: "${CONTROL_S3_INSECURE:-true}"
CONTROL_S3_BUCKET_DOCS: "${CONTROL_S3_BUCKET_DOCS:-cloudlysis-docs}"
CONTROL_S3_PREFIX_DOCS: "${CONTROL_S3_PREFIX_DOCS:-docs/}"
secrets:
- control_s3_access_key_id
- control_s3_secret_access_key
configs:
- source: control_placement_dev
target: /etc/control/placement.json
@@ -44,12 +87,21 @@ services:
configs:
control_placement_dev:
file: ../../placement/dev.json
file: ../../config/placement/dev.json
control_swarm_state_dev:
file: ../../swarm/dev.json
secrets:
control_s3_access_key_id:
external: true
control_s3_secret_access_key:
external: true
networks:
public:
driver: overlay
internal:
driver: overlay
volumes:
minio_data:

View File

@@ -89,6 +89,8 @@ services:
RUNNER_STORAGE_PATH: /data/runner.mdbx
RUNNER_SAGA_MANIFEST_PATH: /config/sagas.yaml
RUNNER_EFFECTS_MANIFEST_PATH: /config/effects.yaml
# For production, point this at a real relay (SMTP/Resend/Postmark/SES) via effects config.
RUNNER_SMTP_URL: "${RUNNER_SMTP_URL:-}"
volumes:
- runner_saga_data:/data
configs:
@@ -107,6 +109,7 @@ services:
RUNNER_HTTP_ADDR: 0.0.0.0:8081
RUNNER_STORAGE_PATH: /data/runner.mdbx
RUNNER_EFFECTS_MANIFEST_PATH: /config/effects.yaml
RUNNER_SMTP_URL: "${RUNNER_SMTP_URL:-}"
volumes:
- runner_effect_data:/data
configs: