diff --git a/.gitea/workflows/s3-provision.yml b/.gitea/workflows/s3-provision.yml new file mode 100644 index 0000000..32d9c3d --- /dev/null +++ b/.gitea/workflows/s3-provision.yml @@ -0,0 +1,45 @@ +name: s3-provision + +on: + workflow_dispatch: + +jobs: + provision-docs-bucket: + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + + - name: Install AWS CLI + run: | + sudo apt-get update + sudo apt-get install -y awscli + + - name: Validate required secrets + env: + AWS_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }} + S3_ENDPOINT: ${{ secrets.S3_ENDPOINT }} + S3_REGION: ${{ secrets.S3_REGION }} + S3_BUCKET_DOCS: ${{ secrets.S3_BUCKET_DOCS }} + run: | + test -n "$AWS_ACCESS_KEY_ID" + test -n "$AWS_SECRET_ACCESS_KEY" + test -n "$S3_ENDPOINT" + test -n "$S3_REGION" + test -n "$S3_BUCKET_DOCS" + + - name: Provision docs bucket (idempotent) + env: + AWS_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }} + S3_ENDPOINT: ${{ secrets.S3_ENDPOINT }} + S3_REGION: ${{ secrets.S3_REGION }} + S3_BUCKET_DOCS: ${{ secrets.S3_BUCKET_DOCS }} + S3_ENABLE_VERSIONING: ${{ secrets.S3_ENABLE_VERSIONING }} + S3_LIFECYCLE_JSON: docs/usage/s3_lifecycle_docs_default.json + S3_PREFIX_DOCS: docs/ + run: | + sh docker/scripts/s3_create_docs_bucket.sh + sh docker/scripts/s3_verify_docs.sh + diff --git a/Cargo.lock b/Cargo.lock index d40fdf6..87ea5ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13,13 +13,13 @@ name = "aggregate" version = "0.1.0" dependencies = [ "anyhow", - "async-nats", + "async-nats 0.39.0", "axum 0.7.9", "chrono", "edge-logger-client", "edge_storage", "futures", - "lru", + "lru 0.12.5", "prost 0.13.5", "protoc-bin-vendored", "query_engine", @@ -150,8 +150,15 @@ checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" name = "api" version = "0.1.0" dependencies = [ + "async-nats 0.42.0", + "async-trait", + "aws-config", + "aws-credential-types", + "aws-sdk-s3", "axum 0.8.8", "clap", + "futures", + "hex", "jsonwebtoken", "metrics 0.23.1", "metrics-exporter-prometheus 0.16.2", @@ -159,6 +166,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", + "sha2", "shared", "thiserror 2.0.18", "tokio", @@ -166,6 +174,8 @@ dependencies = [ "tower-http 0.6.8", "tracing", "tracing-subscriber", + "url", + "urlencoding", "uuid", ] @@ -229,6 +239,42 @@ dependencies = [ "url", ] +[[package]] +name = "async-nats" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f6da6d49a956424ca4e28fe93656f790d748b469eaccbc7488fec545315180" +dependencies = [ + "base64", + "bytes", + "futures", + "memchr", + "nkeys", + "nuid", + "once_cell", + "pin-project", + "portable-atomic", + "rand 0.8.5", + "regex", + "ring", + "rustls-native-certs 0.7.3", + "rustls-pemfile", + "rustls-webpki 0.102.8", + "serde", + "serde_json", + "serde_nanos", + "serde_repr", + "thiserror 1.0.69", + "time", + "tokio", + "tokio-rustls 0.26.4", + "tokio-util", + "tokio-websockets", + "tracing", + "tryhard", + "url", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -282,6 +328,8 @@ checksum = "11493b0bad143270fb8ad284a096dd529ba91924c5409adeac856cc1bf047dbc" dependencies = [ "aws-credential-types", "aws-runtime", + "aws-sdk-sso", + "aws-sdk-ssooidc", "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", @@ -292,11 +340,14 @@ dependencies = [ "aws-types", "bytes", "fastrand", + "hex", "http 1.4.0", + "sha1", "time", "tokio", 
"tracing", "url", + "zeroize", ] [[package]] @@ -342,6 +393,7 @@ dependencies = [ "aws-credential-types", "aws-sigv4", "aws-smithy-async", + "aws-smithy-eventstream", "aws-smithy-http", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -350,7 +402,9 @@ dependencies = [ "bytes", "bytes-utils", "fastrand", + "http 0.2.12", "http 1.4.0", + "http-body 0.4.6", "http-body 1.0.1", "percent-encoding", "pin-project-lite", @@ -358,6 +412,41 @@ dependencies = [ "uuid", ] +[[package]] +name = "aws-sdk-s3" +version = "1.127.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "151783f64e0dcddeb4965d08e36c276b4400a46caa88805a2e36d497deaf031a" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-checksums", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "bytes", + "fastrand", + "hex", + "hmac", + "http 0.2.12", + "http 1.4.0", + "http-body 1.0.1", + "lru 0.16.3", + "percent-encoding", + "regex-lite", + "sha2", + "tracing", + "url", +] + [[package]] name = "aws-sdk-sesv2" version = "1.117.0" @@ -382,6 +471,54 @@ dependencies = [ "tracing", ] +[[package]] +name = "aws-sdk-sso" +version = "1.97.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aadc669e184501caaa6beafb28c6267fc1baef0810fb58f9b205485ca3f2567" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-ssooidc" +version = "1.99.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1342a7db8f358d3de0aed2007a0b54e875458e39848d54cc1d46700b2bfcb0a8" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + [[package]] name = "aws-sdk-sts" version = "1.101.0" @@ -414,19 +551,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0b660013a6683ab23797778e21f1f854744fdf05f68204b4cca4c8c04b5d1f4" dependencies = [ "aws-credential-types", + "aws-smithy-eventstream", "aws-smithy-http", "aws-smithy-runtime-api", "aws-smithy-types", "bytes", + "crypto-bigint 0.5.5", "form_urlencoded", "hex", "hmac", "http 0.2.12", "http 1.4.0", + "p256", "percent-encoding", + "ring", "sha2", + "subtle", "time", "tracing", + "zeroize", ] [[package]] @@ -440,12 +583,45 @@ dependencies = [ "tokio", ] +[[package]] +name = "aws-smithy-checksums" +version = "0.64.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6750f3dd509b0694a4377f0293ed2f9630d710b1cebe281fa8bac8f099f88bc6" +dependencies = [ + "aws-smithy-http", + "aws-smithy-types", + "bytes", + "crc-fast", + "hex", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "md-5", + "pin-project-lite", + "sha1", + "sha2", + "tracing", +] + +[[package]] +name = "aws-smithy-eventstream" +version = "0.60.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"faf09d74e5e32f76b8762da505a3cd59303e367a664ca67295387baa8c1d7548" +dependencies = [ + "aws-smithy-types", + "bytes", + "crc32fast", +] + [[package]] name = "aws-smithy-http" version = "0.63.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba1ab2dc1c2c3749ead27180d333c42f11be8b0e934058fb4b2258ee8dbe5231" dependencies = [ + "aws-smithy-eventstream", "aws-smithy-runtime-api", "aws-smithy-types", "bytes", @@ -473,13 +649,21 @@ dependencies = [ "h2 0.3.27", "h2 0.4.13", "http 0.2.12", + "http 1.4.0", "http-body 0.4.6", "hyper 0.14.32", + "hyper 1.8.1", "hyper-rustls 0.24.2", + "hyper-rustls 0.27.7", + "hyper-util", "pin-project-lite", "rustls 0.21.12", + "rustls 0.23.37", "rustls-native-certs 0.8.3", + "rustls-pki-types", "tokio", + "tokio-rustls 0.26.4", + "tower 0.5.3", "tracing", ] @@ -562,6 +746,7 @@ dependencies = [ "base64-simd", "bytes", "bytes-utils", + "futures-core", "http 0.2.12", "http 1.4.0", "http-body 0.4.6", @@ -574,6 +759,8 @@ dependencies = [ "ryu", "serde", "time", + "tokio", + "tokio-util", ] [[package]] @@ -706,6 +893,12 @@ dependencies = [ "tracing", ] +[[package]] +name = "base16ct" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" + [[package]] name = "base32" version = "0.5.1" @@ -1157,6 +1350,33 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + +[[package]] +name = "crc-fast" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd92aca2c6001b1bf5ba0ff84ee74ec8501b52bbef0cac80bf25a6c1d87a83d" +dependencies = [ + "crc", + "digest", + "rustversion", + "spin", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -1200,6 +1420,28 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crypto-bigint" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef2b4b23cddf68b89b8f8069890e8c270d54e2d5fe1b143820234805e4cb17ef" +dependencies = [ + "generic-array", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "crypto-common" version = "0.1.7" @@ -1242,6 +1484,16 @@ version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea" +[[package]] +name = "der" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" +dependencies = [ + "const-oid", + "zeroize", +] + [[package]] name = "der" version = "0.7.10" @@ -1320,13 +1572,25 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" +[[package]] +name = "ecdsa" +version = "0.14.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" +dependencies = [ + "der 0.6.1", + "elliptic-curve", + "rfc6979", + "signature 1.6.4", +] + [[package]] name = "ed25519" version = "2.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" dependencies = [ - "signature", + "signature 2.2.0", ] [[package]] @@ -1338,7 +1602,7 @@ dependencies = [ "curve25519-dalek", "ed25519", "sha2", - "signature", + "signature 2.2.0", "subtle", ] @@ -1414,6 +1678,26 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "elliptic-curve" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" +dependencies = [ + "base16ct", + "crypto-bigint 0.4.9", + "der 0.6.1", + "digest", + "ff", + "generic-array", + "group", + "pkcs8 0.9.0", + "rand_core 0.6.4", + "sec1", + "subtle", + "zeroize", +] + [[package]] name = "email-encoding" version = "0.4.1" @@ -1477,6 +1761,16 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "ff" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "fiat-crypto" version = "0.2.9" @@ -1524,6 +1818,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "foreign-types" version = "0.3.2" @@ -1674,7 +1974,7 @@ version = "0.1.0" dependencies = [ "anyhow", "argon2", - "async-nats", + "async-nats 0.39.0", "async-trait", "axum 0.7.9", "base32", @@ -1768,6 +2068,17 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "group" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" +dependencies = [ + "ff", + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "gzip-header" version = "1.0.0" @@ -1841,7 +2152,7 @@ checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.1.5", ] [[package]] @@ -1849,6 +2160,11 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] [[package]] name = "heck" @@ -2532,6 +2848,15 @@ dependencies = [ "hashbrown 0.15.5", ] +[[package]] +name = "lru" +version = 
"0.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" +dependencies = [ + "hashbrown 0.16.1", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -2559,6 +2884,16 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + [[package]] name = "mdbx-sys" version = "13.11.0" @@ -3123,6 +3458,17 @@ dependencies = [ "unicode-id-start", ] +[[package]] +name = "p256" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" +dependencies = [ + "ecdsa", + "elliptic-curve", + "sha2", +] + [[package]] name = "parking_lot" version = "0.12.5" @@ -3300,14 +3646,24 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs8" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" +dependencies = [ + "der 0.6.1", + "spki 0.6.0", +] + [[package]] name = "pkcs8" version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" dependencies = [ - "der", - "spki", + "der 0.7.10", + "spki 0.7.3", ] [[package]] @@ -3379,7 +3735,7 @@ name = "projection" version = "0.1.0" dependencies = [ "anyhow", - "async-nats", + "async-nats 0.39.0", "axum 0.7.9", "chrono", "edge-logger-client", @@ -3937,6 +4293,17 @@ dependencies = [ "webpki-roots 1.0.6", ] +[[package]] +name = "rfc6979" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb" +dependencies = [ + "crypto-bigint 0.4.9", + "hmac", + "zeroize", +] + [[package]] name = "ring" version = "0.17.14" @@ -3985,7 +4352,7 @@ name = "runner" version = "0.1.0" dependencies = [ "anyhow", - "async-nats", + "async-nats 0.39.0", "aws-config", "aws-sdk-sesv2", "axum 0.7.9", @@ -4306,6 +4673,20 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "sec1" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" +dependencies = [ + "base16ct", + "der 0.6.1", + "generic-array", + "pkcs8 0.9.0", + "subtle", + "zeroize", +] + [[package]] name = "security-framework" version = "2.11.1" @@ -4535,12 +4916,22 @@ version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31" dependencies = [ - "pkcs8", + "pkcs8 0.10.2", "rand_core 0.6.4", - "signature", + "signature 2.2.0", "zeroize", ] +[[package]] +name = "signature" +version = "1.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + [[package]] name = "signature" version = "2.2.0" @@ -4625,6 +5016,22 
@@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "spin" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" + +[[package]] +name = "spki" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" +dependencies = [ + "base64ct", + "der 0.6.1", +] + [[package]] name = "spki" version = "0.7.3" @@ -4632,7 +5039,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" dependencies = [ "base64ct", - "der", + "der 0.7.10", ] [[package]] diff --git a/DOCKER.md b/DOCKER.md index 83ad3c0..e671f90 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -11,8 +11,15 @@ docker compose down -v To include the observability stack (Grafana/Loki/Tempo/VictoriaMetrics) with the local compose: ```bash -docker compose -f docker-compose.yml -f observability/docker-compose.yml up -d --build -docker compose -f docker-compose.yml -f observability/docker-compose.yml down -v +docker compose --profile observability up -d --build +docker compose --profile observability down -v +``` + +To use S3-compatible object storage (MinIO) for Loki + Tempo locally: + +```bash +docker compose -f docker-compose.yml -f observability/docker-compose.s3.yml --profile observability up -d --build +docker compose -f docker-compose.yml -f observability/docker-compose.s3.yml --profile observability down -v ``` Service ports in the default compose: @@ -23,8 +30,20 @@ Service ports in the default compose: - Runner HTTP: `http://localhost:28080` - Control API: `http://localhost:38080` - Control UI: `http://localhost:8082` +- MailHog SMTP: `smtp://localhost:1025` +- MailHog UI: `http://localhost:8025` +- MinIO S3 API: `http://localhost:9000` +- MinIO console: `http://localhost:9001` - NATS: `nats://localhost:4222`, monitoring `http://localhost:8222` +MinIO defaults: +- Credentials: `minioadmin` / `minioadmin` +- Bucket: `cloudlysis-docs-0,cloudlysis-docs-1,cloudlysis-docs-2` (comma-separated docs bucket set) + +Email defaults (local): +- Runner uses SMTP backend via `RUNNER_SMTP_URL=smtp://mailhog:1025` +- Inspect emails at MailHog UI `http://localhost:8025` + ## Swarm (Dev) Build images: @@ -56,6 +75,10 @@ Create dev secrets required by the observability stack: sh docker/scripts/swarm_dev_secrets.sh ``` +This also creates dev secrets used by the control plane for S3 document storage: +- `control_s3_access_key_id` +- `control_s3_secret_access_key` + Deploy: ```bash @@ -66,6 +89,60 @@ docker stack deploy -c swarm/stacks/control-plane.yml cloudlysis_control docker stack deploy -c swarm/stacks/observability.yml cloudlysis_obs ``` +Production-style control plane (no MinIO in stack; S3 is external): + +```bash +# create secrets (set CONTROL_S3_ACCESS_KEY_ID / CONTROL_S3_SECRET_ACCESS_KEY first) +sh docker/scripts/swarm_dev_secrets.sh + +# required env for the stack +export CONTROL_S3_ENDPOINT="https://" +export CONTROL_S3_REGION="eu-central-1" +export CONTROL_S3_BUCKET_DOCS="cloudlysis-docs" + +docker stack deploy -c swarm/stacks/control-plane-prod.yml cloudlysis_control +``` + +Verify production S3 bucket/prefix permissions with AWS CLI (env-gated): + +```bash +# install aws cli v2, then export creds and target +export S3_ENDPOINT="https://" +export S3_REGION="eu-central-1" +export S3_BUCKET_DOCS="cloudlysis-docs" +export 
S3_PREFIX_DOCS="docs/" + +# optionally set S3_FORCE_PATH_STYLE=true for some S3-compatible endpoints + +sh docker/scripts/s3_verify_docs.sh +``` + +Create/provision the docs bucket (idempotent; CI/CD-friendly): + +```bash +export S3_ENDPOINT="https://" +export S3_REGION="eu-central-1" +export S3_BUCKET_DOCS="cloudlysis-docs" + +# optional +# export S3_ENABLE_VERSIONING=true + +sh docker/scripts/s3_create_docs_bucket.sh +``` + +Apply a lifecycle policy to the docs bucket (operator; automated): + +```bash +export S3_ENDPOINT="https://" +export S3_REGION="eu-central-1" +export S3_BUCKET_DOCS="cloudlysis-docs" + +# optional: provide your own lifecycle JSON file +# export S3_LIFECYCLE_JSON="path/to/lifecycle.json" + +sh docker/scripts/s3_apply_lifecycle_docs.sh +``` + Remove: ```bash diff --git a/README.md b/README.md index be36f6d..b60e389 100644 --- a/README.md +++ b/README.md @@ -1,38 +1,55 @@ -# cloudlysis (monorepo) +# Cloudlysis (monorepo) -## Layout -- Rust services (Cargo workspace): `aggregate/`, `gateway/`, `projection/`, `runner/`, `control/api/`, `shared/` -- Control UI: `control/ui/` -- Docker + Swarm + Compose: `docker/`, `docker-compose.yml`, `swarm/`, `observability/` +Production-oriented, multi-service Rust workspace with an operator-facing Control Plane (API + Admin UI), S3-backed document storage, and an optional observability stack for local parity. -## Documentation -- docs/README.md -- Architecture: docs/architecture/overview.md, docs/architecture/transport.md -- Developer: docs/developer/setup.md, docs/developer/testing.md -- Usage: docs/usage/quickstart.md, docs/usage/api.md, docs/usage/nats.md -- Gitea Wiki: run `scripts/publish_gitea_wiki.sh` (publishes `wiki/` to the repo wiki) +## Quickstart (local dev) -## Quick Start (Docker Compose) +Core stack (includes MinIO + MailHog + Control Plane): ```bash docker compose up -d --build ``` -Full local stack with observability: +Full local stack with observability (Grafana/Loki/Tempo/VictoriaMetrics): ```bash -docker compose -f docker-compose.yml -f observability/docker-compose.yml up -d --build +docker compose --profile observability up -d --build ``` -## Commands -- `make compose-up`, `make compose-down` -- `make compose-up-observability`, `make compose-down-observability` -- `make docker-build-all` -- `make swarm-deploy-all`, `make swarm-rm-all` +Full local stack + Loki/Tempo using MinIO (S3 mode): -More details: `DOCKER.md` +```bash +docker compose -f docker-compose.yml -f observability/docker-compose.s3.yml --profile observability up -d --build +``` -## Workspace Verification +## Local endpoints +- **Control UI**: `http://localhost:8082` +- **Control API**: `http://localhost:38080` +- **Grafana** (observability profile): `http://localhost:3000` +- **MailHog UI**: `http://localhost:8025` (SMTP on `localhost:1025`) +- **MinIO console**: `http://localhost:9001` (S3 API on `localhost:9000`) + +## Repository layout (high level) +- **Rust services (Cargo workspace)**: `aggregate/`, `gateway/`, `projection/`, `runner/`, `control/api/`, `shared/` +- **Admin UI**: `control/ui/` +- **Docker / Swarm / Compose**: `docker/`, `docker-compose.yml`, `swarm/`, `observability/` + +## Production (overview) +- **Control plane Swarm stack**: `swarm/stacks/control-plane-prod.yml` +- **S3 docs buckets**: `CONTROL_S3_BUCKET_DOCS` supports a comma-separated shard set (e.g. `cloudlysis-docs-0,cloudlysis-docs-1,cloudlysis-docs-2`). Bucket selection is deterministic per-tenant; keep the full shard set stable to avoid remapping tenants. 
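The per-tenant bucket choice above is deterministic, but its implementation is not part of this diff. A minimal sketch of how such a selection could work, assuming the shard set is the parsed, ordered `CONTROL_S3_BUCKET_DOCS` list and a SHA-256 of the tenant id is folded into an index (the function name `docs_bucket_for_tenant` is illustrative, not the actual Control API code; `sha2`, `hex`, and `uuid` are already dependencies of `control/api`):

```rust
use sha2::{Digest, Sha256};
use uuid::Uuid;

/// Pick a docs bucket for a tenant from a stable, ordered shard set.
/// The mapping only stays stable if the shard list never changes order or length.
/// Hypothetical helper; the real selection logic lives in the Control API.
fn docs_bucket_for_tenant(shards: &[String], tenant_id: Uuid) -> Option<&str> {
    if shards.is_empty() {
        return None;
    }
    // Hash the canonical tenant id string and fold the first 8 digest bytes into a u64.
    let digest = Sha256::digest(tenant_id.to_string().as_bytes());
    let mut prefix = [0u8; 8];
    prefix.copy_from_slice(&digest[..8]);
    let idx = (u64::from_be_bytes(prefix) % shards.len() as u64) as usize;
    Some(shards[idx].as_str())
}

fn main() {
    // Shard set as it would be parsed from CONTROL_S3_BUCKET_DOCS.
    let shards: Vec<String> = "cloudlysis-docs-0,cloudlysis-docs-1,cloudlysis-docs-2"
        .split(',')
        .map(|s| s.to_string())
        .collect();
    let tenant = Uuid::new_v4();
    println!("{tenant} -> {:?}", docs_bucket_for_tenant(&shards, tenant));
}
```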
+- **S3 provisioning helpers** (idempotent scripts; CI/CD friendly): + - `docker/scripts/s3_create_docs_bucket.sh` + - `docker/scripts/s3_apply_lifecycle_docs.sh` + - `docker/scripts/s3_verify_docs.sh` + - Gitea Actions workflow: `.gitea/workflows/s3-provision.yml` + +## Docs +- **Docker / local dev / Swarm**: `DOCKER.md` +- **Developer docs**: `docs/developer/setup.md`, `docs/developer/testing.md` +- **Architecture**: `docs/architecture/overview.md`, `docs/architecture/transport.md` +- **Usage**: `docs/usage/quickstart.md`, `docs/usage/api.md`, `docs/usage/nats.md` + +## Workspace verification ```bash cargo fmt --check diff --git a/S3_PLAN.md b/S3_PLAN.md deleted file mode 100644 index 4b1867e..0000000 --- a/S3_PLAN.md +++ /dev/null @@ -1,187 +0,0 @@ -# S3-Compatible Object Storage Plan (Hetzner in Prod, MinIO Locally) - -## Principles -- S3-compatible object storage is mandatory for platform document storage in every environment: - - Local development uses MinIO. - - Production uses Hetzner Object Storage (S3 API compatible). -- Each milestone is stop-the-line gated: - - All tasks completed - - All milestone tests pass - - Workspace verification commands pass -- Secrets are never committed and never logged: - - Access keys via Swarm secrets in production - - `.env` or compose env in local dev - -## Goals -- Introduce a single, shared S3-compatible configuration surface for the platform. -- Make document storage always backed by S3 (no filesystem fallback for documents). -- Keep the implementation incremental and test-gated per milestone. -- Optionally expand to observability object storage after document storage is stable. - -## Definitions -### Document Storage -“Documents” are versioned blobs the platform needs to store and retrieve reliably: -- Deployment bundles and artifacts -- Definitions/manifests (projection programs, saga/effects definitions, schema bundles) -- Exported audit/log bundles, diagnostics, or snapshots that are not part of the primary KV/MDBX state - -Document storage must support: -- Tenant-scoped namespaces (prefixes) -- Content-addressed or versioned keys (immutability preferred) -- Listing by prefix for admin workflows - -## Configuration Contract (Platform-Wide) -### Common Settings -- `S3_ENDPOINT` (Hetzner: HTTPS endpoint; MinIO: `http://minio:9000`) -- `S3_REGION` (required even for some S3-compatible providers) -- `S3_ACCESS_KEY_ID` (secret) -- `S3_SECRET_ACCESS_KEY` (secret) -- `S3_FORCE_PATH_STYLE` (`true/false`) -- `S3_INSECURE` (`true/false`, only allowed for local MinIO) - -### Buckets and Prefixes -- `S3_BUCKET_DOCS` (required everywhere) -- `S3_PREFIX_DOCS` (default `docs/`) - -Optional (later milestones): -- `S3_BUCKET_LOKI`, `S3_PREFIX_LOKI` -- `S3_BUCKET_TEMPO`, `S3_PREFIX_TEMPO` - -## Target Architecture -### Local Development -- MinIO is part of the local stack for parity. -- Control API is the document gateway: - - Upload/download via signed URLs or streamed proxy endpoints - - Metadata stored in existing storage/KV (document index) or derived from key scheme - -### Production -- Hetzner Object Storage provides S3-compatible bucket(s). -- Credentials and bucket details injected via Swarm secrets and stack env. - -## Development Plan (Milestones by Dependency) - -## Milestone 0: S3 Contract + Local MinIO Baseline -### Dependencies -- None - -### Goal -Provide a consistent local S3-compatible endpoint and stable bucket naming to unblock higher milestones. 
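The helper scripts referenced above (and the MinIO baseline this removed plan describes) boil down to two checks: the bucket answers with the configured credentials, and objects can be written under the docs prefix. A minimal sketch with the `aws-sdk-s3` crate that `control/api` now depends on, assuming credentials come from the standard AWS env vars; the function name, the probe key, and the use of the SDK instead of the AWS CLI are assumptions for the example only:

```rust
use aws_config::{BehaviorVersion, Region};
use aws_sdk_s3::{config::Builder as S3ConfigBuilder, primitives::ByteStream, Client};

/// Minimal reachability + write check against an S3-compatible endpoint.
/// Illustrative only; the repository's verify script drives the AWS CLI instead.
async fn verify_docs_bucket(
    endpoint: &str,
    region: &str,
    bucket: &str,
    prefix: &str,
    force_path_style: bool,
) -> Result<(), aws_sdk_s3::Error> {
    // Credentials are resolved from the usual env/profile chain.
    let base = aws_config::defaults(BehaviorVersion::latest())
        .region(Region::new(region.to_string()))
        .load()
        .await;
    let s3_conf = S3ConfigBuilder::from(&base)
        .endpoint_url(endpoint)
        .force_path_style(force_path_style) // path-style addressing for MinIO-like endpoints
        .build();
    let client = Client::from_conf(s3_conf);

    // 1. Bucket reachable with the provided credentials.
    client.head_bucket().bucket(bucket).send().await?;

    // 2. Writable under the docs prefix (probe key is hypothetical).
    let key = format!("{prefix}.verify/probe.txt");
    client
        .put_object()
        .bucket(bucket)
        .key(&key)
        .body(ByteStream::from_static(b"probe"))
        .send()
        .await?;
    Ok(())
}
```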
- -### Tasks -- [ ] Add MinIO to local development stack: - - [ ] Add `minio` service to compose (API + console) - - [ ] Add `minio-init` job to create required buckets -- [ ] Define standard bucket/prefix defaults for local dev: - - [ ] `S3_BUCKET_DOCS=cloudlysis-docs` - - [ ] `S3_PREFIX_DOCS=docs/` -- [ ] Document local workflow to enable MinIO-backed document storage. - -### Required Tests (Gate) -- [ ] Workspace verification commands -- [ ] Local manual verification checklist: - - [ ] `cloudlysis-docs` bucket exists - - [ ] credentials work from a container in the compose network - -## Milestone 1: Document Storage API (Control API) -### Dependencies -- Milestone 0 - -### Goal -Make document storage a first-class platform API and require it in all environments. - -### Tasks -- [ ] Add an S3 client module to Control API: - - [ ] parse config from env with strict validation (endpoint, bucket, keys) - - [ ] support path-style and TLS/insecure options -- [ ] Implement document primitives: - - [ ] Put (upload) and Get (download) - - [ ] List by prefix (tenant + doc-type) - - [ ] Delete (admin-only) if needed -- [ ] Decide and document a key scheme: - - [ ] tenant-scoped prefix - - [ ] immutable keys preferred (content hash + metadata) -- [ ] Add authz rules for document operations (deny-by-default, tenant-scoped). - -### Required Tests (Gate) -- [ ] Workspace verification commands -- [ ] Unit tests: - - [ ] config parsing/validation - - [ ] key generation stability -- [ ] Gated integration tests (MinIO): - - [ ] put/get roundtrip - - [ ] list by prefix - - [ ] tenant isolation (cannot read other tenant prefix) - -## Milestone 2: Control UI Integration (Upload/Download Flows) -### Dependencies -- Milestone 1 - -### Goal -Make document workflows usable from the Control UI without leaking credentials. - -### Tasks -- [ ] Add Control API endpoints for signed URLs (recommended) or streamed proxy: - - [ ] create upload URL (PUT) - - [ ] create download URL (GET) -- [ ] Implement Control UI flows for a first document type: - - [ ] upload - - [ ] list - - [ ] download -- [ ] Ensure correlation/trace propagation on Control API operations. - -### Required Tests (Gate) -- [ ] Workspace verification commands -- [ ] Control UI unit tests for routing/component render stability -- [ ] Gated end-to-end checklist (local): - - [ ] upload appears in list - - [ ] download returns expected bytes - -## Milestone 3: Production Rollout (Hetzner) -### Dependencies -- Milestone 2 - -### Goal -Deploy document storage on Hetzner S3-compatible backend with production-grade secret handling. - -### Tasks -- [ ] Provision buckets and lifecycle policies (docs bucket): - - [ ] retention rules appropriate to documents - - [ ] access policy scoped to required actions -- [ ] Swarm deployment: - - [ ] add secrets for access keys - - [ ] configure Control API with endpoint/region/bucket/prefix -- [ ] Rollback plan: - - [ ] switch to a fallback bucket or MinIO-on-prod if needed - -### Required Tests (Gate) -- [ ] Workspace verification commands -- [ ] Production smoke runbook: - - [ ] upload/list/download for a tenant - - [ ] verify objects exist under expected prefixes - -## Milestone 4 (Optional): Observability Storage on S3 (Loki + Tempo) -### Dependencies -- Milestone 3 - -### Goal -Store logs and traces in S3-compatible storage (MinIO locally; Hetzner in production). 
- -### Tasks -- [ ] Loki: - - [ ] add S3 config variant and compose overlay - - [ ] validate log query and bucket objects -- [ ] Tempo: - - [ ] add S3 config variant and compose overlay - - [ ] validate traces and bucket objects - -### Required Tests (Gate) -- [ ] Workspace verification commands -- [ ] Gated local validation: - - [ ] Loki writes objects to bucket/prefix after ingest - - [ ] Tempo writes objects to bucket/prefix after ingest - -## Workspace Verification Commands -- `cargo fmt --check` -- `cargo clippy --workspace --all-targets -- -D warnings` -- `cargo test --workspace` -- `cd control/ui && npm ci && npm run lint && npm run typecheck && npm run test && npm run build` diff --git a/aggregate/src/runtime/executor.rs b/aggregate/src/runtime/executor.rs index 2c7d07f..1be8503 100644 --- a/aggregate/src/runtime/executor.rs +++ b/aggregate/src/runtime/executor.rs @@ -1,6 +1,7 @@ use serde_json::Value as JsonValue; use std::time::Duration; +#[allow(unreachable_code)] pub async fn execute_decide_program( state: &JsonValue, command: &JsonValue, @@ -28,6 +29,7 @@ pub async fn execute_decide_program( } } +#[allow(unreachable_code)] pub async fn execute_apply_program( state: &JsonValue, event: &JsonValue, @@ -60,11 +62,10 @@ async fn execute_decide_v8( state: &JsonValue, command: &JsonValue, program: &str, - gas_limit: u64, + _gas_limit: u64, timeout: Duration, ) -> Result, crate::types::AggregateError> { - use std::sync::Arc; - use v8::{Array, Context, Function, HandleScope, Isolate, Object, Scope, Script}; + use v8::{Context, ContextScope, Function, HandleScope, Isolate, Script}; let state_str = serde_json::to_string(state).map_err(|e| { crate::types::AggregateError::DecideError(format!("State serialization: {}", e)) @@ -73,47 +74,45 @@ async fn execute_decide_v8( crate::types::AggregateError::DecideError(format!("Command serialization: {}", e)) })?; + let program_owned = program.to_string(); let result = tokio::task::spawn_blocking(move || { let isolate = &mut Isolate::new(v8::CreateParams::default()); let scope = &mut HandleScope::new(isolate); - let context = Context::new(scope); + let context = Context::new(scope, v8::ContextOptions::default()); let scope = &mut ContextScope::new(scope, context); let source = - v8::String::new(scope, program).ok_or_else(|| "Failed to create program string")?; + v8::String::new(scope, &program_owned).ok_or("Failed to create program string")?; - let script = - Script::compile(scope, source, None).ok_or_else(|| "Failed to compile program")?; + let script = Script::compile(scope, source, None).ok_or("Failed to compile program")?; - script.run(scope).ok_or_else(|| "Failed to run program")?; + script.run(scope).ok_or("Failed to run program")?; let global = context.global(scope); let decide_name = - v8::String::new(scope, "decide").ok_or_else(|| "Failed to create decide string")?; + v8::String::new(scope, "decide").ok_or("Failed to create decide string")?; let decide_fn = global .get(scope, decide_name.into()) .and_then(|v| v8::Local::::try_from(v).ok()) - .ok_or_else(|| "decide function not found")?; + .ok_or("decide function not found")?; - let state_json = v8::String::new(scope, &state_str) - .ok_or_else(|| "Failed to create state JSON string")?; - let state_obj = - v8::json::parse(scope, state_json).ok_or_else(|| "Failed to parse state JSON")?; + let state_json = + v8::String::new(scope, &state_str).ok_or("Failed to create state JSON string")?; + let state_obj = v8::json::parse(scope, state_json).ok_or("Failed to parse state JSON")?; - let 
command_json = v8::String::new(scope, &command_str) - .ok_or_else(|| "Failed to create command JSON string")?; + let command_json = + v8::String::new(scope, &command_str).ok_or("Failed to create command JSON string")?; let command_obj = - v8::json::parse(scope, command_json).ok_or_else(|| "Failed to parse command JSON")?; + v8::json::parse(scope, command_json).ok_or("Failed to parse command JSON")?; - let args: [v8::Local; 2] = [state_obj.into(), command_obj.into()]; + let args: [v8::Local; 2] = [state_obj, command_obj]; let result = decide_fn .call(scope, global.into(), &args) - .ok_or_else(|| "decide function call failed")?; + .ok_or("decide function call failed")?; - let result_json = - v8::json::stringify(scope, result).ok_or_else(|| "Failed to stringify result")?; + let result_json = v8::json::stringify(scope, result).ok_or("Failed to stringify result")?; let result_str = result_json.to_rust_string_lossy(scope); let events: Vec = serde_json::from_str(&result_str) @@ -155,47 +154,43 @@ async fn execute_apply_v8( let _ = gas_limit; + let program_owned = program.to_string(); let result = tokio::task::spawn_blocking(move || { let isolate = &mut Isolate::new(v8::CreateParams::default()); let scope = &mut HandleScope::new(isolate); - let context = Context::new(scope); + let context = Context::new(scope, v8::ContextOptions::default()); let scope = &mut ContextScope::new(scope, context); let source = - v8::String::new(scope, program).ok_or_else(|| "Failed to create program string")?; + v8::String::new(scope, &program_owned).ok_or("Failed to create program string")?; - let script = - Script::compile(scope, source, None).ok_or_else(|| "Failed to compile program")?; + let script = Script::compile(scope, source, None).ok_or("Failed to compile program")?; - script.run(scope).ok_or_else(|| "Failed to run program")?; + script.run(scope).ok_or("Failed to run program")?; let global = context.global(scope); - let apply_name = - v8::String::new(scope, "apply").ok_or_else(|| "Failed to create apply string")?; + let apply_name = v8::String::new(scope, "apply").ok_or("Failed to create apply string")?; let apply_fn = global .get(scope, apply_name.into()) .and_then(|v| v8::Local::::try_from(v).ok()) - .ok_or_else(|| "apply function not found")?; + .ok_or("apply function not found")?; - let state_json = v8::String::new(scope, &state_str) - .ok_or_else(|| "Failed to create state JSON string")?; - let state_obj = - v8::json::parse(scope, state_json).ok_or_else(|| "Failed to parse state JSON")?; + let state_json = + v8::String::new(scope, &state_str).ok_or("Failed to create state JSON string")?; + let state_obj = v8::json::parse(scope, state_json).ok_or("Failed to parse state JSON")?; - let event_json = v8::String::new(scope, &event_str) - .ok_or_else(|| "Failed to create event JSON string")?; - let event_obj = - v8::json::parse(scope, event_json).ok_or_else(|| "Failed to parse event JSON")?; + let event_json = + v8::String::new(scope, &event_str).ok_or("Failed to create event JSON string")?; + let event_obj = v8::json::parse(scope, event_json).ok_or("Failed to parse event JSON")?; - let args: [v8::Local; 2] = [state_obj.into(), event_obj.into()]; + let args: [v8::Local; 2] = [state_obj, event_obj]; let result = apply_fn .call(scope, global.into(), &args) - .ok_or_else(|| "apply function call failed")?; + .ok_or("apply function call failed")?; - let result_json = - v8::json::stringify(scope, result).ok_or_else(|| "Failed to stringify result")?; + let result_json = v8::json::stringify(scope, result).ok_or("Failed 
to stringify result")?; let result_str = result_json.to_rust_string_lossy(scope); let new_state: JsonValue = serde_json::from_str(&result_str) @@ -250,6 +245,7 @@ async fn execute_apply_wasm( mod tests { use super::*; use serde_json::json; + use std::time::Duration; #[tokio::test] async fn no_runtime_returns_error() { @@ -257,7 +253,7 @@ mod tests { { let state = json!({}); let command = json!({}); - let result = + let result: Result, crate::types::AggregateError> = execute_decide_program(&state, &command, "program", 1000, Duration::from_secs(1)) .await; assert!(result.is_err()); diff --git a/placement/dev.json b/config/placement/dev.json similarity index 100% rename from placement/dev.json rename to config/placement/dev.json diff --git a/routing/dev.json b/config/routing/dev.json similarity index 100% rename from routing/dev.json rename to config/routing/dev.json diff --git a/control/DEVELOPMENT_PLAN.md b/control/DEVELOPMENT_PLAN.md index 4a588fe..1040d05 100644 --- a/control/DEVELOPMENT_PLAN.md +++ b/control/DEVELOPMENT_PLAN.md @@ -339,3 +339,119 @@ This plan is intentionally aligned with the style and gating discipline used in - verify Grafana dashboards provisioned and VictoriaMetrics receives samples - [x] **T7.3** End-to-end “control plane can see the fleet” test (requires docker) - UI/API can query placement + health snapshots for all services + +--- + +## Milestone 8: Config Registry + Safe Change Management (Plan/Apply/Rollback) + +**Goal:** Make configuration first-class, versioned, validated, and safely mutable from the control plane, while keeping production and development sources consistent. + +### Dependencies +- Milestone 2 (Control Plane API foundation) +- Milestone 5 (safe mutations baseline) +- Milestone 7 (Swarm deployment baseline) + +### Exit Criteria +- Operators can list, view, validate, and safely apply config changes with audit + idempotent jobs +- Config changes have revision semantics and are roll-backable +- Gatekeeper safety checks prevent applying invalid or unsafe configs + +### Tasks +- [x] **8.1** Inventory and classify configuration surfaces (platform-wide) + - classify as: static boot config (env/secrets), dynamic runtime config (KV), large immutable artifacts (S3/docs) + - map current sources per domain: + - Gateway routing config (`config/routing/dev.json` / production KV) + - Placement config (`config/placement/dev.json` / production KV) + - Runner definitions (effects/sagas) (documents/S3) and activation config (KV) + - Observability provisioning (Swarm configs + repo-managed assets) + - Control plane feature flags (KV) +- [~] **8.2** Define a Config Registry contract in the Control API + - **Implemented (initial)**: + - config identity: `{domain}` (routing|placement) + - metadata: `revision` (KV revision when using NATS), and `source` info (file vs nats) + - storage policy per config: `source=dev_file | nats_kv` + - **Still needed**: + - `{domain, name, scope}` and richer metadata (`updated_at`, `updated_by`, `sha256`) + - history API for KV-backed configs +- [x] **8.3** Implement config storage abstraction (dev + prod) + - dev: file-backed, atomic write (tmp + rename), hot-reload where applicable + - prod: NATS KV for dynamic configs (revisioned values + watch streams) + - consistent error model: decode/validate/source errors are distinguishable and safe +- [x] **8.4** Add read-only config APIs + - `GET /admin/v1/config` list domains + - `GET /admin/v1/config/{domain}` fetch current value + revision + source + - (history not implemented yet) +- [~] 
**8.5** Add validate/plan/apply/rollback mutation workflows as jobs + - **Implemented**: + - `POST /admin/v1/jobs/config/validate` (job, idempotency key required) + - `POST /admin/v1/jobs/config/apply` (job, idempotency key required, backup + apply) + - `POST /admin/v1/jobs/config/rollback` (job, idempotency key required, restore last backup) + - per-domain locking to avoid concurrent config mutations + - **Still needed**: + - `POST /admin/v1/plan/config/apply` deterministic plan (diff + impacted services) + - richer post-conditions (routing resolution sampling, fleet consistency checks, etc.) +- [~] **8.6** Implement initial config domains end-to-end + - **Gateway routing config**: + - implemented: schema validation via JSON decode + - still needed: semantic validation (tenant entries/shard directories/endpoints URL parsing) + sampled routing verification + - **Placement config**: + - implemented: schema validation via JSON decode + - still needed: semantic validation (targets non-empty, etc.) + fleet snapshot consistency checks +- [x] **8.7** Implement Admin UI “Config” page for safe operations + - list + view configs with revision/sha/audit linkage + - editor for JSON (and YAML when supported by the domain) + - validate button (server-side) and apply/rollback flows as jobs with reason required + +### Tests +- [x] **T8.1** Unit tests: config decode/encode stability for each config domain + - routing/placement decode is enforced by server-side validate job (schema-level) +- [ ] **T8.2** Unit tests: validation rejects unsafe configs with stable error codes/messages +- [ ] **T8.3** Unit tests: plan generation is deterministic for same inputs +- [x] **T8.4** Integration tests (env-gated): + - NATS KV config apply + rollback via Control API (requires `CONTROL_TEST_NATS=1` + `CONTROL_TEST_NATS_URL`) + - (Gateway route-resolution E2E verification still pending) +- [x] **T8.5** UI tests: config page renders, validate/apply/rollback flows navigate to job progress + +--- + +## Milestone 9: Control Node Management (Inventory, Drift, and Safer Ops) + +**Goal:** Improve how the control plane understands and manages the live control node and platform state: node inventory, config drift detection, and safer operational guardrails. 
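For reference, the 8.5 job endpoints are driven like the other admin jobs: a single POST with a required `idempotency-key` header that returns a `job_id`. A minimal client-side sketch, assuming the admin routes are mounted under `/admin/v1` (as the other endpoints in this plan are) and that a bearer token carrying `control:write` is accepted; the request fields follow the `ConfigApplyRequest` struct added in `control/api/src/admin.rs`, while the helper itself is illustrative:

```rust
use serde_json::json;

/// Submit a routing-config apply job and return the job id.
/// Retrying with the same idempotency key must not start a second job.
/// Hypothetical client helper; base URL, token, and reason are caller-supplied.
async fn apply_routing_config(
    base_url: &str,
    bearer_token: &str,
    idempotency_key: &str,
    new_value: serde_json::Value,
) -> Result<String, reqwest::Error> {
    let body = json!({
        "domain": "routing",
        "expected_revision": null, // or the revision read from GET /admin/v1/config/routing
        "reason": "update gateway shard endpoints",
        "value": new_value,
    });
    let resp = reqwest::Client::new()
        .post(format!("{base_url}/admin/v1/jobs/config/apply"))
        .bearer_auth(bearer_token)
        .header("idempotency-key", idempotency_key)
        .json(&body)
        .send()
        .await?
        .error_for_status()?
        .json::<serde_json::Value>()
        .await?;
    Ok(resp["job_id"].as_str().unwrap_or_default().to_string())
}
```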
+ +### Dependencies +- Milestone 7 (Swarm deployment baseline) +- Milestone 8 (config registry + safe change management) + +### Exit Criteria +- Control plane provides a reliable “what is running vs what should be running” view +- Config drift is detectable and actionable +- Core operational actions are guarded by preflight checks and produce audit trails + +### Tasks +- [x] **9.1** Define a “desired vs observed” model for platform state + - desired: Swarm stacks + config registry revisions + - observed: live service/task state + effective runtime configs + - drift categories: missing, extra, version mismatch, config mismatch, unhealthy +- [~] **9.2** Improve Swarm observation fidelity + - implemented (initial): docker-cli-backed Swarm observation (`CONTROL_SWARM_MODE=docker`) + - still needed: direct Docker API client (avoid shelling out), richer normalization, and wiring into production stacks + - keep file source as a dev fallback for deterministic tests + - normalize service identity: `{service, image_tag, git_sha, updated_at}` +- [x] **9.3** Add drift APIs and UI views + - `GET /admin/v1/platform/drift` returns drift summary + actionable items + - UI: “Platform Drift” page with filters and links to remediate jobs +- [ ] **9.4** Add safer operational guardrails as reusable checks + - preflight checks for: + - service unhealthy / crashloop + - tenant migration safety thresholds (lag/inflight) + - config apply safety (impact radius, sampled verify) + - consistent failure modes: clear reason + audit entry, no partial side effects +- [ ] **9.5** Add operational playbooks as executable checks + - post-deploy verification suite callable as an idempotent job + - rollback verification suite callable as an idempotent job + +### Tests +- [x] **T9.1** Unit tests: drift classification for synthetic desired/observed fixtures +- [x] **T9.2** Integration tests (docker-gated): drift view detects intentional mismatches in a local Swarm + - requires `CONTROL_TEST_DOCKER=1` and an active local Swarm node +- [x] **T9.3** UI tests: drift page renders in route smoke test diff --git a/control/api/Cargo.toml b/control/api/Cargo.toml index 15f5bb3..75f9e75 100644 --- a/control/api/Cargo.toml +++ b/control/api/Cargo.toml @@ -5,22 +5,32 @@ edition = "2024" publish = ["madapes"] [dependencies] +async-nats = "0.42.0" +async-trait = "0.1.89" axum = "0.8.6" +aws-config = { version = "1.8.6", features = ["behavior-version-latest"] } +aws-credential-types = "1.2.6" +aws-sdk-s3 = "1.106.0" clap = { version = "4.5.48", features = ["derive", "env"] } +futures = "0.3.31" jsonwebtoken = "9.3.1" metrics = "0.23.0" metrics-exporter-prometheus = "0.16.0" reqwest = { version = "0.12.23", default-features = false, features = ["json", "rustls-tls"] } serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.149" +sha2 = "0.10.9" +hex = "0.4.3" shared = { path = "../../shared" } thiserror = "2.0.16" tokio = { version = "1.45.0", features = ["macros", "net", "process", "rt-multi-thread", "signal", "time"] } tower-http = { version = "0.6.6", features = ["trace"] } tracing = "0.1.41" tracing-subscriber = { version = "0.3.20", features = ["env-filter"] } +url = "2.5.4" uuid = { version = "1.18.1", features = ["serde", "v4"] } [dev-dependencies] serde_yaml = "0.9.34" tower = "0.5.2" +urlencoding = "2.1.3" diff --git a/control/api/src/admin.rs b/control/api/src/admin.rs index d2ba4d0..cd55f12 100644 --- a/control/api/src/admin.rs +++ b/control/api/src/admin.rs @@ -1,7 +1,9 @@ use crate::{ AppState, RequestIds, 
auth::{Principal, has_permission}, - fleet, + config_registry::{ConfigDomain, ConfigRegistryError}, + config_schemas::RoutingConfig, + drift, fleet, job_engine::{JobEngine, StartJobError}, jobs::{Job, JobStatus, JobStep}, placement::{PlacementResponse, ServiceKind}, @@ -15,7 +17,9 @@ use axum::{ routing::{get, post}, }; use serde::Deserialize; +use sha2::Digest; use std::time::{SystemTime, UNIX_EPOCH}; +use url::Url; use uuid::Uuid; const HEADER_IDEMPOTENCY_KEY: &str = "idempotency-key"; @@ -25,21 +29,125 @@ pub fn admin_router() -> Router { Router::new() .route("/whoami", get(whoami)) .route("/platform/info", get(platform_info)) + .route("/platform/drift", get(platform_drift)) .route("/fleet/snapshot", get(fleet_snapshot)) .route("/tenants", get(list_tenants)) .route("/placement/{kind}", get(get_placement)) + .route("/config", get(list_config)) + .route("/config/{domain}", get(get_config)) + .route("/config/{domain}/history", get(get_config_history)) + .route("/jobs/platform/verify", post(start_platform_verify)) + .route("/jobs/config/validate", post(start_config_validate)) + .route("/jobs/config/apply", post(start_config_apply)) + .route("/jobs/config/rollback", post(start_config_rollback)) .route("/tenants/echo", get(tenant_echo)) + .route( + "/tenants/{tenant_id}/billing", + get(crate::billing::get_billing), + ) + .route( + "/tenants/{tenant_id}/billing/checkout", + post(crate::billing::checkout), + ) + .route( + "/tenants/{tenant_id}/billing/portal", + post(crate::billing::portal), + ) .route("/jobs/echo", post(create_echo_job)) .route("/jobs/{job_id}", get(get_job)) .route("/jobs/{job_id}/cancel", post(cancel_job)) .route("/jobs/tenant/drain", post(start_tenant_drain)) .route("/jobs/tenant/migrate", post(start_tenant_migrate)) .route("/plan/tenant/migrate", post(plan_tenant_migrate)) + .route("/plan/config/apply", post(plan_config_apply)) .route("/audit", get(list_audit)) .route("/swarm/services", get(list_swarm_services)) .route("/swarm/services/{name}/tasks", get(list_swarm_tasks)) } +#[derive(Debug, Deserialize)] +struct PlatformVerifyRequest { + reason: String, +} + +async fn start_platform_verify( + State(state): State, + headers: HeaderMap, + Extension(principal): Extension, + Json(body): Json, +) -> impl IntoResponse { + if !has_permission(&principal, "control:write") { + return StatusCode::FORBIDDEN.into_response(); + } + + let key = headers + .get(HEADER_IDEMPOTENCY_KEY) + .and_then(|v| v.to_str().ok()) + .ok_or(StatusCode::BAD_REQUEST); + let key = match key { + Ok(k) if !k.is_empty() => k, + _ => return StatusCode::BAD_REQUEST.into_response(), + }; + + let engine = JobEngine::new( + state.jobs.clone(), + state.audit.clone(), + state.tenant_locks.clone(), + state.config_locks.clone(), + ); + let job_id = match engine.start_platform_verify(state.clone(), &principal, body.reason, key) { + Ok(id) => id, + Err(StartJobError::TenantLocked) => return StatusCode::CONFLICT.into_response(), + }; + + ( + StatusCode::OK, + Json(serde_json::json!({ "job_id": job_id })), + ) + .into_response() +} + +async fn get_config_history( + State(state): State, + Path(domain): Path, + Extension(principal): Extension, +) -> impl IntoResponse { + if !has_permission(&principal, "control:read") { + return StatusCode::FORBIDDEN.into_response(); + } + + let domain = match domain.as_str() { + "routing" => ConfigDomain::Routing, + "placement" => ConfigDomain::Placement, + _ => return StatusCode::NOT_FOUND.into_response(), + }; + let Some(source) = state.config.source(domain) else { + return 
StatusCode::NOT_FOUND.into_response(); + }; + + let rows = match source.history_bytes(50).await { + Ok(items) => items + .into_iter() + .filter_map(|(rev, bytes)| { + let v = serde_json::from_slice::(&bytes).ok()?; + Some(serde_json::json!({ + "revision": rev, + "sha256": sha256_hex(&bytes), + "value": v + })) + }) + .collect::>(), + Err(ConfigRegistryError::Source(_)) => return StatusCode::BAD_GATEWAY.into_response(), + Err(_) => return StatusCode::NOT_IMPLEMENTED.into_response(), + }; + + ( + StatusCode::OK, + Json(serde_json::json!({ "domain": domain.as_str(), "items": rows })), + ) + .into_response() +} + async fn whoami(Extension(principal): Extension) -> impl IntoResponse { if !has_permission(&principal, "control:read") { return StatusCode::FORBIDDEN.into_response(); @@ -70,6 +178,18 @@ async fn platform_info(Extension(principal): Extension) -> impl IntoR .into_response() } +async fn platform_drift( + State(state): State, + Extension(principal): Extension, +) -> impl IntoResponse { + if !has_permission(&principal, "control:read") { + return StatusCode::FORBIDDEN.into_response(); + } + + let r = drift::compute(&state).await; + (StatusCode::OK, Json(r)).into_response() +} + async fn fleet_snapshot( State(state): State, Extension(principal): Extension, @@ -109,6 +229,434 @@ async fn get_placement( (StatusCode::OK, Json(resp)).into_response() } +async fn list_config( + State(state): State, + Extension(principal): Extension, +) -> impl IntoResponse { + if !has_permission(&principal, "control:read") { + return StatusCode::FORBIDDEN.into_response(); + } + + let domains: Vec<&'static str> = [ConfigDomain::Routing, ConfigDomain::Placement] + .into_iter() + .filter(|d| state.config.source(*d).is_some()) + .map(|d| d.as_str()) + .collect(); + + ( + StatusCode::OK, + Json(serde_json::json!({ "domains": domains })), + ) + .into_response() +} + +async fn get_config( + State(state): State, + Path(domain): Path, + Extension(principal): Extension, +) -> impl IntoResponse { + if !has_permission(&principal, "control:read") { + return StatusCode::FORBIDDEN.into_response(); + } + + let domain = match domain.as_str() { + "routing" => ConfigDomain::Routing, + "placement" => ConfigDomain::Placement, + _ => return StatusCode::NOT_FOUND.into_response(), + }; + + let Some(source) = state.config.source(domain) else { + return StatusCode::NOT_FOUND.into_response(); + }; + + let loaded = source.load_bytes().await; + let (bytes, revision) = match loaded { + Ok(x) => x, + Err(ConfigRegistryError::Source(_)) => return StatusCode::BAD_GATEWAY.into_response(), + Err(ConfigRegistryError::Decode(_)) => return StatusCode::BAD_REQUEST.into_response(), + Err(ConfigRegistryError::NotConfigured) => return StatusCode::NOT_FOUND.into_response(), + }; + + let json_value = match bytes { + Some(ref b) => match serde_json::from_slice::(b) { + Ok(v) => v, + Err(e) => { + return ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ "error": format!("invalid json: {e}") })), + ) + .into_response(); + } + }, + None => serde_json::Value::Null, + }; + + let sha256 = bytes.as_deref().map(sha256_hex); + + ( + StatusCode::OK, + Json(serde_json::json!({ + "domain": domain.as_str(), + "revision": revision, + "sha256": sha256, + "source": source.info(), + "value": json_value, + })), + ) + .into_response() +} + +#[derive(Debug, Deserialize)] +struct ConfigApplyRequest { + domain: String, + expected_revision: Option, + reason: String, + value: serde_json::Value, +} + +#[derive(Debug, Deserialize)] +struct ConfigValidateRequest { + domain: 
String, + reason: String, + value: serde_json::Value, +} + +#[derive(Debug, Deserialize)] +struct ConfigRollbackRequest { + domain: String, + reason: String, +} + +fn parse_domain(domain: &str) -> Option { + match domain { + "routing" => Some(ConfigDomain::Routing), + "placement" => Some(ConfigDomain::Placement), + _ => None, + } +} + +async fn start_config_validate( + State(state): State, + headers: HeaderMap, + Extension(principal): Extension, + Json(body): Json, +) -> impl IntoResponse { + if !has_permission(&principal, "control:write") { + return StatusCode::FORBIDDEN.into_response(); + } + + let key = headers + .get(HEADER_IDEMPOTENCY_KEY) + .and_then(|v| v.to_str().ok()) + .ok_or(StatusCode::BAD_REQUEST); + let key = match key { + Ok(k) if !k.is_empty() => k, + _ => return StatusCode::BAD_REQUEST.into_response(), + }; + + let Some(domain) = parse_domain(body.domain.as_str()) else { + return StatusCode::BAD_REQUEST.into_response(); + }; + + let engine = JobEngine::new( + state.jobs.clone(), + state.audit.clone(), + state.tenant_locks.clone(), + state.config_locks.clone(), + ); + let job_id = match engine.start_config_validate( + state.clone(), + &principal, + domain, + body.reason, + body.value, + key, + ) { + Ok(id) => id, + Err(StartJobError::TenantLocked) => return StatusCode::CONFLICT.into_response(), + }; + + ( + StatusCode::OK, + Json(serde_json::json!({ "job_id": job_id })), + ) + .into_response() +} + +async fn start_config_apply( + State(state): State, + headers: HeaderMap, + Extension(principal): Extension, + Json(body): Json, +) -> impl IntoResponse { + if !has_permission(&principal, "control:write") { + return StatusCode::FORBIDDEN.into_response(); + } + + let key = headers + .get(HEADER_IDEMPOTENCY_KEY) + .and_then(|v| v.to_str().ok()) + .ok_or(StatusCode::BAD_REQUEST); + let key = match key { + Ok(k) if !k.is_empty() => k, + _ => return StatusCode::BAD_REQUEST.into_response(), + }; + + let Some(domain) = parse_domain(body.domain.as_str()) else { + return StatusCode::BAD_REQUEST.into_response(); + }; + + let engine = JobEngine::new( + state.jobs.clone(), + state.audit.clone(), + state.tenant_locks.clone(), + state.config_locks.clone(), + ); + let job_id = match engine.start_config_apply( + state.clone(), + &principal, + domain, + body.reason, + body.expected_revision, + body.value, + key, + ) { + Ok(id) => id, + Err(StartJobError::TenantLocked) => return StatusCode::CONFLICT.into_response(), + }; + + ( + StatusCode::OK, + Json(serde_json::json!({ "job_id": job_id })), + ) + .into_response() +} + +async fn start_config_rollback( + State(state): State, + headers: HeaderMap, + Extension(principal): Extension, + Json(body): Json, +) -> impl IntoResponse { + if !has_permission(&principal, "control:write") { + return StatusCode::FORBIDDEN.into_response(); + } + + let key = headers + .get(HEADER_IDEMPOTENCY_KEY) + .and_then(|v| v.to_str().ok()) + .ok_or(StatusCode::BAD_REQUEST); + let key = match key { + Ok(k) if !k.is_empty() => k, + _ => return StatusCode::BAD_REQUEST.into_response(), + }; + + let Some(domain) = parse_domain(body.domain.as_str()) else { + return StatusCode::BAD_REQUEST.into_response(); + }; + + let engine = JobEngine::new( + state.jobs.clone(), + state.audit.clone(), + state.tenant_locks.clone(), + state.config_locks.clone(), + ); + let job_id = + match engine.start_config_rollback(state.clone(), &principal, domain, body.reason, key) { + Ok(id) => id, + Err(StartJobError::TenantLocked) => return StatusCode::CONFLICT.into_response(), + }; + + ( + 
StatusCode::OK, + Json(serde_json::json!({ "job_id": job_id })), + ) + .into_response() +} + +#[derive(Debug, Deserialize)] +struct ConfigPlanApplyRequest { + domain: String, + value: serde_json::Value, +} + +async fn plan_config_apply( + State(state): State, + Extension(principal): Extension, + Json(body): Json, +) -> impl IntoResponse { + if !has_permission(&principal, "control:write") { + return StatusCode::FORBIDDEN.into_response(); + } + + let domain = match body.domain.as_str() { + "routing" => ConfigDomain::Routing, + "placement" => ConfigDomain::Placement, + _ => return StatusCode::BAD_REQUEST.into_response(), + }; + let Some(source) = state.config.source(domain) else { + return StatusCode::NOT_FOUND.into_response(); + }; + + // Validate proposed config (schema + semantics). + let validate_res: Result<(), String> = match domain { + ConfigDomain::Routing => { + let cfg = match serde_json::from_value::(body.value.clone()) { + Ok(v) => v, + Err(e) => { + return ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ "error": e.to_string() })), + ) + .into_response(); + } + }; + validate_routing_semantics(&cfg) + } + ConfigDomain::Placement => { + let cfg = + match serde_json::from_value::(body.value.clone()) + { + Ok(v) => v, + Err(e) => { + return ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ "error": e.to_string() })), + ) + .into_response(); + } + }; + validate_placement_semantics(&cfg) + } + }; + if let Err(e) = validate_res { + return ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ "error": e })), + ) + .into_response(); + } + + let (cur_bytes, cur_rev) = match source.load_bytes().await { + Ok(x) => x, + Err(_) => return StatusCode::BAD_GATEWAY.into_response(), + }; + let cur_value = cur_bytes + .as_deref() + .and_then(|b| serde_json::from_slice::(b).ok()) + .unwrap_or(serde_json::Value::Null); + + let before = serde_json::to_string_pretty(&cur_value).unwrap_or_default(); + let after = serde_json::to_string_pretty(&body.value).unwrap_or_default(); + + let changed = cur_value != body.value; + let impacted_services: Vec<&'static str> = match domain { + ConfigDomain::Routing => vec!["gateway"], + ConfigDomain::Placement => vec!["gateway", "control-api"], + }; + + ( + StatusCode::OK, + Json(serde_json::json!({ + "domain": domain.as_str(), + "current_revision": cur_rev, + "changed": changed, + "impacted_services": impacted_services, + "diff": { + "before": before, + "after": after, + } + })), + ) + .into_response() +} + +fn sha256_hex(bytes: &[u8]) -> String { + let mut h = sha2::Sha256::new(); + h.update(bytes); + hex::encode(h.finalize()) +} + +fn validate_routing_semantics(cfg: &RoutingConfig) -> Result<(), String> { + let shard_maps = [ + ("aggregate_shards", &cfg.aggregate_shards), + ("projection_shards", &cfg.projection_shards), + ("runner_shards", &cfg.runner_shards), + ]; + for (name, map) in shard_maps { + for (shard_id, endpoints) in map { + if endpoints.is_empty() { + return Err(format!("{name}[{shard_id}] has no endpoints")); + } + for ep in endpoints { + let u = Url::parse(ep) + .map_err(|e| format!("{name}[{shard_id}] invalid endpoint {ep:?}: {e}"))?; + if u.scheme() != "http" && u.scheme() != "https" { + return Err(format!( + "{name}[{shard_id}] endpoint {ep:?} must be http(s)" + )); + } + if u.host_str().is_none() { + return Err(format!( + "{name}[{shard_id}] endpoint {ep:?} must include host" + )); + } + } + } + } + + let placements = [ + ( + "aggregate_placement", + &cfg.aggregate_placement, + &cfg.aggregate_shards, + ), + ( + 
"projection_placement", + &cfg.projection_placement, + &cfg.projection_shards, + ), + ( + "runner_placement", + &cfg.runner_placement, + &cfg.runner_shards, + ), + ]; + for (pname, pmap, shards) in placements { + for (tenant, shard_id) in pmap { + if shard_id.trim().is_empty() { + return Err(format!("{pname}[{tenant}] shard_id is empty")); + } + if !shards.contains_key(shard_id) { + return Err(format!( + "{pname}[{tenant}] references missing shard_id {shard_id:?}" + )); + } + } + } + Ok(()) +} + +fn validate_placement_semantics(cfg: &crate::placement::PlacementFile) -> Result<(), String> { + let kinds = [ + ("aggregate_placement", cfg.aggregate_placement.as_ref()), + ("projection_placement", cfg.projection_placement.as_ref()), + ("runner_placement", cfg.runner_placement.as_ref()), + ]; + for (kind, k) in kinds { + let Some(k) = k else { continue }; + for p in &k.placements { + if p.targets.is_empty() { + return Err(format!("{kind} tenant {} has no targets", p.tenant_id)); + } + if p.targets.iter().any(|t| t.trim().is_empty()) { + return Err(format!("{kind} tenant {} has empty target", p.tenant_id)); + } + } + } + Ok(()) +} + async fn list_tenants( State(state): State, Extension(principal): Extension, @@ -256,6 +804,7 @@ async fn start_tenant_drain( state.jobs.clone(), state.audit.clone(), state.tenant_locks.clone(), + state.config_locks.clone(), ); let job_id = match engine.start_tenant_drain( state.clone(), @@ -298,6 +847,7 @@ async fn start_tenant_migrate( state.jobs.clone(), state.audit.clone(), state.tenant_locks.clone(), + state.config_locks.clone(), ); let job_id = match engine.start_tenant_migrate( state.clone(), diff --git a/control/api/src/billing.rs b/control/api/src/billing.rs new file mode 100644 index 0000000..eff0102 --- /dev/null +++ b/control/api/src/billing.rs @@ -0,0 +1,904 @@ +use crate::{ + AppState, + auth::{Principal, has_permission}, +}; +use async_trait::async_trait; +use axum::{ + Json, + extract::{Extension, Path, State}, + http::{HeaderMap, StatusCode}, + response::IntoResponse, +}; +use serde::{Deserialize, Serialize}; +use std::time::Duration; +use std::{ + collections::BTreeMap, + fs, + path::PathBuf, + sync::{Arc, RwLock}, + time::SystemTime, +}; +use thiserror::Error; +use uuid::Uuid; + +const HEADER_TENANT_ID: &str = shared::HEADER_X_TENANT_ID; + +fn verify_tenant_isolation(headers: &HeaderMap, path_tenant_id: Uuid) -> Result<(), StatusCode> { + let header_tenant_id = headers + .get(HEADER_TENANT_ID) + .and_then(|v| v.to_str().ok()) + .ok_or(StatusCode::BAD_REQUEST) + .and_then(|s| Uuid::parse_str(s).map_err(|_| StatusCode::BAD_REQUEST))?; + + if header_tenant_id != path_tenant_id { + return Err(StatusCode::FORBIDDEN); + } + Ok(()) +} + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Plan { + Free, + Pro, + Enterprise, +} + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SubscriptionStatus { + Trialing, + Active, + PastDue, + Paused, + Canceled, + Incomplete, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct Entitlements { + pub max_deployments: u32, + pub max_runners: u32, + pub s3_docs_enabled: bool, + pub support_tier: String, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub enum BillingEvent { + SubscriptionCreated { + tenant_id: Uuid, + event_id: String, + provider_customer_id: String, + provider_subscription_id: String, + status: SubscriptionStatus, + plan: Plan, + current_period_end: String, + ts_ms: 
u64, + }, + SubscriptionUpdated { + tenant_id: Uuid, + event_id: String, + status: SubscriptionStatus, + plan: Plan, + current_period_end: String, + cancel_at_period_end: bool, + ts_ms: u64, + }, + SubscriptionDeleted { + tenant_id: Uuid, + event_id: String, + ts_ms: u64, + }, +} + +impl BillingEvent { + pub fn tenant_id(&self) -> Uuid { + match self { + Self::SubscriptionCreated { tenant_id, .. } => *tenant_id, + Self::SubscriptionUpdated { tenant_id, .. } => *tenant_id, + Self::SubscriptionDeleted { tenant_id, .. } => *tenant_id, + } + } + + pub fn event_id(&self) -> &str { + match self { + Self::SubscriptionCreated { event_id, .. } => event_id, + Self::SubscriptionUpdated { event_id, .. } => event_id, + Self::SubscriptionDeleted { event_id, .. } => event_id, + } + } + + pub fn ts_ms(&self) -> u64 { + match self { + Self::SubscriptionCreated { ts_ms, .. } => *ts_ms, + Self::SubscriptionUpdated { ts_ms, .. } => *ts_ms, + Self::SubscriptionDeleted { ts_ms, .. } => *ts_ms, + } + } +} + +impl Entitlements { + pub fn derive(plan: Option<&Plan>, status: Option<&SubscriptionStatus>) -> Self { + let is_active = matches!( + status, + Some(SubscriptionStatus::Trialing | SubscriptionStatus::Active) + ); + + if !is_active { + return Self { + max_deployments: 1, + max_runners: 1, + s3_docs_enabled: false, + support_tier: "community".to_string(), + }; + } + + match plan.unwrap_or(&Plan::Free) { + Plan::Free => Self { + max_deployments: 3, + max_runners: 1, + s3_docs_enabled: false, + support_tier: "community".to_string(), + }, + Plan::Pro => Self { + max_deployments: 10, + max_runners: 5, + s3_docs_enabled: true, + support_tier: "standard".to_string(), + }, + Plan::Enterprise => Self { + max_deployments: 1000, + max_runners: 50, + s3_docs_enabled: true, + support_tier: "priority".to_string(), + }, + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct TenantBillingState { + pub provider: String, + pub provider_customer_id: Option, + pub provider_subscription_id: Option, + pub provider_checkout_session_id: Option, + pub status: Option, + pub plan: Option, + pub current_period_end: Option, + pub cancel_at_period_end: Option, + pub processed_webhook_event_ids: Vec, + pub updated_at: u64, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct BillingStateFile { + pub revision: Option, + pub tenants: BTreeMap, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct BillingResponse { + pub configured: bool, + pub provider: Option, + pub plan: Option, + pub status: Option, + pub current_period_end: Option, + pub cancel_at_period_end: Option, + pub entitlements: Entitlements, +} + +#[derive(Clone)] +pub struct BillingStore { + inner: Arc>, +} + +struct Inner { + path: PathBuf, + last_modified: Option, + cached: Option, +} + +impl BillingStore { + pub fn new(path: PathBuf) -> Self { + Self { + inner: Arc::new(RwLock::new(Inner { + path, + last_modified: None, + cached: None, + })), + } + } + + pub fn get_for_tenant(&self, tenant_id: Uuid) -> BillingResponse { + let mut inner = self.inner.write().expect("billing lock poisoned"); + inner.reload_if_changed(); + + if let Some(state) = inner + .cached + .as_ref() + .and_then(|file| file.tenants.get(&tenant_id)) + { + return BillingResponse { + configured: true, + provider: Some(state.provider.clone()), + plan: state.plan.clone(), + status: state.status.clone(), + current_period_end: state.current_period_end.clone(), + cancel_at_period_end: state.cancel_at_period_end, + entitlements: Entitlements::derive(state.plan.as_ref(), 
state.status.as_ref()), + }; + } + + BillingResponse { + configured: false, + provider: None, + plan: None, + status: None, + current_period_end: None, + cancel_at_period_end: None, + entitlements: Entitlements::derive(None, None), + } + } + + pub fn get_all_tenant_ids(&self) -> Vec { + let mut inner = self.inner.write().expect("billing lock poisoned"); + inner.reload_if_changed(); + + inner + .cached + .as_ref() + .map(|f| f.tenants.keys().cloned().collect()) + .unwrap_or_default() + } + + pub fn get_subscription_id(&self, tenant_id: Uuid) -> Option { + let mut inner = self.inner.write().expect("billing lock poisoned"); + inner.reload_if_changed(); + + inner + .cached + .as_ref() + .and_then(|f| f.tenants.get(&tenant_id)) + .and_then(|s| s.provider_subscription_id.clone()) + } + + pub fn apply_event(&self, event: BillingEvent) -> Result<(), String> { + let mut inner = self.inner.write().expect("billing lock poisoned"); + inner.reload_if_changed(); + + let mut file = inner.cached.clone().unwrap_or(BillingStateFile { + revision: Some("dev".to_string()), + tenants: BTreeMap::new(), + }); + + let tenant_id = event.tenant_id(); + let event_id = event.event_id().to_string(); + let ts_ms = event.ts_ms(); + + let state = file.tenants.entry(tenant_id).or_insert(TenantBillingState { + provider: "unknown".to_string(), // Will be updated by Created event + provider_customer_id: None, + provider_subscription_id: None, + provider_checkout_session_id: None, + status: None, + plan: None, + current_period_end: None, + cancel_at_period_end: None, + processed_webhook_event_ids: vec![], + updated_at: 0, + }); + + // Deduplication + if state.processed_webhook_event_ids.contains(&event_id) { + return Ok(()); + } + + // Monotonicity check + if state.updated_at > ts_ms { + state.processed_webhook_event_ids.push(event_id); + state.processed_webhook_event_ids.truncate(50); + inner.save(file)?; + return Ok(()); + } + + match event { + BillingEvent::SubscriptionCreated { + provider_customer_id, + provider_subscription_id, + status, + plan, + current_period_end, + .. + } => { + state.provider_customer_id = Some(provider_customer_id); + state.provider_subscription_id = Some(provider_subscription_id); + state.status = Some(status); + state.plan = Some(plan); + state.current_period_end = Some(current_period_end); + } + BillingEvent::SubscriptionUpdated { + status, + plan, + current_period_end, + cancel_at_period_end, + .. + } => { + state.status = Some(status); + state.plan = Some(plan); + state.current_period_end = Some(current_period_end); + state.cancel_at_period_end = Some(cancel_at_period_end); + } + BillingEvent::SubscriptionDeleted { .. 
} => { + state.status = Some(SubscriptionStatus::Canceled); + } + } + + state.updated_at = ts_ms; + state.processed_webhook_event_ids.push(event_id); + state.processed_webhook_event_ids.truncate(50); + + inner.save(file)?; + Ok(()) + } + + #[cfg(test)] + pub fn update_tenant_state( + &self, + tenant_id: Uuid, + state: TenantBillingState, + ) -> Result { + let mut inner = self.inner.write().expect("billing lock poisoned"); + inner.reload_if_changed(); + + let mut file = inner.cached.clone().unwrap_or(BillingStateFile { + revision: Some("dev".to_string()), + tenants: BTreeMap::new(), + }); + + file.tenants.insert(tenant_id, state); + inner.save(file) + } +} + +impl Inner { + fn save(&mut self, mut file: BillingStateFile) -> Result { + let revision = format!("rev-{}", Uuid::new_v4()); + file.revision = Some(revision.clone()); + + let raw = serde_json::to_string_pretty(&file).map_err(|e| e.to_string())?; + let tmp = self.path.with_extension("json.tmp"); + fs::write(&tmp, raw).map_err(|e| e.to_string())?; + fs::rename(&tmp, &self.path).map_err(|e| e.to_string())?; + + self.last_modified = None; + self.cached = Some(file); + + Ok(revision) + } + + fn reload_if_changed(&mut self) { + let meta = fs::metadata(&self.path).ok(); + let modified = meta.and_then(|m| m.modified().ok()); + + if self.cached.is_some() && modified.is_some() && modified == self.last_modified { + return; + } + + self.last_modified = modified; + let p = &self.path; + self.cached = fs::read_to_string(p) + .ok() + .and_then(|raw| serde_json::from_str(&raw).ok()); + } +} + +pub async fn get_billing( + State(state): State, + Path(tenant_id): Path, + headers: HeaderMap, + Extension(principal): Extension, +) -> impl IntoResponse { + if !has_permission(&principal, "control:read") { + return StatusCode::FORBIDDEN.into_response(); + } + + if let Err(status) = verify_tenant_isolation(&headers, tenant_id) { + return status.into_response(); + } + + let resp = state.billing.get_for_tenant(tenant_id); + (StatusCode::OK, Json(resp)).into_response() +} + +#[derive(Debug, Deserialize)] +pub struct CheckoutRequest { + pub plan: Plan, + pub return_path: Option, +} + +pub async fn checkout( + State(state): State, + Path(tenant_id): Path, + headers: HeaderMap, + Extension(principal): Extension, + Json(body): Json, +) -> impl IntoResponse { + if !has_permission(&principal, "control:write") { + return StatusCode::FORBIDDEN.into_response(); + } + + if let Err(status) = verify_tenant_isolation(&headers, tenant_id) { + return status.into_response(); + } + + // Check if subscription already exists and is active/trialing + let current = state.billing.get_for_tenant(tenant_id); + if current.configured + && matches!( + current.status, + Some(SubscriptionStatus::Active | SubscriptionStatus::Trialing) + ) + { + return ( + StatusCode::CONFLICT, + Json(serde_json::json!({ "error": "tenant already has an active subscription" })), + ) + .into_response(); + } + + // Construct full return URL + // TODO: Validate return_path against ALLOWED_RETURN_ORIGINS if provided + let return_url = body.return_path.unwrap_or_else(|| "/billing".to_string()); + + match state + .billing_provider + .create_checkout_session(tenant_id, body.plan, return_url) + .await + { + Ok(url) => (StatusCode::OK, Json(serde_json::json!({ "url": url }))).into_response(), + Err(e) => { + let err_msg = e.to_string(); + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(serde_json::json!({ "error": err_msg })), + ) + .into_response() + } + } +} + +pub async fn portal( + State(state): State, + 
Path(tenant_id): Path, + headers: HeaderMap, + Extension(principal): Extension, +) -> impl IntoResponse { + if !has_permission(&principal, "control:write") { + return StatusCode::FORBIDDEN.into_response(); + } + + if let Err(status) = verify_tenant_isolation(&headers, tenant_id) { + return status.into_response(); + } + + let return_url = "/billing".to_string(); + match state + .billing_provider + .create_portal_session(tenant_id, return_url) + .await + { + Ok(url) => (StatusCode::OK, Json(serde_json::json!({ "url": url }))).into_response(), + Err(e) => { + let err_msg = e.to_string(); + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(serde_json::json!({ "error": err_msg })), + ) + .into_response() + } + } +} + +pub async fn webhook( + State(state): State, + Path(_provider): Path, + headers: HeaderMap, + body: axum::body::Bytes, +) -> impl IntoResponse { + // Note: We don't require auth here as this is a public endpoint called by the provider. + // Security is handled via signature verification in the provider trait. + + match state.billing_provider.verify_webhook(&body, &headers).await { + Ok(event) => { + metrics::counter!("billing_webhook_requests_total", "status" => "success").increment(1); + if let Err(e) = state.billing.apply_event(event) { + tracing::error!(error = %e, "failed to apply billing event from webhook"); + return ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(serde_json::json!({ "error": e })), + ) + .into_response(); + } + StatusCode::OK.into_response() + } + Err(e) => { + metrics::counter!("billing_webhook_requests_total", "status" => "error").increment(1); + ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ "error": e.to_string() })), + ) + .into_response() + } + } +} + +pub async fn run_reconciliation_loop(state: AppState) { + let interval_secs = std::env::var("CONTROL_BILLING_RECONCILE_INTERVAL_SECS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(3600); + + tracing::info!(interval_secs, "starting billing reconciliation loop"); + + loop { + tokio::time::sleep(Duration::from_secs(interval_secs)).await; + + tracing::info!("starting billing reconciliation run"); + reconcile_once(&state).await; + + // Update tenant status gauges + // Note: This is an expensive operation if there are many tenants, + // but for reconciliation it's fine once per hour. 
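+        // Example (binary name is a placeholder): overriding the hourly default looks like
+        //   CONTROL_BILLING_RECONCILE_INTERVAL_SECS=600 ./control-api
+        // The sleep above runs before reconcile_once(), so the first pass happens one full
+        // interval after startup; call reconcile_once(&state) up front if an immediate
+        // pass is needed.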
+ update_billing_gauges(&state); + } +} + +pub async fn reconcile_once(state: &AppState) { + let start = std::time::Instant::now(); + + let tenant_ids = state.billing.get_all_tenant_ids(); + let mut success = 0; + let mut error = 0; + let mut skipped = 0; + + for tenant_id in tenant_ids { + let sub_id = state.billing.get_subscription_id(tenant_id); + if let Some(subscription_id) = sub_id { + match state + .billing_provider + .fetch_subscription(tenant_id, &subscription_id) + .await + { + Ok(event) => { + if let Err(e) = state.billing.apply_event(event) { + tracing::error!(?tenant_id, error = %e, "failed to apply reconciled billing event"); + error += 1; + } else { + success += 1; + } + } + Err(e) => { + tracing::error!(?tenant_id, error = %e, "failed to fetch subscription for reconciliation"); + error += 1; + } + } + } else { + skipped += 1; + } + } + + let elapsed = start.elapsed(); + metrics::counter!("billing_reconciliation_runs_total", "result" => "done").increment(1); + metrics::histogram!("billing_reconciliation_duration_ms").record(elapsed.as_millis() as f64); + + tracing::info!( + success, + error, + skipped, + duration_ms = elapsed.as_millis(), + "billing reconciliation run complete" + ); +} + +fn update_billing_gauges(state: &AppState) { + let tenant_ids = state.billing.get_all_tenant_ids(); + let mut counts: BTreeMap<(String, String), u64> = BTreeMap::new(); + + for tenant_id in tenant_ids { + let resp = state.billing.get_for_tenant(tenant_id); + let plan = match resp.plan { + Some(Plan::Free) => "free", + Some(Plan::Pro) => "pro", + Some(Plan::Enterprise) => "enterprise", + None => "none", + } + .to_string(); + + let status = match resp.status { + Some(SubscriptionStatus::Active) => "active", + Some(SubscriptionStatus::Trialing) => "trialing", + Some(SubscriptionStatus::PastDue) => "past_due", + Some(SubscriptionStatus::Paused) => "paused", + Some(SubscriptionStatus::Canceled) => "canceled", + Some(SubscriptionStatus::Incomplete) => "incomplete", + None => "none", + } + .to_string(); + + *counts.entry((plan, status)).or_insert(0) += 1; + } + + for ((plan, status), count) in counts { + metrics::gauge!("billing_tenant_status_count", "plan" => plan, "status" => status) + .set(count as f64); + } +} + +#[derive(Debug, Error)] +pub enum BillingError { + #[error("provider error: {0}")] + Provider(String), + #[error("invalid configuration: {0}")] + Config(String), +} + +#[async_trait] +pub trait BillingProvider: Send + Sync { + async fn create_checkout_session( + &self, + tenant_id: Uuid, + plan: Plan, + return_url: String, + ) -> Result; + + async fn create_portal_session( + &self, + tenant_id: Uuid, + return_url: String, + ) -> Result; + + async fn verify_webhook( + &self, + payload: &[u8], + headers: &HeaderMap, + ) -> Result; + + async fn fetch_subscription( + &self, + tenant_id: Uuid, + subscription_id: &str, + ) -> Result; +} + +pub struct StripeProvider { + pub secret_key: String, + pub price_pro: String, + pub price_enterprise: String, +} + +#[async_trait] +impl BillingProvider for StripeProvider { + async fn create_checkout_session( + &self, + tenant_id: Uuid, + plan: Plan, + _return_url: String, + ) -> Result { + let _price = match plan { + Plan::Pro => &self.price_pro, + Plan::Enterprise => &self.price_enterprise, + Plan::Free => { + return Err(BillingError::Config( + "Free plan has no checkout".to_string(), + )); + } + }; + + // TODO: Actually call Stripe API + // For now, returning a simulated Stripe checkout URL + Ok(format!( + 
"https://checkout.stripe.com/pay/cs_test_{}?tenant_id={}", + Uuid::new_v4(), + tenant_id + )) + } + + async fn create_portal_session( + &self, + tenant_id: Uuid, + _return_url: String, + ) -> Result { + // TODO: Actually call Stripe API + Ok(format!( + "https://billing.stripe.com/p/session/ps_test_{}?tenant_id={}", + Uuid::new_v4(), + tenant_id + )) + } + + async fn verify_webhook( + &self, + _payload: &[u8], + _headers: &HeaderMap, + ) -> Result { + // TODO: Implement real Stripe signature verification + Err(BillingError::Provider("Not implemented".to_string())) + } + + async fn fetch_subscription( + &self, + _tenant_id: Uuid, + _subscription_id: &str, + ) -> Result { + // TODO: Actually call Stripe API with timeout + // let client = reqwest::Client::builder().timeout(Duration::from_secs(10)).build()... + Err(BillingError::Provider("Not implemented".to_string())) + } +} + +pub struct MockProvider; + +#[async_trait] +impl BillingProvider for MockProvider { + async fn create_checkout_session( + &self, + tenant_id: Uuid, + _plan: Plan, + _return_url: String, + ) -> Result { + Ok(format!("https://mock.stripe.com/checkout/{}", tenant_id)) + } + + async fn create_portal_session( + &self, + tenant_id: Uuid, + _return_url: String, + ) -> Result { + Ok(format!("https://mock.stripe.com/portal/{}", tenant_id)) + } + + async fn verify_webhook( + &self, + payload: &[u8], + _headers: &HeaderMap, + ) -> Result { + // Mock implementation: just parse the payload as a BillingEvent + serde_json::from_slice(payload).map_err(|e| BillingError::Provider(e.to_string())) + } + + async fn fetch_subscription( + &self, + tenant_id: Uuid, + _subscription_id: &str, + ) -> Result { + // Mock implementation: return a SubscriptionUpdated event with current state + // In a real mock we might want to store expectations, but for now we just return something plausible. 
+ Ok(BillingEvent::SubscriptionUpdated { + tenant_id, + event_id: format!("reconcile-{}", Uuid::new_v4()), + status: SubscriptionStatus::Active, + plan: Plan::Pro, + current_period_end: "2099-12-31T23:59:59Z".to_string(), + cancel_at_period_end: false, + ts_ms: SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis() as u64, + }) + } +} + +impl MockProvider { + pub fn get_checkout_url(tenant: Uuid) -> String { + format!("https://mock.stripe.com/checkout/{}", tenant) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::env::temp_dir; + + #[test] + fn test_entitlement_derivation() { + let e = Entitlements::derive(Some(&Plan::Free), Some(&SubscriptionStatus::PastDue)); + assert_eq!(e.max_deployments, 1); + + let e = Entitlements::derive(Some(&Plan::Pro), Some(&SubscriptionStatus::Active)); + assert_eq!(e.max_deployments, 10); + assert!(e.s3_docs_enabled); + + let e = Entitlements::derive(Some(&Plan::Enterprise), Some(&SubscriptionStatus::Trialing)); + assert_eq!(e.max_deployments, 1000); + } + + #[test] + fn test_billing_state_roundtrip() { + let mut path = temp_dir(); + path.push(format!("billing-{}.json", Uuid::new_v4())); + + let store = BillingStore::new(path.clone()); + let tenant_id = Uuid::new_v4(); + + let resp = store.get_for_tenant(tenant_id); + assert!(!resp.configured); + assert_eq!(resp.entitlements.max_deployments, 1); + + let state = TenantBillingState { + provider: "mock".to_string(), + provider_customer_id: None, + provider_subscription_id: None, + provider_checkout_session_id: None, + status: Some(SubscriptionStatus::Active), + plan: Some(Plan::Pro), + current_period_end: None, + cancel_at_period_end: Some(false), + processed_webhook_event_ids: vec![], + updated_at: 0, + }; + + store.update_tenant_state(tenant_id, state).unwrap(); + + let resp2 = store.get_for_tenant(tenant_id); + assert!(resp2.configured); + assert_eq!(resp2.provider.as_deref(), Some("mock")); + assert_eq!(resp2.plan, Some(Plan::Pro)); + assert_eq!(resp2.entitlements.max_deployments, 10); + + let _ = fs::remove_file(path); + } + + #[tokio::test] + async fn test_reconciliation_corrects_state() { + let mut path = temp_dir(); + path.push(format!("billing-reconcile-{}.json", Uuid::new_v4())); + let store = BillingStore::new(path.clone()); + let tenant_id = Uuid::new_v4(); + + // 1. 
Initial state: PastDue + store + .update_tenant_state( + tenant_id, + TenantBillingState { + provider: "mock".to_string(), + provider_customer_id: Some("cus_1".to_string()), + provider_subscription_id: Some("sub_1".to_string()), + provider_checkout_session_id: None, + status: Some(SubscriptionStatus::PastDue), + plan: Some(Plan::Pro), + current_period_end: None, + cancel_at_period_end: Some(false), + processed_webhook_event_ids: vec![], + updated_at: 100, + }, + ) + .unwrap(); + + let state = AppState { + prometheus: crate::get_test_prometheus_handle(), + auth: crate::AuthConfig { hs256_secret: None }, + jobs: crate::jobs::JobStore::default(), + audit: crate::AuditStore::default(), + tenant_locks: crate::job_engine::TenantLocks::default(), + config_locks: crate::job_engine::ConfigLocks::default(), + http: reqwest::Client::new(), + placement: crate::placement::PlacementStore::new(temp_dir().join("placement.json")), + billing: store.clone(), + billing_provider: Arc::new(MockProvider), + billing_enforcement_enabled: true, + config: crate::config_registry::ConfigRegistry::new(None, None), + fleet_services: vec![], + swarm: crate::swarm::SwarmStore::new(temp_dir().join("swarm.json")), + docs: None, + }; + + // 2. Run reconciliation. MockProvider returns Active status. + reconcile_once(&state).await; + + // 3. Verify state is now Active + let resp = store.get_for_tenant(tenant_id); + assert_eq!(resp.status, Some(SubscriptionStatus::Active)); + + let _ = fs::remove_file(path); + } +} diff --git a/control/api/src/config_registry.rs b/control/api/src/config_registry.rs new file mode 100644 index 0000000..b7d17f2 --- /dev/null +++ b/control/api/src/config_registry.rs @@ -0,0 +1,323 @@ +use async_trait::async_trait; +use futures::StreamExt; +use serde::{Deserialize, Serialize}; +use std::{path::PathBuf, sync::Arc, time::Duration}; +use thiserror::Error; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ConfigDomain { + Routing, + Placement, +} + +impl ConfigDomain { + pub fn as_str(&self) -> &'static str { + match self { + ConfigDomain::Routing => "routing", + ConfigDomain::Placement => "placement", + } + } +} + +#[derive(Debug, Error)] +pub enum ConfigRegistryError { + #[error("source error: {0}")] + Source(String), + #[error("decode error: {0}")] + Decode(String), + #[error("domain not configured")] + NotConfigured, +} + +#[derive(Debug, Clone, Serialize)] +pub struct ConfigSnapshot { + pub domain: String, + pub revision: u64, + pub value: T, + pub source: ConfigSourceInfo, +} + +#[derive(Debug, Clone, Serialize)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum ConfigSourceInfo { + File { path: String }, + NatsKv { bucket: String, key: String }, + Fixed, +} + +#[async_trait] +pub trait ConfigSource: Send + Sync { + async fn load_bytes(&self) -> Result<(Option>, u64), ConfigRegistryError>; + async fn put_bytes( + &self, + expected_revision: Option, + value: Vec, + ) -> Result; + async fn history_bytes(&self, limit: usize) + -> Result)>, ConfigRegistryError>; + async fn watch( + &self, + ) -> Result< + std::pin::Pin> + Send>>, + ConfigRegistryError, + >; + fn info(&self) -> ConfigSourceInfo; +} + +#[derive(Clone)] +pub struct FixedSource { + bytes: Arc>, +} + +impl FixedSource { + pub fn new(bytes: Vec) -> Self { + Self { + bytes: Arc::new(bytes), + } + } +} + +#[async_trait] +impl ConfigSource for FixedSource { + async fn load_bytes(&self) -> Result<(Option>, u64), ConfigRegistryError> { + 
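+        // A fixed source always returns the same in-memory bytes and reports a constant
+        // revision of 1; the put_bytes and history_bytes implementations below reject
+        // writes and history lookups as unsupported.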
Ok((Some(self.bytes.as_ref().clone()), 1)) + } + + async fn put_bytes( + &self, + _expected_revision: Option, + _value: Vec, + ) -> Result { + Err(ConfigRegistryError::Source( + "fixed source is read-only".to_string(), + )) + } + + async fn history_bytes( + &self, + _limit: usize, + ) -> Result)>, ConfigRegistryError> { + Err(ConfigRegistryError::Source( + "fixed source has no history".to_string(), + )) + } + + async fn watch( + &self, + ) -> Result< + std::pin::Pin> + Send>>, + ConfigRegistryError, + > { + Ok(Box::pin(futures::stream::empty())) + } + + fn info(&self) -> ConfigSourceInfo { + ConfigSourceInfo::Fixed + } +} + +#[derive(Clone)] +pub struct FileSource { + path: PathBuf, +} + +impl FileSource { + pub fn new(path: PathBuf) -> Self { + Self { path } + } +} + +#[async_trait] +impl ConfigSource for FileSource { + async fn load_bytes(&self) -> Result<(Option>, u64), ConfigRegistryError> { + let raw = tokio::fs::read(&self.path) + .await + .map_err(|e| ConfigRegistryError::Source(e.to_string()))?; + Ok((Some(raw), 0)) + } + + async fn put_bytes( + &self, + _expected_revision: Option, + value: Vec, + ) -> Result { + let tmp = self.path.with_extension("tmp"); + tokio::fs::write(&tmp, &value) + .await + .map_err(|e| ConfigRegistryError::Source(e.to_string()))?; + tokio::fs::rename(&tmp, &self.path) + .await + .map_err(|e| ConfigRegistryError::Source(e.to_string()))?; + Ok(0) + } + + async fn history_bytes( + &self, + _limit: usize, + ) -> Result)>, ConfigRegistryError> { + Err(ConfigRegistryError::Source( + "file source has no history".to_string(), + )) + } + + async fn watch( + &self, + ) -> Result< + std::pin::Pin> + Send>>, + ConfigRegistryError, + > { + Ok(Box::pin(futures::stream::empty())) + } + + fn info(&self) -> ConfigSourceInfo { + ConfigSourceInfo::File { + path: self.path.to_string_lossy().to_string(), + } + } +} + +#[derive(Clone)] +pub struct NatsKvSource { + kv: async_nats::jetstream::kv::Store, + bucket: String, + key: String, +} + +impl NatsKvSource { + pub async fn connect( + nats_url: impl Into, + bucket: impl Into, + key: impl Into, + ) -> Result { + let nats_url = nats_url.into(); + let bucket = bucket.into(); + let key = key.into(); + + let client = tokio::time::timeout(Duration::from_secs(2), async_nats::connect(nats_url)) + .await + .map_err(|_| ConfigRegistryError::Source("connect timeout".to_string()))? 
+ .map_err(|e| ConfigRegistryError::Source(e.to_string()))?; + let jetstream = async_nats::jetstream::new(client); + let kv = match jetstream.get_key_value(&bucket).await { + Ok(kv) => kv, + Err(_) => jetstream + .create_key_value(async_nats::jetstream::kv::Config { + bucket: bucket.clone(), + ..Default::default() + }) + .await + .map_err(|e| ConfigRegistryError::Source(e.to_string()))?, + }; + + Ok(Self { kv, bucket, key }) + } +} + +#[async_trait] +impl ConfigSource for NatsKvSource { + async fn load_bytes(&self) -> Result<(Option>, u64), ConfigRegistryError> { + let entry = self + .kv + .entry(&self.key) + .await + .map_err(|e| ConfigRegistryError::Source(e.to_string()))?; + Ok(match entry { + Some(e) => (Some(e.value.to_vec()), e.revision), + None => (None, 0), + }) + } + + async fn put_bytes( + &self, + expected_revision: Option, + value: Vec, + ) -> Result { + let rev = match expected_revision { + Some(expected) if expected > 0 => self + .kv + .update(&self.key, value.into(), expected) + .await + .map_err(|e| ConfigRegistryError::Source(e.to_string()))?, + _ => self + .kv + .put(&self.key, value.into()) + .await + .map_err(|e| ConfigRegistryError::Source(e.to_string()))?, + }; + Ok(rev) + } + + async fn history_bytes( + &self, + limit: usize, + ) -> Result)>, ConfigRegistryError> { + let mut stream = self + .kv + .history(&self.key) + .await + .map_err(|e| ConfigRegistryError::Source(e.to_string()))?; + let mut out = Vec::new(); + while let Some(item) = stream.next().await { + let entry = item.map_err(|e| ConfigRegistryError::Source(e.to_string()))?; + out.push((entry.revision, entry.value.to_vec())); + if out.len() >= limit { + break; + } + } + Ok(out) + } + + async fn watch( + &self, + ) -> Result< + std::pin::Pin> + Send>>, + ConfigRegistryError, + > { + let key = self.key.clone(); + let watch = self + .kv + .watch(&key) + .await + .map_err(|e| ConfigRegistryError::Source(e.to_string()))?; + Ok(Box::pin(watch.filter_map(|entry| async move { + match entry { + Ok(entry) => match entry.operation { + async_nats::jetstream::kv::Operation::Put => Some(Ok(())), + async_nats::jetstream::kv::Operation::Delete + | async_nats::jetstream::kv::Operation::Purge => None, + }, + Err(e) => Some(Err(ConfigRegistryError::Source(e.to_string()))), + } + }))) + } + + fn info(&self) -> ConfigSourceInfo { + ConfigSourceInfo::NatsKv { + bucket: self.bucket.clone(), + key: self.key.clone(), + } + } +} + +#[derive(Clone)] +pub struct ConfigRegistry { + routing: Option>, + placement: Option>, +} + +impl ConfigRegistry { + pub fn new( + routing: Option>, + placement: Option>, + ) -> Self { + Self { routing, placement } + } + + pub fn source(&self, domain: ConfigDomain) -> Option> { + match domain { + ConfigDomain::Routing => self.routing.clone(), + ConfigDomain::Placement => self.placement.clone(), + } + } +} diff --git a/control/api/src/config_schemas.rs b/control/api/src/config_schemas.rs new file mode 100644 index 0000000..77b0a42 --- /dev/null +++ b/control/api/src/config_schemas.rs @@ -0,0 +1,15 @@ +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct RoutingConfig { + pub revision: u64, + + pub aggregate_placement: HashMap, + pub projection_placement: HashMap, + pub runner_placement: HashMap, + + pub aggregate_shards: HashMap>, + pub projection_shards: HashMap>, + pub runner_shards: HashMap>, +} diff --git a/control/api/src/documents.rs b/control/api/src/documents.rs new file mode 100644 index 0000000..3bcf5bb 
--- /dev/null +++ b/control/api/src/documents.rs @@ -0,0 +1,353 @@ +use crate::auth::{Principal, has_permission}; +use crate::{AppState, RequestIds}; +use axum::{ + Router, + body::Bytes, + extract::{Extension, Path, Query, State}, + http::{HeaderMap, StatusCode, header}, + response::IntoResponse, + routing::{get, post, put}, +}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +const HEADER_TENANT_ID: &str = shared::HEADER_X_TENANT_ID; + +pub fn router() -> Router { + Router::new() + .route("/tenants/{tenant_id}/docs", get(list_docs)) + .route( + "/tenants/{tenant_id}/docs/{doc_type}/{doc_id}/{filename}", + put(upload_doc), + ) + .route( + "/tenants/{tenant_id}/docs/object/{*key}", + get(get_doc).delete(delete_doc), + ) + .route( + "/tenants/{tenant_id}/docs/presign/upload", + post(presign_upload), + ) + .route( + "/tenants/{tenant_id}/docs/presign/download", + post(presign_download), + ) +} + +fn ensure_tenant_header(headers: &HeaderMap, tenant_id: Uuid) -> Result<(), StatusCode> { + let header_tid = headers + .get(HEADER_TENANT_ID) + .and_then(|v| v.to_str().ok()) + .ok_or(StatusCode::BAD_REQUEST)?; + let header_tid = Uuid::parse_str(header_tid).map_err(|_| StatusCode::BAD_REQUEST)?; + if header_tid != tenant_id { + return Err(StatusCode::FORBIDDEN); + } + Ok(()) +} + +fn ensure_docs_enabled(state: &AppState, tenant_id: Uuid) -> Result<(), StatusCode> { + if !state.billing_enforcement_enabled { + return Ok(()); + } + + let entitlements = state.billing.get_for_tenant(tenant_id).entitlements; + if !entitlements.s3_docs_enabled { + return Err(StatusCode::PAYMENT_REQUIRED); + } + Ok(()) +} + +#[derive(Debug, Deserialize)] +struct ListQuery { + prefix: Option, +} + +#[derive(Debug, Serialize)] +struct ListResponse { + objects: Vec, +} + +async fn list_docs( + State(state): State, + headers: HeaderMap, + Path(tenant_id): Path, + Query(q): Query, + Extension(principal): Extension, +) -> impl IntoResponse { + if !has_permission(&principal, "control:read") { + return StatusCode::FORBIDDEN.into_response(); + } + if let Err(s) = ensure_tenant_header(&headers, tenant_id) { + return s.into_response(); + } + if let Err(s) = ensure_docs_enabled(&state, tenant_id) { + return s.into_response(); + } + let store = match state.docs.as_ref() { + Some(s) => s, + None => return StatusCode::SERVICE_UNAVAILABLE.into_response(), + }; + let prefix = q.prefix.unwrap_or_default(); + let prefix = prefix.trim(); + if prefix.contains("..") { + return StatusCode::BAD_REQUEST.into_response(); + } + let base = format!("{}{}", store_prefix(store), tenant_id); + let prefix = if prefix.is_empty() { + format!("{base}/") + } else { + format!("{base}/{prefix}") + }; + match store.list_for_tenant(&tenant_id.to_string(), &prefix).await { + Ok(objects) => (StatusCode::OK, axum::Json(ListResponse { objects })).into_response(), + Err(_) => StatusCode::BAD_GATEWAY.into_response(), + } +} + +async fn upload_doc( + State(state): State, + headers: HeaderMap, + Path((tenant_id, doc_type, doc_id, filename)): Path<(Uuid, String, String, String)>, + Extension(principal): Extension, + Extension(request_ids): Extension, + body: Bytes, +) -> impl IntoResponse { + if !has_permission(&principal, "control:write") { + return StatusCode::FORBIDDEN.into_response(); + } + if let Err(s) = ensure_tenant_header(&headers, tenant_id) { + return s.into_response(); + } + if let Err(s) = ensure_docs_enabled(&state, tenant_id) { + return s.into_response(); + } + let store = match state.docs.as_ref() { + Some(s) => s, + None => return 
StatusCode::SERVICE_UNAVAILABLE.into_response(), + }; + + let ct = headers + .get(header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + let key = match store.key_for(&tenant_id.to_string(), &doc_type, &doc_id, &filename) { + Ok(k) => k, + Err(_) => return StatusCode::BAD_REQUEST.into_response(), + }; + + let bytes = body.to_vec(); + let hash = crate::s3_docs::DocsStore::content_hash_sha256_hex(&bytes); + if let Err(e) = store + .put_for_tenant(&tenant_id.to_string(), &key, bytes, ct) + .await + { + tracing::warn!( + request_id = %request_ids.request_id, + correlation_id = ?request_ids.correlation_id, + error = %e, + "docs upload failed" + ); + return StatusCode::BAD_GATEWAY.into_response(); + } + + ( + StatusCode::OK, + axum::Json(serde_json::json!({ + "key": key, + "sha256": hash, + })), + ) + .into_response() +} + +async fn get_doc( + State(state): State, + headers: HeaderMap, + Path((tenant_id, key)): Path<(Uuid, String)>, + Extension(principal): Extension, +) -> impl IntoResponse { + if !has_permission(&principal, "control:read") { + return StatusCode::FORBIDDEN.into_response(); + } + if let Err(s) = ensure_tenant_header(&headers, tenant_id) { + return s.into_response(); + } + if let Err(s) = ensure_docs_enabled(&state, tenant_id) { + return s.into_response(); + } + let store = match state.docs.as_ref() { + Some(s) => s, + None => return StatusCode::SERVICE_UNAVAILABLE.into_response(), + }; + + let base = format!("{}{}", store_prefix(store), tenant_id); + if !key.starts_with(&base) { + return StatusCode::FORBIDDEN.into_response(); + } + + match store + .get_bytes_for_tenant(&tenant_id.to_string(), &key) + .await + { + Ok((bytes, ct)) => { + let mut res = axum::response::Response::new(axum::body::Body::from(bytes)); + *res.status_mut() = StatusCode::OK; + if let Some(ct) = ct + && let Ok(v) = axum::http::HeaderValue::from_str(&ct) + { + res.headers_mut().insert(header::CONTENT_TYPE, v); + } + res + } + Err(_) => StatusCode::NOT_FOUND.into_response(), + } +} + +async fn delete_doc( + State(state): State, + headers: HeaderMap, + Path((tenant_id, key)): Path<(Uuid, String)>, + Extension(principal): Extension, +) -> impl IntoResponse { + if !has_permission(&principal, "control:write") { + return StatusCode::FORBIDDEN.into_response(); + } + if let Err(s) = ensure_tenant_header(&headers, tenant_id) { + return s.into_response(); + } + if let Err(s) = ensure_docs_enabled(&state, tenant_id) { + return s.into_response(); + } + let store = match state.docs.as_ref() { + Some(s) => s, + None => return StatusCode::SERVICE_UNAVAILABLE.into_response(), + }; + + let base = format!("{}{}", store_prefix(store), tenant_id); + if !key.starts_with(&base) { + return StatusCode::FORBIDDEN.into_response(); + } + + match store.delete_for_tenant(&tenant_id.to_string(), &key).await { + Ok(_) => StatusCode::NO_CONTENT.into_response(), + Err(_) => StatusCode::BAD_GATEWAY.into_response(), + } +} + +#[derive(Debug, Deserialize)] +struct PresignUploadRequest { + doc_type: String, + doc_id: Option, + filename: String, + content_type: Option, +} + +async fn presign_upload( + State(state): State, + headers: HeaderMap, + Path(tenant_id): Path, + Extension(principal): Extension, + axum::Json(body): axum::Json, +) -> impl IntoResponse { + if !has_permission(&principal, "control:write") { + return StatusCode::FORBIDDEN.into_response(); + } + if let Err(s) = ensure_tenant_header(&headers, tenant_id) { + return s.into_response(); + } + if let Err(s) = ensure_docs_enabled(&state, tenant_id) { + 
return s.into_response(); + } + let store = match state.docs.as_ref() { + Some(s) => s, + None => return StatusCode::SERVICE_UNAVAILABLE.into_response(), + }; + + let doc_id = body.doc_id.unwrap_or_else(|| Uuid::new_v4().to_string()); + let key = match store.key_for( + &tenant_id.to_string(), + &body.doc_type, + &doc_id, + &body.filename, + ) { + Ok(k) => k, + Err(_) => return StatusCode::BAD_REQUEST.into_response(), + }; + + match store + .presign_put_for_tenant( + &tenant_id.to_string(), + &key, + body.content_type, + std::time::Duration::from_secs(300), + ) + .await + { + Ok(url) => ( + StatusCode::OK, + axum::Json(serde_json::json!({ + "method": "PUT", + "url": url, + "key": key, + })), + ) + .into_response(), + Err(_) => StatusCode::BAD_GATEWAY.into_response(), + } +} + +#[derive(Debug, Deserialize)] +struct PresignDownloadRequest { + key: String, +} + +async fn presign_download( + State(state): State, + headers: HeaderMap, + Path(tenant_id): Path, + Extension(principal): Extension, + axum::Json(body): axum::Json, +) -> impl IntoResponse { + if !has_permission(&principal, "control:read") { + return StatusCode::FORBIDDEN.into_response(); + } + if let Err(s) = ensure_tenant_header(&headers, tenant_id) { + return s.into_response(); + } + if let Err(s) = ensure_docs_enabled(&state, tenant_id) { + return s.into_response(); + } + let store = match state.docs.as_ref() { + Some(s) => s, + None => return StatusCode::SERVICE_UNAVAILABLE.into_response(), + }; + let base = format!("{}{}", store_prefix(store), tenant_id); + if !body.key.starts_with(&base) { + return StatusCode::FORBIDDEN.into_response(); + } + match store + .presign_get_for_tenant( + &tenant_id.to_string(), + &body.key, + std::time::Duration::from_secs(300), + ) + .await + { + Ok(url) => ( + StatusCode::OK, + axum::Json(serde_json::json!({ + "method": "GET", + "url": url, + "key": body.key, + })), + ) + .into_response(), + Err(_) => StatusCode::BAD_GATEWAY.into_response(), + } +} + +fn store_prefix(store: &crate::s3_docs::DocsStore) -> &str { + store.prefix() +} diff --git a/control/api/src/drift.rs b/control/api/src/drift.rs new file mode 100644 index 0000000..50634a3 --- /dev/null +++ b/control/api/src/drift.rs @@ -0,0 +1,127 @@ +use crate::{AppState, build_info::extract_build_info, fleet, swarm::SwarmService}; +use serde::Serialize; +use std::collections::{BTreeMap, BTreeSet}; + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum DriftKind { + Missing, + Extra, + Unhealthy, + VersionMismatch, +} + +#[derive(Debug, Clone, Serialize)] +pub struct DriftItem { + pub kind: DriftKind, + pub service: String, + pub details: serde_json::Value, +} + +#[derive(Debug, Clone, Serialize)] +pub struct DriftResponse { + pub summary: BTreeMap, + pub items: Vec, +} + +pub async fn compute(state: &AppState) -> DriftResponse { + let mut items: Vec = Vec::new(); + + // Desired service set: what the Control API was configured to observe. + // (In production, this should evolve into "desired stacks + required services".) + let desired: BTreeSet = state + .fleet_services + .iter() + .map(|s| s.name.clone()) + .collect(); + + // Observed service set: what Swarm reports (dev: from file snapshot). 
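+    // The two sets are compared in both directions further down: desired-but-not-observed
+    // becomes DriftKind::Missing, observed-but-not-desired becomes DriftKind::Extra,
+    // failed health/readiness probes become DriftKind::Unhealthy, and a service whose
+    // /metrics exposes more than one build_info version becomes DriftKind::VersionMismatch.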
+ let observed_services: Vec = state.swarm.list_services(); + let observed: BTreeSet = observed_services.iter().map(|s| s.name.clone()).collect(); + + for missing in desired.difference(&observed) { + items.push(DriftItem { + kind: DriftKind::Missing, + service: missing.clone(), + details: serde_json::json!({ "expected": true }), + }); + } + + for extra in observed.difference(&desired) { + items.push(DriftItem { + kind: DriftKind::Extra, + service: extra.clone(), + details: serde_json::json!({ "observed": true }), + }); + } + + // Health drift: based on fleet snapshot. + let snapshots = fleet::snapshot(&state.http, &state.fleet_services).await; + for s in snapshots { + if !s.health_ok || !s.ready_ok { + items.push(DriftItem { + kind: DriftKind::Unhealthy, + service: s.name.clone(), + details: serde_json::json!({ + "health_ok": s.health_ok, + "ready_ok": s.ready_ok, + "metrics_ok": s.metrics_ok, + "base_url": s.base_url, + }), + }); + } + } + + // Version drift: compare build_info between services when present. + // Desired is not yet explicit; for now we flag when multiple versions exist for same service. + let mut versions_by_service: BTreeMap> = BTreeMap::new(); + let snapshots = fleet::snapshot(&state.http, &state.fleet_services).await; + for s in snapshots { + if let Ok(metrics) = state + .http + .get(format!("{}/metrics", s.base_url)) + .send() + .await + && let Ok(body) = metrics.text().await + { + for bi in extract_build_info(&body) { + versions_by_service + .entry(bi.service.clone()) + .or_default() + .insert(format!("{}@{}", bi.version, bi.git_sha)); + } + } + } + for (svc, vs) in versions_by_service { + if vs.len() > 1 { + items.push(DriftItem { + kind: DriftKind::VersionMismatch, + service: svc, + details: serde_json::json!({ "seen": vs.into_iter().collect::>() }), + }); + } + } + + fn ord(k: &DriftKind) -> u8 { + match k { + DriftKind::Missing => 0, + DriftKind::Extra => 1, + DriftKind::Unhealthy => 2, + DriftKind::VersionMismatch => 3, + } + } + items.sort_by(|a, b| (ord(&a.kind), &a.service).cmp(&(ord(&b.kind), &b.service))); + + let mut summary: BTreeMap = BTreeMap::new(); + for item in &items { + let k = match item.kind { + DriftKind::Missing => "missing", + DriftKind::Extra => "extra", + DriftKind::Unhealthy => "unhealthy", + DriftKind::VersionMismatch => "version_mismatch", + }; + *summary.entry(k.to_string()).or_insert(0) += 1; + } + + DriftResponse { summary, items } +} diff --git a/control/api/src/job_engine.rs b/control/api/src/job_engine.rs index c2b4e10..448a632 100644 --- a/control/api/src/job_engine.rs +++ b/control/api/src/job_engine.rs @@ -1,14 +1,19 @@ use crate::{ AppState, Principal, audit::{AuditEvent, AuditStore}, + config_registry::{ConfigDomain, ConfigRegistryError}, + config_schemas::RoutingConfig, fleet, jobs::{Job, JobStatus, JobStep, JobStore}, + placement::PlacementFile, }; use std::{ collections::HashMap, + path::PathBuf, sync::{Arc, Mutex}, time::{Duration, SystemTime, UNIX_EPOCH}, }; +use url::Url; use uuid::Uuid; #[derive(Clone, Default)] @@ -34,20 +39,52 @@ impl TenantLocks { } } +#[derive(Clone, Default)] +pub struct ConfigLocks { + inner: Arc>>, +} + +impl ConfigLocks { + pub fn try_lock(&self, domain: ConfigDomain, job_id: Uuid) -> bool { + let mut map = self.inner.lock().expect("config locks poisoned"); + let k = domain.as_str().to_string(); + if map.contains_key(&k) { + return false; + } + map.insert(k, job_id); + true + } + + pub fn unlock(&self, domain: ConfigDomain, job_id: Uuid) { + let mut map = self.inner.lock().expect("config locks 
poisoned"); + let k = domain.as_str().to_string(); + if map.get(&k).copied() == Some(job_id) { + map.remove(&k); + } + } +} + #[derive(Clone)] pub struct JobEngine { pub jobs: JobStore, pub audit: AuditStore, pub tenant_locks: TenantLocks, + pub config_locks: ConfigLocks, pub step_timeout: Duration, } impl JobEngine { - pub fn new(jobs: JobStore, audit: AuditStore, tenant_locks: TenantLocks) -> Self { + pub fn new( + jobs: JobStore, + audit: AuditStore, + tenant_locks: TenantLocks, + config_locks: ConfigLocks, + ) -> Self { Self { jobs, audit, tenant_locks, + config_locks, step_timeout: Duration::from_millis(500), } } @@ -93,7 +130,7 @@ impl JobEngine { let engine = self.clone(); tokio::spawn(async move { engine - .run_job(state, inserted, Some(tenant_id), RunSpec::Drain) + .run_job(state, inserted, Some(tenant_id), None, RunSpec::Drain) .await; }); @@ -152,6 +189,7 @@ impl JobEngine { state, inserted, Some(tenant_id), + None, RunSpec::Migrate { runner_target }, ) .await; @@ -160,7 +198,238 @@ impl JobEngine { Ok(inserted) } - async fn run_job(&self, state: AppState, job_id: Uuid, tenant_id: Option, spec: RunSpec) { + #[allow(clippy::too_many_arguments)] + pub fn start_config_apply( + &self, + state: AppState, + principal: &Principal, + domain: ConfigDomain, + reason: String, + expected_revision: Option, + value: serde_json::Value, + idempotency_key: &str, + ) -> Result { + if let Some(existing) = self.jobs.get_idempotent(idempotency_key) { + return Ok(existing); + } + + let job_id = Uuid::new_v4(); + if !self.config_locks.try_lock(domain, job_id) { + return Err(StartJobError::TenantLocked); + } + + let now = now_ms(); + let job = Job { + job_id, + status: JobStatus::Pending, + steps: vec![ + step("preflight"), + step("validate_config"), + step("backup_config"), + step("apply_config"), + step("reload_config"), + step("verify_config"), + ], + error: None, + created_at_ms: now, + started_at_ms: None, + finished_at_ms: None, + }; + + let inserted = self.jobs.insert_idempotent(idempotency_key, job); + self.audit.record(AuditEvent { + ts_ms: now, + principal_sub: principal.sub.clone(), + action: format!("config.{}.apply", domain.as_str()), + tenant_id: None, + reason, + job_id: Some(inserted), + }); + + let engine = self.clone(); + tokio::spawn(async move { + engine + .run_job( + state, + inserted, + None, + Some(domain), + RunSpec::ConfigApply { + domain, + expected_revision, + value, + }, + ) + .await; + }); + + Ok(inserted) + } + + pub fn start_config_validate( + &self, + state: AppState, + principal: &Principal, + domain: ConfigDomain, + reason: String, + value: serde_json::Value, + idempotency_key: &str, + ) -> Result { + if let Some(existing) = self.jobs.get_idempotent(idempotency_key) { + return Ok(existing); + } + + let job_id = Uuid::new_v4(); + if !self.config_locks.try_lock(domain, job_id) { + return Err(StartJobError::TenantLocked); + } + + let now = now_ms(); + let job = Job { + job_id, + status: JobStatus::Pending, + steps: vec![step("validate_config")], + error: None, + created_at_ms: now, + started_at_ms: None, + finished_at_ms: None, + }; + + let inserted = self.jobs.insert_idempotent(idempotency_key, job); + self.audit.record(AuditEvent { + ts_ms: now, + principal_sub: principal.sub.clone(), + action: format!("config.{}.validate", domain.as_str()), + tenant_id: None, + reason, + job_id: Some(inserted), + }); + + let engine = self.clone(); + tokio::spawn(async move { + engine + .run_job( + state, + inserted, + None, + Some(domain), + RunSpec::ConfigValidate { domain, value }, + 
) + .await; + }); + + Ok(inserted) + } + + pub fn start_config_rollback( + &self, + state: AppState, + principal: &Principal, + domain: ConfigDomain, + reason: String, + idempotency_key: &str, + ) -> Result { + if let Some(existing) = self.jobs.get_idempotent(idempotency_key) { + return Ok(existing); + } + + let job_id = Uuid::new_v4(); + if !self.config_locks.try_lock(domain, job_id) { + return Err(StartJobError::TenantLocked); + } + + let now = now_ms(); + let job = Job { + job_id, + status: JobStatus::Pending, + steps: vec![ + step("rollback_config"), + step("reload_config"), + step("verify_config"), + ], + error: None, + created_at_ms: now, + started_at_ms: None, + finished_at_ms: None, + }; + + let inserted = self.jobs.insert_idempotent(idempotency_key, job); + self.audit.record(AuditEvent { + ts_ms: now, + principal_sub: principal.sub.clone(), + action: format!("config.{}.rollback", domain.as_str()), + tenant_id: None, + reason, + job_id: Some(inserted), + }); + + let engine = self.clone(); + tokio::spawn(async move { + engine + .run_job( + state, + inserted, + None, + Some(domain), + RunSpec::ConfigRollback { domain }, + ) + .await; + }); + + Ok(inserted) + } + + pub fn start_platform_verify( + &self, + state: AppState, + principal: &Principal, + reason: String, + idempotency_key: &str, + ) -> Result { + if let Some(existing) = self.jobs.get_idempotent(idempotency_key) { + return Ok(existing); + } + + let job_id = Uuid::new_v4(); + let now = now_ms(); + let job = Job { + job_id, + status: JobStatus::Pending, + steps: vec![step("preflight"), step("platform_verify")], + error: None, + created_at_ms: now, + started_at_ms: None, + finished_at_ms: None, + }; + + let inserted = self.jobs.insert_idempotent(idempotency_key, job); + self.audit.record(AuditEvent { + ts_ms: now, + principal_sub: principal.sub.clone(), + action: "platform.verify".to_string(), + tenant_id: None, + reason, + job_id: Some(inserted), + }); + + let engine = self.clone(); + tokio::spawn(async move { + engine + .run_job(state, inserted, None, None, RunSpec::PlatformVerify) + .await; + }); + + Ok(inserted) + } + + async fn run_job( + &self, + state: AppState, + job_id: Uuid, + tenant_id: Option, + config_domain: Option, + spec: RunSpec, + ) { self.jobs.update(job_id, |j| { j.status = JobStatus::Running; j.started_at_ms = Some(now_ms()); @@ -265,6 +534,9 @@ impl JobEngine { if let Some(tid) = tenant_id { self.tenant_locks.unlock(tid, job_id); } + if let Some(domain) = config_domain { + self.config_locks.unlock(domain, job_id); + } } } @@ -276,7 +548,22 @@ pub enum StartJobError { #[derive(Clone)] enum RunSpec { Drain, - Migrate { runner_target: String }, + Migrate { + runner_target: String, + }, + ConfigValidate { + domain: ConfigDomain, + value: serde_json::Value, + }, + ConfigApply { + domain: ConfigDomain, + expected_revision: Option, + value: serde_json::Value, + }, + ConfigRollback { + domain: ConfigDomain, + }, + PlatformVerify, } fn step(name: &str) -> JobStep { @@ -316,9 +603,14 @@ async fn run_step( "update_placement" => match spec { RunSpec::Migrate { runner_target } => { let tenant_id = tenant_id.ok_or_else(|| "missing tenant_id".to_string())?; + let entitlements = state.billing.get_for_tenant(tenant_id).entitlements; state .placement - .update_runner_target(tenant_id, runner_target.clone()) + .update_runner_target( + tenant_id, + runner_target.clone(), + entitlements.max_runners as usize, + ) .map(|_| ()) } _ => Ok(()), @@ -343,6 +635,400 @@ async fn run_step( } _ => Ok(()), }, + "validate_config" => match 
spec { + RunSpec::ConfigValidate { domain, value } + | RunSpec::ConfigApply { domain, value, .. } => match domain { + ConfigDomain::Routing => { + let cfg = serde_json::from_value::(value.clone()) + .map_err(|e| format!("invalid routing config: {e}"))?; + validate_routing_semantic(&cfg)?; + Ok(()) + } + ConfigDomain::Placement => { + let cfg = serde_json::from_value::(value.clone()) + .map_err(|e| format!("invalid placement config: {e}"))?; + validate_placement_semantic(state, &cfg)?; + Ok(()) + } + }, + _ => Ok(()), + }, + "backup_config" => match spec { + RunSpec::ConfigApply { domain, .. } => { + let Some(source) = state.config.source(*domain) else { + return Err("config domain not configured".to_string()); + }; + let (cur, _) = source + .load_bytes() + .await + .map_err(|e| format!("failed to load config: {e}"))?; + let cur = cur.unwrap_or_else(|| b"null".to_vec()); + let backup_key_value = serde_json::json!({ "backup": serde_json::from_slice::(&cur).unwrap_or(serde_json::Value::Null) }); + let bytes = + serde_json::to_vec_pretty(&backup_key_value).map_err(|e| e.to_string())?; + let backup_source = backup_source_for(&source.info(), *domain) + .await + .map_err(|e| format!("failed to build backup source: {e}"))?; + let _ = backup_source + .put_bytes(None, bytes) + .await + .map_err(|e| format!("failed to write backup: {e}"))?; + Ok(()) + } + _ => Ok(()), + }, + "apply_config" => match spec { + RunSpec::ConfigApply { + domain, + expected_revision, + value, + } => { + let Some(source) = state.config.source(*domain) else { + return Err("config domain not configured".to_string()); + }; + let bytes = + serde_json::to_vec_pretty(value).map_err(|e| format!("encode error: {e}"))?; + let _ = source + .put_bytes(*expected_revision, bytes) + .await + .map_err(|e| format!("apply failed: {e}"))?; + Ok(()) + } + _ => Ok(()), + }, + "rollback_config" => match spec { + RunSpec::ConfigRollback { domain } => { + let Some(source) = state.config.source(*domain) else { + return Err("config domain not configured".to_string()); + }; + let backup_source = backup_source_for(&source.info(), *domain) + .await + .map_err(|e| format!("failed to build backup source: {e}"))?; + let (bytes, _) = backup_source + .load_bytes() + .await + .map_err(|e| format!("failed to load backup: {e}"))?; + let Some(bytes) = bytes else { + return Err("no backup available".to_string()); + }; + let v: serde_json::Value = serde_json::from_slice(&bytes) + .map_err(|e| format!("invalid backup json: {e}"))?; + let backup = v.get("backup").cloned().unwrap_or(serde_json::Value::Null); + let next = + serde_json::to_vec_pretty(&backup).map_err(|e| format!("encode error: {e}"))?; + let _ = source + .put_bytes(None, next) + .await + .map_err(|e| format!("rollback failed: {e}"))?; + Ok(()) + } + _ => Ok(()), + }, + "reload_config" => Ok(()), + "verify_config" => match spec { + RunSpec::ConfigValidate { domain, .. } + | RunSpec::ConfigApply { domain, .. 
} + | RunSpec::ConfigRollback { domain } => { + let Some(source) = state.config.source(*domain) else { + return Err("config domain not configured".to_string()); + }; + let (bytes, _) = source + .load_bytes() + .await + .map_err(|e| format!("failed to load config: {e}"))?; + let bytes = bytes.unwrap_or_else(|| b"null".to_vec()); + let v: serde_json::Value = serde_json::from_slice(&bytes) + .map_err(|e| format!("invalid stored json: {e}"))?; + match domain { + ConfigDomain::Routing => { + let cfg = serde_json::from_value::(v) + .map_err(|e| format!("invalid routing config: {e}"))?; + validate_routing_semantic(&cfg)?; + Ok(()) + } + ConfigDomain::Placement => { + let cfg = serde_json::from_value::(v) + .map_err(|e| format!("invalid placement config: {e}"))?; + validate_placement_semantic(state, &cfg)?; + Ok(()) + } + } + } + _ => Ok(()), + }, + "platform_verify" => match spec { + RunSpec::PlatformVerify => { + let snapshots = fleet::snapshot(&state.http, &state.fleet_services).await; + let bad: Vec<_> = snapshots + .into_iter() + .filter(|s| !(s.health_ok && s.ready_ok)) + .map(|s| { + format!( + "{} health_ok={} ready_ok={}", + s.name, s.health_ok, s.ready_ok + ) + }) + .collect(); + if !bad.is_empty() { + return Err(format!("platform verify failed: {}", bad.join("; "))); + } + Ok(()) + } + _ => Ok(()), + }, _ => Ok(()), } } + +async fn backup_source_for( + info: &crate::config_registry::ConfigSourceInfo, + domain: ConfigDomain, +) -> Result, ConfigRegistryError> { + use crate::config_registry::{ConfigSource, FileSource, NatsKvSource}; + match info { + crate::config_registry::ConfigSourceInfo::File { path } => Ok(Arc::new(FileSource::new( + PathBuf::from(path).with_extension(format!("{}.bak.json", domain.as_str())), + )) + as Arc), + crate::config_registry::ConfigSourceInfo::NatsKv { bucket, key } => { + let nats_url = std::env::var("CONTROL_CONFIG_NATS_URL").map_err(|_| { + ConfigRegistryError::Source("missing CONTROL_CONFIG_NATS_URL".to_string()) + })?; + Ok(Arc::new( + NatsKvSource::connect(nats_url, bucket.clone(), format!("{key}.bak")) + .await + .map_err(|e| ConfigRegistryError::Source(e.to_string()))?, + ) as Arc) + } + crate::config_registry::ConfigSourceInfo::Fixed => Err(ConfigRegistryError::Source( + "no backups for fixed source".to_string(), + )), + } +} + +fn validate_routing_semantic(cfg: &RoutingConfig) -> Result<(), String> { + let shard_maps = [ + ("aggregate_shards", &cfg.aggregate_shards), + ("projection_shards", &cfg.projection_shards), + ("runner_shards", &cfg.runner_shards), + ]; + for (name, map) in shard_maps { + for (shard_id, endpoints) in map { + if endpoints.is_empty() { + return Err(format!("{name}[{shard_id}] has no endpoints")); + } + for ep in endpoints { + let u = Url::parse(ep) + .map_err(|e| format!("{name}[{shard_id}] invalid endpoint {ep:?}: {e}"))?; + if u.scheme() != "http" && u.scheme() != "https" { + return Err(format!( + "{name}[{shard_id}] endpoint {ep:?} must be http(s)" + )); + } + if u.host_str().is_none() { + return Err(format!( + "{name}[{shard_id}] endpoint {ep:?} must include host" + )); + } + } + } + } + + // Ensure placement references known shard ids. 
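+    // For example, an entry like `"aggregate_placement": {"tenant-a": "shard-x"}` is only
+    // accepted when `aggregate_shards` contains a "shard-x" key (whose endpoints were
+    // already validated above); the tenant and shard names here are purely illustrative.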
+ let placements = [ + ( + "aggregate_placement", + &cfg.aggregate_placement, + &cfg.aggregate_shards, + ), + ( + "projection_placement", + &cfg.projection_placement, + &cfg.projection_shards, + ), + ( + "runner_placement", + &cfg.runner_placement, + &cfg.runner_shards, + ), + ]; + for (pname, pmap, shards) in placements { + for (tenant, shard_id) in pmap { + if shard_id.trim().is_empty() { + return Err(format!("{pname}[{tenant}] shard_id is empty")); + } + if !shards.contains_key(shard_id) { + return Err(format!( + "{pname}[{tenant}] references missing shard_id {shard_id:?}" + )); + } + } + } + Ok(()) +} + +fn validate_placement_semantic(state: &AppState, cfg: &PlacementFile) -> Result<(), String> { + if !state.billing_enforcement_enabled { + return Ok(()); + } + + let mut tenant_counts = std::collections::HashMap::new(); + + let kinds = [ + ("aggregate_placement", cfg.aggregate_placement.as_ref()), + ("projection_placement", cfg.projection_placement.as_ref()), + ("runner_placement", cfg.runner_placement.as_ref()), + ]; + for (kind_name, k) in kinds { + let Some(k) = k else { continue }; + for p in &k.placements { + if p.targets.is_empty() { + return Err(format!("{kind_name} tenant {} has no targets", p.tenant_id)); + } + if p.targets.iter().any(|t| t.trim().is_empty()) { + return Err(format!( + "{kind_name} tenant {} has empty target", + p.tenant_id + )); + } + + let entry = tenant_counts.entry(p.tenant_id).or_insert((0, 0)); // (deployments, runners) + if kind_name == "runner_placement" { + entry.1 += p.targets.len(); + } else { + entry.0 += p.targets.len(); + } + } + } + + for (tenant_id, (deployments, runners)) in tenant_counts { + let entitlements = state.billing.get_for_tenant(tenant_id).entitlements; + if deployments > entitlements.max_deployments as usize { + return Err(format!( + "tenant {} exceeds max_deployments limit ({} > {})", + tenant_id, deployments, entitlements.max_deployments + )); + } + if runners > entitlements.max_runners as usize { + return Err(format!( + "tenant {} exceeds max_runners limit ({} > {})", + tenant_id, runners, entitlements.max_runners + )); + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::billing::{BillingStore, Plan, SubscriptionStatus, TenantBillingState}; + use crate::placement::{PlacementFile, PlacementKind, TenantPlacement}; + + fn mock_state(billing: BillingStore) -> AppState { + let handle = crate::get_test_prometheus_handle(); + let root = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")); + AppState { + prometheus: handle, + auth: crate::AuthConfig { + hs256_secret: Some(b"secret".to_vec()), + }, + jobs: JobStore::default(), + audit: AuditStore::default(), + tenant_locks: TenantLocks::default(), + config_locks: ConfigLocks::default(), + http: reqwest::Client::new(), + placement: crate::placement::PlacementStore::new( + std::env::temp_dir().join("placement.json"), + ), + billing, + billing_provider: Arc::new(crate::billing::MockProvider), + billing_enforcement_enabled: true, + config: crate::config_registry::ConfigRegistry::new(None, None), + fleet_services: vec![], + swarm: crate::swarm::SwarmStore::new(root.join("swarm/dev.json")), + docs: None, + } + } + + #[test] + fn test_validate_placement_limits() { + let tenant_id = Uuid::new_v4(); + let billing_path = + std::env::temp_dir().join(format!("billing-unit-{}.json", Uuid::new_v4())); + let billing = BillingStore::new(billing_path.clone()); + + let state = mock_state(billing.clone()); + + // 1. 
Free plan (default): max_deployments=1, max_runners=1 + let cfg = PlacementFile { + revision: Some("v1".to_string()), + aggregate_placement: Some(PlacementKind { + placements: vec![TenantPlacement { + tenant_id, + targets: vec!["a1".to_string()], + }], + }), + projection_placement: Some(PlacementKind { + placements: vec![TenantPlacement { + tenant_id, + targets: vec!["p1".to_string()], + }], + }), + runner_placement: Some(PlacementKind { + placements: vec![TenantPlacement { + tenant_id, + targets: vec!["r1".to_string()], + }], + }), + }; + + // aggregate(1) + projection(1) = 2 deployments. Limit is 1. Should fail. + let err = validate_placement_semantic(&state, &cfg).unwrap_err(); + assert!(err.contains("exceeds max_deployments limit")); + + // 2. Reduce to 1 deployment + let cfg2 = PlacementFile { + revision: Some("v2".to_string()), + aggregate_placement: Some(PlacementKind { + placements: vec![TenantPlacement { + tenant_id, + targets: vec!["a1".to_string()], + }], + }), + projection_placement: None, + runner_placement: Some(PlacementKind { + placements: vec![TenantPlacement { + tenant_id, + targets: vec!["r1".to_string()], + }], + }), + }; + validate_placement_semantic(&state, &cfg2).unwrap(); + + // 3. Upgrade to Pro: max_deployments=10, max_runners=10 + billing + .update_tenant_state( + tenant_id, + TenantBillingState { + provider: "mock".to_string(), + provider_customer_id: None, + provider_subscription_id: None, + provider_checkout_session_id: None, + status: Some(SubscriptionStatus::Active), + plan: Some(Plan::Pro), + current_period_end: None, + cancel_at_period_end: None, + processed_webhook_event_ids: vec![], + updated_at: 100, + }, + ) + .unwrap(); + + // Now the first cfg should pass + validate_placement_semantic(&state, &cfg).unwrap(); + + let _ = std::fs::remove_file(billing_path); + } +} diff --git a/control/api/src/lib.rs b/control/api/src/lib.rs index 830fa94..7f922b7 100644 --- a/control/api/src/lib.rs +++ b/control/api/src/lib.rs @@ -1,14 +1,22 @@ mod admin; mod audit; mod auth; +pub mod billing; mod build_info; +pub mod config_registry; +mod config_schemas; mod deployments; +mod documents; +mod drift; mod fleet; mod job_engine; mod jobs; mod placement; +pub mod s3_docs; mod swarm; +use std::sync::Arc; + pub use audit::AuditStore; pub use auth::{AuthConfig, Principal}; use axum::{ @@ -20,8 +28,10 @@ use axum::{ routing::get, }; pub use build_info::{BuildInfo, extract_build_info}; +pub use config_registry::{ConfigDomain, ConfigRegistry}; pub use deployments::{DeployAnnotationArgs, GrafanaAnnotation, build_grafana_deploy_annotation}; pub use fleet::FleetService; +pub use job_engine::ConfigLocks; pub use job_engine::TenantLocks; pub use jobs::JobStore; use metrics_exporter_prometheus::PrometheusHandle; @@ -40,10 +50,16 @@ pub struct AppState { pub jobs: JobStore, pub audit: AuditStore, pub tenant_locks: TenantLocks, + pub config_locks: ConfigLocks, pub http: reqwest::Client, pub placement: PlacementStore, + pub billing: billing::BillingStore, + pub billing_provider: Arc, + pub billing_enforcement_enabled: bool, + pub config: ConfigRegistry, pub fleet_services: Vec, pub swarm: SwarmStore, + pub docs: Option, } #[derive(Clone, Debug)] @@ -93,13 +109,18 @@ pub fn build_app(state: AppState) -> Router { }, ); - let admin = - admin::admin_router().layer(from_fn_with_state(state.clone(), auth::auth_middleware)); + let admin = admin::admin_router() + .merge(documents::router()) + .layer(from_fn_with_state(state.clone(), auth::auth_middleware)); Router::new() .route("/health", 
get(health)) .route("/ready", get(ready)) .route("/metrics", get(metrics)) + .route( + "/admin/v1/billing/webhooks/{provider}", + axum::routing::post(billing::webhook), + ) .nest("/admin/v1", admin) .with_state(state) .layer(trace) @@ -167,25 +188,46 @@ async fn request_id_middleware(mut req: Request, next: Next) - res } +#[cfg(test)] +static TEST_PROMETHEUS_HANDLE: std::sync::OnceLock = std::sync::OnceLock::new(); + +#[cfg(test)] +pub(crate) fn get_test_prometheus_handle() -> PrometheusHandle { + TEST_PROMETHEUS_HANDLE + .get_or_init(|| { + metrics_exporter_prometheus::PrometheusBuilder::new() + .install_recorder() + .unwrap_or_else(|_| { + // This can happen if another test already installed it. + // We might not get the ACTUAL handle to the global recorder here if we don't share it, + // but for tests it's usually fine to have a dummy one if we are not asserting on metrics. + metrics_exporter_prometheus::PrometheusBuilder::new() + .build() + .expect("failed to build prometheus recorder") + .0 + .handle() + }) + }) + .clone() +} + #[cfg(test)] mod tests { use super::*; + use crate::config_registry::{FileSource, FixedSource}; use crate::jobs::JobStatus; use axum::{ body::Body, http::{Request, StatusCode, header}, }; use jsonwebtoken::{EncodingKey, Header, encode}; - use metrics_exporter_prometheus::PrometheusBuilder; use serde::Serialize; use std::fs; use std::path::PathBuf; - use std::sync::OnceLock; + use std::sync::Arc; use tower::ServiceExt; use uuid::Uuid; - static HANDLE: OnceLock = OnceLock::new(); - #[derive(Serialize)] struct TestClaims { sub: String, @@ -199,15 +241,10 @@ mod tests { } fn test_app_with_fleet(fleet_services: Vec) -> Router { - let handle = HANDLE - .get_or_init(|| { - PrometheusBuilder::new() - .install_recorder() - .expect("failed to install prometheus recorder") - }) - .clone(); + let handle = get_test_prometheus_handle(); let placement_path = temp_placement_file(); + let root = repo_root(); build_app(AppState { prometheus: handle, @@ -217,10 +254,23 @@ mod tests { jobs: JobStore::default(), audit: AuditStore::default(), tenant_locks: TenantLocks::default(), + config_locks: ConfigLocks::default(), http: reqwest::Client::new(), placement: PlacementStore::new(placement_path), + billing: crate::billing::BillingStore::new( + std::env::temp_dir().join(format!("billing-test-{}.json", Uuid::new_v4())), + ), + billing_provider: Arc::new(crate::billing::MockProvider), + billing_enforcement_enabled: true, + config: ConfigRegistry::new( + Some(Arc::new(FileSource::new( + root.join("config/routing/dev.json"), + ))), + Some(Arc::new(FixedSource::new(b"{}".to_vec()))), + ), fleet_services, swarm: SwarmStore::new(repo_root().join("swarm/dev.json")), + docs: None, }) } @@ -234,14 +284,14 @@ mod tests { fn temp_placement_file() -> PathBuf { let root = repo_root(); - let src = root.join("placement/dev.json"); + let src = root.join("config/placement/dev.json"); let mut dst = std::env::temp_dir(); dst.push(format!( "cloudlysis-control-placement-{}-{}.json", std::process::id(), Uuid::new_v4() )); - let raw = fs::read_to_string(src).expect("missing placement/dev.json"); + let raw = fs::read_to_string(src).expect("missing config/placement/dev.json"); fs::write(&dst, raw).expect("failed to write temp placement file"); dst } @@ -689,4 +739,467 @@ mod tests { &serde_json::json!(["preflight", "drain", "update_placement", "reload", "verify"]) ); } + + #[tokio::test] + async fn billing_returns_not_configured_by_default() { + let token = make_token(&["control:read"]); + let tenant_id = 
Uuid::new_v4(); + let res = test_app() + .oneshot( + Request::builder() + .uri(format!("/admin/v1/tenants/{tenant_id}/billing")) + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("x-tenant-id", tenant_id.to_string()) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(res.status(), StatusCode::OK); + let body = axum::body::to_bytes(res.into_body(), 1024 * 1024) + .await + .unwrap(); + let v: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert_eq!(v.get("configured").unwrap(), &serde_json::json!(false)); + assert_eq!( + v.get("entitlements") + .unwrap() + .get("max_deployments") + .unwrap(), + &serde_json::json!(1) + ); + } + + #[tokio::test] + async fn billing_returns_configured_state() { + let token = make_token(&["control:read"]); + let tenant_id = Uuid::new_v4(); + + let handle = get_test_prometheus_handle(); + + let billing_path = + std::env::temp_dir().join(format!("billing-test-cfg-{}.json", Uuid::new_v4())); + let billing = crate::billing::BillingStore::new(billing_path.clone()); + + billing + .update_tenant_state( + tenant_id, + crate::billing::TenantBillingState { + provider: "stripe".to_string(), + provider_customer_id: Some("cus_123".to_string()), + provider_subscription_id: Some("sub_123".to_string()), + provider_checkout_session_id: None, + status: Some(crate::billing::SubscriptionStatus::Active), + plan: Some(crate::billing::Plan::Pro), + current_period_end: Some("2026-04-30T00:00:00Z".to_string()), + cancel_at_period_end: Some(false), + processed_webhook_event_ids: vec![], + updated_at: 1234567890, + }, + ) + .unwrap(); + + let root = repo_root(); + let app = build_app(AppState { + prometheus: handle, + auth: AuthConfig { + hs256_secret: Some(b"test_secret".to_vec()), + }, + jobs: JobStore::default(), + audit: AuditStore::default(), + tenant_locks: TenantLocks::default(), + config_locks: ConfigLocks::default(), + http: reqwest::Client::new(), + placement: PlacementStore::new(temp_placement_file()), + billing, + billing_provider: Arc::new(crate::billing::MockProvider), + billing_enforcement_enabled: true, + config: ConfigRegistry::new( + Some(Arc::new(FileSource::new( + root.join("config/routing/dev.json"), + ))), + Some(Arc::new(FixedSource::new(b"{}".to_vec()))), + ), + fleet_services: vec![], + swarm: SwarmStore::new(repo_root().join("swarm/dev.json")), + docs: None, + }); + + let res = app + .oneshot( + Request::builder() + .uri(format!("/admin/v1/tenants/{tenant_id}/billing")) + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("x-tenant-id", tenant_id.to_string()) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(res.status(), StatusCode::OK); + let body = axum::body::to_bytes(res.into_body(), 1024 * 1024) + .await + .unwrap(); + let v: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert_eq!(v.get("configured").unwrap(), &serde_json::json!(true)); + assert_eq!(v.get("plan").unwrap(), &serde_json::json!("pro")); + assert_eq!( + v.get("entitlements") + .unwrap() + .get("max_deployments") + .unwrap(), + &serde_json::json!(10) + ); + + let _ = std::fs::remove_file(billing_path); + } + + #[tokio::test] + async fn checkout_returns_mock_url() { + let token = make_token(&["control:write"]); + let tenant_id = Uuid::new_v4(); + let body = serde_json::json!({ + "plan": "pro", + "return_path": "/custom-return" + }); + + let res = test_app() + .oneshot( + Request::builder() + .uri(format!("/admin/v1/tenants/{tenant_id}/billing/checkout")) + .method("POST") + 
.header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("x-tenant-id", tenant_id.to_string()) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(body.to_string())) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(res.status(), StatusCode::OK); + let body = axum::body::to_bytes(res.into_body(), 1024 * 1024) + .await + .unwrap(); + let v: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert_eq!( + v.get("url").unwrap(), + &serde_json::json!(format!("https://mock.stripe.com/checkout/{}", tenant_id)) + ); + } + + #[tokio::test] + async fn checkout_fails_if_already_active() { + let token = make_token(&["control:write"]); + let tenant_id = Uuid::new_v4(); + + // Setup app with active subscription + let billing_path = + std::env::temp_dir().join(format!("billing-test-active-{}.json", Uuid::new_v4())); + let billing = crate::billing::BillingStore::new(billing_path.clone()); + billing + .update_tenant_state( + tenant_id, + crate::billing::TenantBillingState { + provider: "mock".to_string(), + provider_customer_id: None, + provider_subscription_id: None, + provider_checkout_session_id: None, + status: Some(crate::billing::SubscriptionStatus::Active), + plan: Some(crate::billing::Plan::Pro), + current_period_end: None, + cancel_at_period_end: None, + processed_webhook_event_ids: vec![], + updated_at: 0, + }, + ) + .unwrap(); + + let handle = get_test_prometheus_handle(); + let root = repo_root(); + let app = build_app(AppState { + prometheus: handle, + auth: AuthConfig { + hs256_secret: Some(b"test_secret".to_vec()), + }, + jobs: JobStore::default(), + audit: AuditStore::default(), + tenant_locks: TenantLocks::default(), + config_locks: ConfigLocks::default(), + http: reqwest::Client::new(), + placement: PlacementStore::new(temp_placement_file()), + billing, + billing_provider: Arc::new(crate::billing::MockProvider), + billing_enforcement_enabled: true, + config: ConfigRegistry::new( + Some(Arc::new(FileSource::new( + root.join("config/routing/dev.json"), + ))), + Some(Arc::new(FixedSource::new(b"{}".to_vec()))), + ), + fleet_services: vec![], + swarm: SwarmStore::new(repo_root().join("swarm/dev.json")), + docs: None, + }); + + let body = serde_json::json!({ "plan": "pro" }); + let res = app + .oneshot( + Request::builder() + .uri(format!("/admin/v1/tenants/{tenant_id}/billing/checkout")) + .method("POST") + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("x-tenant-id", tenant_id.to_string()) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(body.to_string())) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(res.status(), StatusCode::CONFLICT); + let _ = std::fs::remove_file(billing_path); + } + + #[tokio::test] + async fn portal_returns_mock_url() { + let token = make_token(&["control:write"]); + let tenant_id = Uuid::new_v4(); + + let res = test_app() + .oneshot( + Request::builder() + .uri(format!("/admin/v1/tenants/{tenant_id}/billing/portal")) + .method("POST") + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("x-tenant-id", tenant_id.to_string()) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(res.status(), StatusCode::OK); + let body = axum::body::to_bytes(res.into_body(), 1024 * 1024) + .await + .unwrap(); + let v: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert_eq!( + v.get("url").unwrap(), + &serde_json::json!(format!("https://mock.stripe.com/portal/{}", tenant_id)) + ); + } + + #[tokio::test] + async fn 
webhook_updates_state_idempotently() { + let tenant_id = Uuid::new_v4(); + let event_id = "evt_123".to_string(); + + let app = test_app(); + + let event = crate::billing::BillingEvent::SubscriptionCreated { + tenant_id, + event_id: event_id.clone(), + provider_customer_id: "cus_123".to_string(), + provider_subscription_id: "sub_123".to_string(), + status: crate::billing::SubscriptionStatus::Active, + plan: crate::billing::Plan::Pro, + current_period_end: "2026-04-30T00:00:00Z".to_string(), + ts_ms: 1000, + }; + + let body = serde_json::to_string(&event).unwrap(); + + // 1. Send webhook + let res = app + .clone() + .oneshot( + Request::builder() + .uri("/admin/v1/billing/webhooks/mock") + .method("POST") + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(body.clone())) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(res.status(), StatusCode::OK); + + // 2. Verify state + let token = make_token(&["control:read"]); + let res = app + .clone() + .oneshot( + Request::builder() + .uri(format!("/admin/v1/tenants/{tenant_id}/billing")) + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("x-tenant-id", tenant_id.to_string()) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let body_bytes = axum::body::to_bytes(res.into_body(), 1024 * 1024) + .await + .unwrap(); + let v: serde_json::Value = serde_json::from_slice(&body_bytes).unwrap(); + assert_eq!(v.get("configured").unwrap(), &serde_json::json!(true)); + assert_eq!(v.get("plan").unwrap(), &serde_json::json!("pro")); + + // 3. Send same webhook again (idempotency) + let res = app + .clone() + .oneshot( + Request::builder() + .uri("/admin/v1/billing/webhooks/mock") + .method("POST") + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(body)) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(res.status(), StatusCode::OK); + } + + #[tokio::test] + async fn webhook_ignores_stale_events() { + let tenant_id = Uuid::new_v4(); + let app = test_app(); + + // 1. Send recent event (ts=2000) + let event1 = crate::billing::BillingEvent::SubscriptionUpdated { + tenant_id, + event_id: "evt_new".to_string(), + status: crate::billing::SubscriptionStatus::Active, + plan: crate::billing::Plan::Enterprise, + current_period_end: "2026-05-30T00:00:00Z".to_string(), + cancel_at_period_end: false, + ts_ms: 2000, + }; + + app.clone() + .oneshot( + Request::builder() + .uri("/admin/v1/billing/webhooks/mock") + .method("POST") + .body(Body::from(serde_json::to_string(&event1).unwrap())) + .unwrap(), + ) + .await + .unwrap(); + + // 2. Send stale event (ts=1000) + let event2 = crate::billing::BillingEvent::SubscriptionUpdated { + tenant_id, + event_id: "evt_old".to_string(), + status: crate::billing::SubscriptionStatus::PastDue, + plan: crate::billing::Plan::Pro, + current_period_end: "2026-04-30T00:00:00Z".to_string(), + cancel_at_period_end: false, + ts_ms: 1000, + }; + + app.clone() + .oneshot( + Request::builder() + .uri("/admin/v1/billing/webhooks/mock") + .method("POST") + .body(Body::from(serde_json::to_string(&event2).unwrap())) + .unwrap(), + ) + .await + .unwrap(); + + // 3. 
Verify state is still Enterprise + let token = make_token(&["control:read"]); + let res = app + .clone() + .oneshot( + Request::builder() + .uri(format!("/admin/v1/tenants/{tenant_id}/billing")) + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("x-tenant-id", tenant_id.to_string()) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let body_bytes = axum::body::to_bytes(res.into_body(), 1024 * 1024) + .await + .unwrap(); + let v: serde_json::Value = serde_json::from_slice(&body_bytes).unwrap(); + assert_eq!(v.get("plan").unwrap(), &serde_json::json!("enterprise")); + } + + #[tokio::test] + async fn s3_docs_requires_pro_plan() { + let token = make_token(&["control:read", "control:write"]); + let tenant_id = Uuid::new_v4(); + let app = test_app(); + + // 1. Try to list docs (Free plan by default) + let res = app + .clone() + .oneshot( + Request::builder() + .uri(format!("/admin/v1/tenants/{tenant_id}/docs")) + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("x-tenant-id", tenant_id.to_string()) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(res.status(), StatusCode::PAYMENT_REQUIRED); + + // 2. Update to Pro plan via webhook + let event = crate::billing::BillingEvent::SubscriptionCreated { + tenant_id, + event_id: "evt_pro".to_string(), + provider_customer_id: "cus_pro".to_string(), + provider_subscription_id: "sub_pro".to_string(), + status: crate::billing::SubscriptionStatus::Active, + plan: crate::billing::Plan::Pro, + current_period_end: "2099-01-01T00:00:00Z".to_string(), + ts_ms: 2000, + }; + app.clone() + .oneshot( + Request::builder() + .uri("/admin/v1/billing/webhooks/mock") + .method("POST") + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(serde_json::to_string(&event).unwrap())) + .unwrap(), + ) + .await + .unwrap(); + + // 3. Try to list docs again (Should fail with 503 if S3 not configured in tests, or 200/502 if it is) + // In test_app(), docs is None by default. + let res = app + .clone() + .oneshot( + Request::builder() + .uri(format!("/admin/v1/tenants/{tenant_id}/docs")) + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("x-tenant-id", tenant_id.to_string()) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + // Since docs is None in test_app(), it returns SERVICE_UNAVAILABLE (503) AFTER passing the entitlement check. + // If it was still PAYMENT_REQUIRED, it would return 402. 
+ assert_eq!(res.status(), StatusCode::SERVICE_UNAVAILABLE); + } } diff --git a/control/api/src/main.rs b/control/api/src/main.rs index 4b9815f..1fbd465 100644 --- a/control/api/src/main.rs +++ b/control/api/src/main.rs @@ -1,6 +1,8 @@ use clap::Parser; use metrics_exporter_prometheus::PrometheusBuilder; use std::net::SocketAddr; +use std::path::PathBuf; +use std::sync::Arc; use tracing_subscriber::EnvFilter; #[derive(Parser, Debug)] @@ -33,16 +35,32 @@ async fn main() { .build() .expect("failed to build http client"); - let placement_path = std::env::var("CONTROL_PLACEMENT_PATH") + let placement_path: PathBuf = std::env::var("CONTROL_PLACEMENT_PATH") .ok() - .unwrap_or_else(|| "placement/dev.json".to_string()) + .unwrap_or_else(|| "config/placement/dev.json".to_string()) .into(); - let swarm_path = std::env::var("CONTROL_SWARM_STATE_PATH") + let billing_path: PathBuf = std::env::var("CONTROL_BILLING_STATE_PATH") .ok() - .unwrap_or_else(|| "swarm/dev.json".to_string()) + .unwrap_or_else(|| "billing/dev.json".to_string()) .into(); + let routing_path: PathBuf = std::env::var("CONTROL_ROUTING_PATH") + .ok() + .unwrap_or_else(|| "config/routing/dev.json".to_string()) + .into(); + + let swarm_mode = std::env::var("CONTROL_SWARM_MODE").ok(); + let swarm = if swarm_mode.as_deref() == Some("docker") { + api::SwarmStore::new_docker_cli() + } else { + let swarm_path: PathBuf = std::env::var("CONTROL_SWARM_STATE_PATH") + .ok() + .unwrap_or_else(|| "swarm/dev.json".to_string()) + .into(); + api::SwarmStore::new(swarm_path) + }; + let self_url = std::env::var("CONTROL_SELF_URL") .ok() .unwrap_or_else(|| "http://127.0.0.1:8080".to_string()); @@ -55,7 +73,70 @@ async fn main() { fleet_services.extend(parse_fleet_services(&spec)); } - let app = api::build_app(api::AppState { + let docs_cfg = + api::s3_docs::DocsConfig::from_env().expect("missing S3 document storage configuration"); + let docs = api::s3_docs::DocsStore::new(docs_cfg) + .await + .expect("failed to initialize S3 document storage client"); + + let config = { + let routing = if let (Ok(nats_url), Ok(bucket), Ok(key)) = ( + std::env::var("CONTROL_ROUTING_NATS_URL"), + std::env::var("CONTROL_ROUTING_NATS_BUCKET"), + std::env::var("CONTROL_ROUTING_NATS_KEY"), + ) { + Some(Arc::new( + api::config_registry::NatsKvSource::connect(nats_url, bucket, key) + .await + .expect("failed to connect to routing config nats kv"), + ) as Arc) + } else { + Some( + Arc::new(api::config_registry::FileSource::new(routing_path)) + as Arc, + ) + }; + + let placement = if let (Ok(nats_url), Ok(bucket), Ok(key)) = ( + std::env::var("CONTROL_PLACEMENT_NATS_URL"), + std::env::var("CONTROL_PLACEMENT_NATS_BUCKET"), + std::env::var("CONTROL_PLACEMENT_NATS_KEY"), + ) { + Some(Arc::new( + api::config_registry::NatsKvSource::connect(nats_url, bucket, key) + .await + .expect("failed to connect to placement config nats kv"), + ) as Arc) + } else { + Some(Arc::new(api::config_registry::FileSource::new( + placement_path.clone(), + )) + as Arc) + }; + + api::ConfigRegistry::new(routing, placement) + }; + + let billing_provider: Arc = + match std::env::var("CONTROL_BILLING_PROVIDER").as_deref() { + Ok("stripe") => { + let secret_key = std::env::var("CONTROL_STRIPE_SECRET_KEY") + .expect("CONTROL_STRIPE_SECRET_KEY required for stripe provider"); + let price_pro = std::env::var("CONTROL_STRIPE_PRICE_ID_PRO") + .expect("CONTROL_STRIPE_PRICE_ID_PRO required for stripe provider"); + let price_enterprise = std::env::var("CONTROL_STRIPE_PRICE_ID_ENTERPRISE") + 
.expect("CONTROL_STRIPE_PRICE_ID_ENTERPRISE required for stripe provider"); + + Arc::new(api::billing::StripeProvider { + secret_key, + price_pro, + price_enterprise, + }) + } + _ => Arc::new(api::billing::MockProvider), + }; + + let state = api::AppState { prometheus: recorder, auth: api::AuthConfig { hs256_secret: std::env::var("CONTROL_GATEWAY_JWT_HS256_SECRET") @@ -65,11 +146,25 @@ async fn main() { jobs: api::JobStore::default(), audit: api::AuditStore::default(), tenant_locks: api::TenantLocks::default(), + config_locks: api::ConfigLocks::default(), http, placement: api::PlacementStore::new(placement_path), + billing: api::billing::BillingStore::new(billing_path), + billing_provider, + billing_enforcement_enabled: std::env::var("CONTROL_BILLING_ENFORCEMENT_ENABLED") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(false), + config, fleet_services, - swarm: api::SwarmStore::new(swarm_path), - }); + swarm, + docs: Some(docs), + }; + + // Spawn reconciliation loop + tokio::spawn(api::billing::run_reconciliation_loop(state.clone())); + + let app = api::build_app(state); let listener = tokio::net::TcpListener::bind(args.addr) .await diff --git a/control/api/src/placement.rs b/control/api/src/placement.rs index dced748..a2ea76c 100644 --- a/control/api/src/placement.rs +++ b/control/api/src/placement.rs @@ -157,6 +157,7 @@ impl PlacementStore { &self, tenant_id: Uuid, runner_target: String, + max_runners: usize, ) -> Result { let mut inner = self.inner.write().expect("placement lock poisoned"); inner.reload_if_changed(); @@ -178,8 +179,17 @@ impl PlacementStore { .iter_mut() .find(|p| p.tenant_id == tenant_id) { + // If already at or above limit, and we are adding a NEW target (not replacing), it would fail. + // But here update_runner_target REPLACES the target list with a single target for now. + // If in the future we want to append, we check targets.len(). 
+ if 1 > max_runners { + return Err(format!("exceeds max_runners limit of {}", max_runners)); + } existing.targets = vec![runner_target]; } else { + if 1 > max_runners { + return Err(format!("exceeds max_runners limit of {}", max_runners)); + } runner.placements.push(TenantPlacement { tenant_id, targets: vec![runner_target], diff --git a/control/api/src/s3_docs.rs b/control/api/src/s3_docs.rs new file mode 100644 index 0000000..64e542b --- /dev/null +++ b/control/api/src/s3_docs.rs @@ -0,0 +1,508 @@ +use aws_config::Region; +use aws_credential_types::Credentials; +use aws_sdk_s3::presigning::PresigningConfig; +use aws_sdk_s3::types::BucketCannedAcl; +use aws_sdk_s3::{Client, config::Builder as S3ConfigBuilder}; +use sha2::Digest; +use std::time::Duration; + +#[derive(Clone, Debug)] +pub struct DocsConfig { + pub endpoint: String, + pub public_endpoint: Option, + pub region: String, + pub access_key_id: String, + pub secret_access_key: String, + pub force_path_style: bool, + pub insecure: bool, + pub buckets: Vec, + pub prefix: String, +} + +impl DocsConfig { + pub fn from_env() -> Result { + fn get(name: &str) -> Option { + std::env::var(name) + .ok() + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + } + + fn get_secret(name: &str, file_name: &str) -> Result, String> { + if let Some(path) = get(file_name) { + let raw = std::fs::read_to_string(path).map_err(|e| e.to_string())?; + let v = raw.trim().to_string(); + if v.is_empty() { + return Ok(None); + } + return Ok(Some(v)); + } + Ok(get(name)) + } + + let endpoint = get("CONTROL_S3_ENDPOINT") + .or_else(|| get("S3_ENDPOINT")) + .ok_or_else(|| "Missing CONTROL_S3_ENDPOINT".to_string())?; + let public_endpoint = + get("CONTROL_S3_PUBLIC_ENDPOINT").or_else(|| get("S3_PUBLIC_ENDPOINT")); + let region = get("CONTROL_S3_REGION") + .or_else(|| get("S3_REGION")) + .unwrap_or_else(|| "us-east-1".to_string()); + let access_key_id = + get_secret("CONTROL_S3_ACCESS_KEY_ID", "CONTROL_S3_ACCESS_KEY_ID_FILE")? + .or_else(|| { + get_secret("S3_ACCESS_KEY_ID", "S3_ACCESS_KEY_ID_FILE") + .ok() + .flatten() + }) + .ok_or_else(|| "Missing CONTROL_S3_ACCESS_KEY_ID".to_string())?; + let secret_access_key = get_secret( + "CONTROL_S3_SECRET_ACCESS_KEY", + "CONTROL_S3_SECRET_ACCESS_KEY_FILE", + )? + .or_else(|| { + get_secret("S3_SECRET_ACCESS_KEY", "S3_SECRET_ACCESS_KEY_FILE") + .ok() + .flatten() + }) + .ok_or_else(|| "Missing CONTROL_S3_SECRET_ACCESS_KEY".to_string())?; + let force_path_style = get("CONTROL_S3_FORCE_PATH_STYLE") + .or_else(|| get("S3_FORCE_PATH_STYLE")) + .as_deref() + .map(|v| v == "true" || v == "1") + .unwrap_or(true); + let insecure = get("CONTROL_S3_INSECURE") + .or_else(|| get("S3_INSECURE")) + .as_deref() + .map(|v| v == "true" || v == "1") + .unwrap_or(false); + + let bucket_raw = get("CONTROL_S3_BUCKET_DOCS") + .or_else(|| get("S3_BUCKET_DOCS")) + .ok_or_else(|| "Missing CONTROL_S3_BUCKET_DOCS".to_string())?; + let buckets: Vec = bucket_raw + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + if buckets.is_empty() { + return Err("Missing CONTROL_S3_BUCKET_DOCS".to_string()); + } + let prefix = get("CONTROL_S3_PREFIX_DOCS") + .or_else(|| get("S3_PREFIX_DOCS")) + .unwrap_or_else(|| "docs/".to_string()); + let prefix = if prefix.ends_with('/') { + prefix + } else { + format!("{prefix}/") + }; + + // SECURITY: `*_INSECURE=true` is intended for local MinIO setups that use plain HTTP. + // We currently do not disable TLS certificate verification for HTTPS endpoints. 
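+        // So an http:// endpoint (e.g. a local MinIO at http://minio:9000) may set the flag,
+        // while combining the flag with an https:// endpoint is rejected below.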
+ if insecure && endpoint.trim_start().starts_with("https://") { + return Err( + "CONTROL_S3_INSECURE=true is not supported with https:// endpoints (TLS verification is not disabled). Use http:// for local MinIO, or set CONTROL_S3_INSECURE=false for production." + .to_string(), + ); + } + + Ok(Self { + endpoint, + public_endpoint, + region, + access_key_id, + secret_access_key, + force_path_style, + insecure, + buckets, + prefix, + }) + } +} + +#[derive(Clone)] +pub struct DocsStore { + cfg: DocsConfig, + client: Client, + presign_client: Client, +} + +impl DocsStore { + pub async fn new(cfg: DocsConfig) -> Result { + let creds = Credentials::new( + cfg.access_key_id.clone(), + cfg.secret_access_key.clone(), + None, + None, + "static", + ); + let shared = aws_config::from_env() + .region(Region::new(cfg.region.clone())) + .credentials_provider(creds.clone()) + .endpoint_url(cfg.endpoint.clone()) + .load() + .await; + + let s3_conf = S3ConfigBuilder::from(&shared) + .force_path_style(cfg.force_path_style) + .build(); + let client = Client::from_conf(s3_conf); + + let presign_endpoint = cfg + .public_endpoint + .clone() + .unwrap_or_else(|| cfg.endpoint.clone()); + let presign_shared = aws_config::from_env() + .region(Region::new(cfg.region.clone())) + .credentials_provider(creds) + .endpoint_url(presign_endpoint) + .load() + .await; + let presign_conf = S3ConfigBuilder::from(&presign_shared) + .force_path_style(cfg.force_path_style) + .build(); + let presign_client = Client::from_conf(presign_conf); + + Ok(Self { + cfg, + client, + presign_client, + }) + } + + pub fn key_for( + &self, + tenant_id: &str, + doc_type: &str, + doc_id: &str, + filename: &str, + ) -> Result { + validate_segment("tenant_id", tenant_id)?; + validate_segment("doc_type", doc_type)?; + validate_segment("doc_id", doc_id)?; + validate_filename(filename)?; + Ok(format!( + "{}{}/{}/{}/{}", + self.cfg.prefix, tenant_id, doc_type, doc_id, filename + )) + } + + pub fn prefix(&self) -> &str { + self.cfg.prefix.as_str() + } + + pub fn buckets(&self) -> &[String] { + self.cfg.buckets.as_slice() + } + + fn bucket_for_tenant(&self, tenant_id: &str) -> &str { + // Deterministic sharding across buckets. Note: if the bucket list changes, the mapping changes. + // For production, set the full planned bucket set up-front (e.g. `-0,-1,-2`) to keep mapping stable. 
+ let n = self.cfg.buckets.len(); + if n == 1 { + return self.cfg.buckets[0].as_str(); + } + let mut hasher = sha2::Sha256::new(); + hasher.update(tenant_id.as_bytes()); + let digest = hasher.finalize(); + let mut b = [0u8; 8]; + b.copy_from_slice(&digest[..8]); + let v = u64::from_be_bytes(b); + let idx = (v as usize) % n; + self.cfg.buckets[idx].as_str() + } + + pub fn content_hash_sha256_hex(bytes: &[u8]) -> String { + let mut hasher = sha2::Sha256::new(); + hasher.update(bytes); + let digest = hasher.finalize(); + let mut out = String::with_capacity(digest.len() * 2); + for b in digest { + use std::fmt::Write; + let _ = write!(&mut out, "{:02x}", b); + } + out + } + + pub async fn put_for_tenant( + &self, + tenant_id: &str, + key: &str, + bytes: Vec, + content_type: Option, + ) -> Result<(), String> { + let mut req = self + .client + .put_object() + .bucket(self.bucket_for_tenant(tenant_id)) + .key(key) + .body(aws_sdk_s3::primitives::ByteStream::from(bytes)); + if let Some(ct) = content_type { + req = req.content_type(ct); + } + req.send().await.map_err(|e| e.to_string())?; + Ok(()) + } + + pub async fn get_bytes_for_tenant( + &self, + tenant_id: &str, + key: &str, + ) -> Result<(Vec, Option), String> { + let out = self + .client + .get_object() + .bucket(self.bucket_for_tenant(tenant_id)) + .key(key) + .send() + .await + .map_err(|e| e.to_string())?; + let ct = out.content_type().map(|s| s.to_string()); + let bytes = out + .body + .collect() + .await + .map_err(|e| e.to_string())? + .into_bytes() + .to_vec(); + Ok((bytes, ct)) + } + + pub async fn delete_for_tenant(&self, tenant_id: &str, key: &str) -> Result<(), String> { + self.client + .delete_object() + .bucket(self.bucket_for_tenant(tenant_id)) + .key(key) + .send() + .await + .map_err(|e| e.to_string())?; + Ok(()) + } + + pub async fn list_for_tenant( + &self, + tenant_id: &str, + prefix: &str, + ) -> Result, String> { + let out = self + .client + .list_objects_v2() + .bucket(self.bucket_for_tenant(tenant_id)) + .prefix(prefix) + .send() + .await + .map_err(|e| e.to_string())?; + let mut items = Vec::new(); + for o in out.contents() { + if let Some(key) = o.key() { + items.push(DocObject { + key: key.to_string(), + size: o.size().unwrap_or(0), + last_modified: o.last_modified().map(|d| d.to_string()), + }); + } + } + Ok(items) + } + + pub async fn ensure_buckets_exist(&self) -> Result<(), String> { + for bucket in &self.cfg.buckets { + let head = self.client.head_bucket().bucket(bucket).send().await; + if head.is_ok() { + continue; + } + self.client + .create_bucket() + .bucket(bucket) + .acl(BucketCannedAcl::Private) + .send() + .await + .map_err(|e| e.to_string())?; + } + Ok(()) + } + + pub async fn presign_put_for_tenant( + &self, + tenant_id: &str, + key: &str, + content_type: Option, + expires: Duration, + ) -> Result { + let mut req = self + .presign_client + .put_object() + .bucket(self.bucket_for_tenant(tenant_id)) + .key(key); + if let Some(ct) = content_type { + req = req.content_type(ct); + } + let presigned = req + .presigned(PresigningConfig::expires_in(expires).map_err(|e| e.to_string())?) + .await + .map_err(|e| e.to_string())?; + Ok(presigned.uri().to_string()) + } + + pub async fn presign_get_for_tenant( + &self, + tenant_id: &str, + key: &str, + expires: Duration, + ) -> Result { + let req = self + .presign_client + .get_object() + .bucket(self.bucket_for_tenant(tenant_id)) + .key(key); + let presigned = req + .presigned(PresigningConfig::expires_in(expires).map_err(|e| e.to_string())?) 
+ .await + .map_err(|e| e.to_string())?; + Ok(presigned.uri().to_string()) + } +} + +#[derive(Clone, Debug, serde::Serialize)] +pub struct DocObject { + pub key: String, + pub size: i64, + pub last_modified: Option, +} + +fn validate_segment(name: &str, value: &str) -> Result<(), String> { + if value.is_empty() { + return Err(format!("{name} is required")); + } + if value.len() > 128 { + return Err(format!("{name} too long")); + } + if value.contains('/') || value.contains('\\') { + return Err(format!("{name} contains invalid characters")); + } + if value.contains("..") { + return Err(format!("{name} contains invalid characters")); + } + Ok(()) +} + +fn validate_filename(value: &str) -> Result<(), String> { + validate_segment("filename", value)?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn env_lock() -> std::sync::MutexGuard<'static, ()> { + static LOCK: std::sync::OnceLock> = std::sync::OnceLock::new(); + LOCK.get_or_init(|| std::sync::Mutex::new(())) + .lock() + .unwrap() + } + + #[test] + fn config_from_env_parses_expected_fields() { + let _guard = env_lock(); + unsafe { + std::env::set_var("CONTROL_S3_ENDPOINT", "http://minio:9000"); + std::env::set_var("CONTROL_S3_REGION", "us-east-1"); + std::env::set_var("CONTROL_S3_ACCESS_KEY_ID", "minioadmin"); + std::env::set_var("CONTROL_S3_SECRET_ACCESS_KEY", "minioadmin"); + std::env::set_var("CONTROL_S3_BUCKET_DOCS", "cloudlysis-docs"); + std::env::set_var("CONTROL_S3_PREFIX_DOCS", "docs/"); + std::env::set_var("CONTROL_S3_FORCE_PATH_STYLE", "true"); + std::env::set_var("CONTROL_S3_INSECURE", "true"); + } + + let cfg = DocsConfig::from_env().unwrap(); + assert_eq!(cfg.endpoint, "http://minio:9000"); + assert_eq!(cfg.buckets, vec!["cloudlysis-docs".to_string()]); + assert_eq!(cfg.prefix, "docs/"); + assert!(cfg.force_path_style); + assert!(cfg.insecure); + + unsafe { + std::env::remove_var("CONTROL_S3_ENDPOINT"); + std::env::remove_var("CONTROL_S3_REGION"); + std::env::remove_var("CONTROL_S3_ACCESS_KEY_ID"); + std::env::remove_var("CONTROL_S3_SECRET_ACCESS_KEY"); + std::env::remove_var("CONTROL_S3_BUCKET_DOCS"); + std::env::remove_var("CONTROL_S3_PREFIX_DOCS"); + std::env::remove_var("CONTROL_S3_FORCE_PATH_STYLE"); + std::env::remove_var("CONTROL_S3_INSECURE"); + } + } + + #[test] + fn config_rejects_insecure_with_https_endpoint() { + let _guard = env_lock(); + unsafe { + std::env::set_var("CONTROL_S3_ENDPOINT", "https://s3.example.com"); + std::env::set_var("CONTROL_S3_ACCESS_KEY_ID", "a"); + std::env::set_var("CONTROL_S3_SECRET_ACCESS_KEY", "b"); + std::env::set_var( + "CONTROL_S3_BUCKET_DOCS", + "cloudlysis-docs-0,cloudlysis-docs-1", + ); + std::env::set_var("CONTROL_S3_INSECURE", "true"); + } + let err = DocsConfig::from_env().unwrap_err(); + assert!( + err.contains("CONTROL_S3_INSECURE=true") && err.contains("https://"), + "unexpected error: {err}" + ); + unsafe { + std::env::remove_var("CONTROL_S3_ENDPOINT"); + std::env::remove_var("CONTROL_S3_ACCESS_KEY_ID"); + std::env::remove_var("CONTROL_S3_SECRET_ACCESS_KEY"); + std::env::remove_var("CONTROL_S3_BUCKET_DOCS"); + std::env::remove_var("CONTROL_S3_INSECURE"); + } + } + + #[tokio::test] + async fn key_scheme_is_stable() { + let cfg = DocsConfig { + endpoint: "http://minio:9000".to_string(), + public_endpoint: None, + region: "us-east-1".to_string(), + access_key_id: "x".to_string(), + secret_access_key: "y".to_string(), + force_path_style: true, + insecure: true, + buckets: vec![ + "cloudlysis-docs-0".to_string(), + "cloudlysis-docs-1".to_string(), + ], + prefix: 
"docs/".to_string(), + }; + let store = DocsStore::new(cfg).await.unwrap(); + + let key = store + .key_for("tenant-a", "deployments", "v1", "bundle.tar.gz") + .unwrap(); + assert_eq!(key, "docs/tenant-a/deployments/v1/bundle.tar.gz"); + } + + #[tokio::test] + async fn key_scheme_rejects_invalid_segments() { + let cfg = DocsConfig { + endpoint: "http://minio:9000".to_string(), + public_endpoint: None, + region: "us-east-1".to_string(), + access_key_id: "x".to_string(), + secret_access_key: "y".to_string(), + force_path_style: true, + insecure: true, + buckets: vec!["cloudlysis-docs".to_string()], + prefix: "docs/".to_string(), + }; + let store = DocsStore::new(cfg).await.unwrap(); + + assert!(store.key_for("t/a", "x", "y", "z").is_err()); + assert!(store.key_for("t", "x", "../y", "z").is_err()); + assert!(store.key_for("t", "x", "y", "a/b").is_err()); + } +} diff --git a/control/api/src/swarm.rs b/control/api/src/swarm.rs index 148f98d..ab2c0dd 100644 --- a/control/api/src/swarm.rs +++ b/control/api/src/swarm.rs @@ -28,31 +28,49 @@ pub struct SwarmStateFile { #[derive(Clone)] pub struct SwarmStore { - path: std::path::PathBuf, + inner: SwarmStoreInner, +} + +#[derive(Clone)] +enum SwarmStoreInner { + File { path: std::path::PathBuf }, + DockerCli, } impl SwarmStore { pub fn new(path: std::path::PathBuf) -> Self { - Self { path } + Self { + inner: SwarmStoreInner::File { path }, + } + } + + pub fn new_docker_cli() -> Self { + Self { + inner: SwarmStoreInner::DockerCli, + } } pub fn list_services(&self) -> Vec { - self.load().map(|s| s.services).unwrap_or_default() + match &self.inner { + SwarmStoreInner::File { path } => { + load_state(path).map(|s| s.services).unwrap_or_default() + } + SwarmStoreInner::DockerCli => list_services_docker_cli().unwrap_or_default(), + } } pub fn list_tasks(&self, service_name: &str) -> Vec { - self.load() - .map(|s| { - s.tasks - .into_iter() - .filter(|t| t.service == service_name) - .collect() - }) - .unwrap_or_default() - } - - fn load(&self) -> Option { - load_state(&self.path) + match &self.inner { + SwarmStoreInner::File { path } => load_state(path) + .map(|s| { + s.tasks + .into_iter() + .filter(|t| t.service == service_name) + .collect() + }) + .unwrap_or_default(), + SwarmStoreInner::DockerCli => list_tasks_docker_cli(service_name).unwrap_or_default(), + } } } @@ -60,3 +78,120 @@ fn load_state(path: &Path) -> Option { let raw = fs::read_to_string(path).ok()?; serde_json::from_str(&raw).ok() } + +fn list_services_docker_cli() -> Result, String> { + let out = std::process::Command::new("docker") + .args(["service", "ls", "--format", "{{json .}}"]) + .output() + .map_err(|e| format!("docker exec failed: {e}"))?; + if !out.status.success() { + return Err(format!( + "docker service ls failed: {}", + String::from_utf8_lossy(&out.stderr) + )); + } + + #[derive(Deserialize)] + struct ServiceRow { + #[serde(rename = "Name")] + name: String, + #[serde(rename = "Image")] + image: Option, + #[serde(rename = "Mode")] + mode: Option, + #[serde(rename = "Replicas")] + replicas: Option, + #[serde(rename = "UpdatedAt")] + updated_at: Option, + } + + let mut services = Vec::new(); + for line in String::from_utf8_lossy(&out.stdout).lines() { + let line = line.trim(); + if line.is_empty() { + continue; + } + let row: ServiceRow = + serde_json::from_str(line).map_err(|e| format!("invalid json row: {e}"))?; + services.push(SwarmService { + name: row.name, + image: row.image, + mode: row.mode, + replicas: row.replicas, + updated_at: row.updated_at, + }); + } + Ok(services) 
+} + +fn list_tasks_docker_cli(service_name: &str) -> Result, String> { + let out = std::process::Command::new("docker") + .args([ + "service", + "ps", + service_name, + "--no-trunc", + "--format", + "{{json .}}", + ]) + .output() + .map_err(|e| format!("docker exec failed: {e}"))?; + if !out.status.success() { + return Err(format!( + "docker service ps failed: {}", + String::from_utf8_lossy(&out.stderr) + )); + } + + #[derive(Deserialize)] + struct TaskRow { + #[serde(rename = "ID")] + id: String, + #[serde(rename = "Name")] + name: Option, + #[serde(rename = "Node")] + node: Option, + #[serde(rename = "DesiredState")] + desired_state: Option, + #[serde(rename = "CurrentState")] + current_state: Option, + #[serde(rename = "Error")] + error: Option, + } + + let mut tasks = Vec::new(); + for line in String::from_utf8_lossy(&out.stdout).lines() { + let line = line.trim(); + if line.is_empty() { + continue; + } + let row: TaskRow = + serde_json::from_str(line).map_err(|e| format!("invalid json row: {e}"))?; + let service = row + .name + .as_deref() + .and_then(|n| n.split_once('.').map(|(svc, _)| svc.to_string())) + .unwrap_or_else(|| service_name.to_string()); + tasks.push(SwarmTask { + id: row.id, + service, + node: row.node, + desired_state: row.desired_state, + current_state: row.current_state, + error: row.error, + }); + } + Ok(tasks) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn state_file_parses() { + let raw = r#"{"services":[{"name":"a","image":null,"mode":null,"replicas":null,"updated_at":null}],"tasks":[]}"#; + let parsed: SwarmStateFile = serde_json::from_str(raw).unwrap(); + assert_eq!(parsed.services.len(), 1); + } +} diff --git a/control/api/tests/billing_production_smoke_gated.rs b/control/api/tests/billing_production_smoke_gated.rs new file mode 100644 index 0000000..d198810 --- /dev/null +++ b/control/api/tests/billing_production_smoke_gated.rs @@ -0,0 +1,174 @@ +use api::{ + AppState, AuditStore, AuthConfig, ConfigLocks, JobStore, PlacementStore, SwarmStore, + TenantLocks, billing::BillingStore, config_registry::ConfigRegistry, +}; +use axum::{ + Router, + body::Body, + http::{Request, StatusCode, header}, +}; +use jsonwebtoken::{EncodingKey, Header, encode}; +use metrics_exporter_prometheus::PrometheusBuilder; +use serde::Serialize; +use std::{ + path::PathBuf, + sync::{Arc, OnceLock}, +}; +use tower::ServiceExt; +use uuid::Uuid; + +fn prod_enabled() -> bool { + std::env::var("CONTROL_TEST_BILLING_PROD").ok().as_deref() == Some("1") +} + +static HANDLE: OnceLock = OnceLock::new(); + +fn repo_root() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .and_then(|p| p.parent()) + .expect("api crate should live under repo root") + .to_path_buf() +} + +#[derive(Serialize)] +struct TestClaims { + sub: String, + session_id: String, + permissions: Vec, + exp: usize, +} + +fn make_token(secret: &[u8], perms: &[&str]) -> String { + let exp = (std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() + + 60) as usize; + encode( + &Header::default(), + &TestClaims { + sub: "user_1".to_string(), + session_id: "sess_1".to_string(), + permissions: perms.iter().map(|p| (*p).to_string()).collect(), + exp, + }, + &EncodingKey::from_secret(secret), + ) + .unwrap() +} + +fn test_app() -> Router { + let handle = HANDLE + .get_or_init(|| { + PrometheusBuilder::new() + .install_recorder() + .expect("failed to install prometheus recorder") + }) + .clone(); + + let provider_type = + 
std::env::var("CONTROL_BILLING_PROVIDER").unwrap_or_else(|_| "mock".to_string()); + let billing_provider: Arc = match provider_type.as_str() { + "stripe" => Arc::new(api::billing::StripeProvider { + secret_key: std::env::var("CONTROL_STRIPE_SECRET_KEY").unwrap_or_default(), + price_pro: std::env::var("CONTROL_STRIPE_PRICE_ID_PRO").unwrap_or_default(), + price_enterprise: std::env::var("CONTROL_STRIPE_PRICE_ID_ENTERPRISE") + .unwrap_or_default(), + }), + _ => Arc::new(api::billing::MockProvider), + }; + + api::build_app(AppState { + prometheus: handle, + auth: AuthConfig { + hs256_secret: Some(b"test_secret".to_vec()), + }, + jobs: JobStore::default(), + audit: AuditStore::default(), + tenant_locks: TenantLocks::default(), + config_locks: ConfigLocks::default(), + http: reqwest::Client::new(), + placement: PlacementStore::new(repo_root().join("config/placement/dev.json")), + billing: BillingStore::new(std::env::temp_dir().join("billing-prod-smoke.json")), + billing_provider, + billing_enforcement_enabled: true, + config: ConfigRegistry::new(None, None), + fleet_services: vec![], + swarm: SwarmStore::new(repo_root().join("swarm/dev.json")), + docs: None, + }) +} + +#[tokio::test] +async fn billing_production_smoke_test() { + if !prod_enabled() { + eprintln!("skipping: set CONTROL_TEST_BILLING_PROD=1 to enable production smoke tests"); + return; + } + + let app = test_app(); + let token = make_token(b"test_secret", &["control:read", "control:write"]); + let tenant_id = Uuid::new_v4(); + + // 1. Verify GET billing works (empty initially) + let res = app + .clone() + .oneshot( + Request::builder() + .uri(format!("/admin/v1/tenants/{tenant_id}/billing")) + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("x-tenant-id", tenant_id.to_string()) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(res.status(), StatusCode::OK); + + // 2. Verify Checkout session generation + let res = app + .clone() + .oneshot( + Request::builder() + .uri(format!("/admin/v1/tenants/{tenant_id}/billing/checkout")) + .method("POST") + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("x-tenant-id", tenant_id.to_string()) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from( + serde_json::json!({ + "plan": "pro", + "return_path": "/billing" + }) + .to_string(), + )) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(res.status(), StatusCode::OK); + + let body = axum::body::to_bytes(res.into_body(), 1024 * 1024) + .await + .unwrap(); + let v: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert!(v.get("url").and_then(|u| u.as_str()).is_some()); + + // 3. 
Verify Portal session generation (may fail if tenant has no stripe customer id yet, which is expected for fresh tenant) + let res = app + .clone() + .oneshot( + Request::builder() + .uri(format!("/admin/v1/tenants/{tenant_id}/billing/portal")) + .method("POST") + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("x-tenant-id", tenant_id.to_string()) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + // For smoke test, we just want to see it reached the provider and didn't crash + assert!(res.status() == StatusCode::OK || res.status() == StatusCode::INTERNAL_SERVER_ERROR); +} diff --git a/control/api/tests/config_nats_env_gated.rs b/control/api/tests/config_nats_env_gated.rs new file mode 100644 index 0000000..a0e2f8b --- /dev/null +++ b/control/api/tests/config_nats_env_gated.rs @@ -0,0 +1,250 @@ +use api::{ + AppState, AuditStore, AuthConfig, ConfigLocks, ConfigRegistry, JobStore, PlacementStore, + SwarmStore, TenantLocks, config_registry::NatsKvSource, +}; +use axum::{ + Router, + body::Body, + http::{Request, StatusCode, header}, +}; +use jsonwebtoken::{EncodingKey, Header, encode}; +use metrics_exporter_prometheus::PrometheusBuilder; +use serde::Serialize; +use std::{path::PathBuf, sync::OnceLock, time::Duration}; +use tower::ServiceExt; +use uuid::Uuid; + +fn enabled() -> bool { + std::env::var("CONTROL_TEST_NATS").ok().as_deref() == Some("1") + && std::env::var("CONTROL_TEST_NATS_URL").is_ok() +} + +#[derive(Serialize)] +struct TestClaims { + sub: String, + session_id: String, + permissions: Vec, + exp: usize, +} + +fn make_token(secret: &[u8], perms: &[&str]) -> String { + let exp = (std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() + + 60) as usize; + encode( + &Header::default(), + &TestClaims { + sub: "user_1".to_string(), + session_id: "sess_1".to_string(), + permissions: perms.iter().map(|p| (*p).to_string()).collect(), + exp, + }, + &EncodingKey::from_secret(secret), + ) + .unwrap() +} + +static HANDLE: OnceLock = OnceLock::new(); + +fn repo_root() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .and_then(|p| p.parent()) + .expect("api crate should live under repo root") + .to_path_buf() +} + +async fn wait_done(app: Router, job_id: Uuid, token: &str) -> serde_json::Value { + let start = tokio::time::Instant::now(); + loop { + let res = app + .clone() + .oneshot( + Request::builder() + .uri(format!("/admin/v1/jobs/{job_id}")) + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(res.status(), StatusCode::OK); + let body = axum::body::to_bytes(res.into_body(), 1024 * 1024) + .await + .unwrap(); + let job: serde_json::Value = serde_json::from_slice(&body).unwrap(); + let status = job + .get("status") + .and_then(|v| v.as_str()) + .unwrap_or("unknown"); + if status != "pending" && status != "running" { + return job; + } + + if start.elapsed() > Duration::from_secs(2) { + return job; + } + tokio::time::sleep(Duration::from_millis(25)).await; + } +} + +#[tokio::test] +async fn config_jobs_with_nats_kv_are_env_gated() { + if !enabled() { + eprintln!( + "skipping: set CONTROL_TEST_NATS=1 and CONTROL_TEST_NATS_URL=nats://... 
to enable nats config tests" + ); + return; + } + + let nats_url = std::env::var("CONTROL_TEST_NATS_URL").unwrap(); + unsafe { + std::env::set_var("CONTROL_CONFIG_NATS_URL", &nats_url); + } + + let bucket = format!("cloudlysis-test-config-{}", Uuid::new_v4()); + let routing_key = format!("routing/{}", Uuid::new_v4()); + let placement_key = format!("placement/{}", Uuid::new_v4()); + + let routing_src = NatsKvSource::connect(nats_url.clone(), bucket.clone(), routing_key) + .await + .expect("connect routing kv"); + let placement_src = NatsKvSource::connect(nats_url.clone(), bucket.clone(), placement_key) + .await + .expect("connect placement kv"); + + let config = ConfigRegistry::new( + Some(std::sync::Arc::new(routing_src)), + Some(std::sync::Arc::new(placement_src)), + ); + + let secret = b"test_secret".to_vec(); + let token = make_token(&secret, &["control:write", "control:read"]); + + let handle = HANDLE + .get_or_init(|| { + PrometheusBuilder::new() + .install_recorder() + .expect("failed to install prometheus recorder") + }) + .clone(); + + let app = api::build_app(AppState { + prometheus: handle, + auth: AuthConfig { + hs256_secret: Some(secret), + }, + jobs: JobStore::default(), + audit: AuditStore::default(), + tenant_locks: TenantLocks::default(), + config_locks: ConfigLocks::default(), + http: reqwest::Client::new(), + placement: PlacementStore::new(repo_root().join("config/placement/dev.json")), + billing: api::billing::BillingStore::new(std::env::temp_dir().join("billing-test.json")), + billing_provider: std::sync::Arc::new(api::billing::MockProvider), + billing_enforcement_enabled: false, + config, + fleet_services: vec![], + swarm: SwarmStore::new(repo_root().join("swarm/dev.json")), + docs: None, + }); + + let routing_value = serde_json::json!({ + "revision": 1, + "aggregate_placement": { "t1": "local" }, + "projection_placement": { "t1": "local" }, + "runner_placement": { "t1": "local" }, + "aggregate_shards": { "local": ["http://aggregate:50051"] }, + "projection_shards": { "local": ["http://projection:8080"] }, + "runner_shards": { "local": ["http://runner:8080"] } + }); + + let apply = app + .clone() + .oneshot( + Request::builder() + .uri("/admin/v1/jobs/config/apply") + .method("POST") + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("idempotency-key", format!("k-{}", Uuid::new_v4())) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from( + serde_json::json!({ + "domain": "routing", + "expected_revision": null, + "reason": "test apply", + "value": routing_value + }) + .to_string(), + )) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(apply.status(), StatusCode::OK); + let body = axum::body::to_bytes(apply.into_body(), 1024 * 1024) + .await + .unwrap(); + let v: serde_json::Value = serde_json::from_slice(&body).unwrap(); + let job_id = Uuid::parse_str(v.get("job_id").unwrap().as_str().unwrap()).unwrap(); + + let job = wait_done(app.clone(), job_id, &token).await; + assert_eq!( + job.get("status").and_then(|v| v.as_str()), + Some("succeeded") + ); + + let get = app + .clone() + .oneshot( + Request::builder() + .uri("/admin/v1/config/routing") + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(get.status(), StatusCode::OK); + let body = axum::body::to_bytes(get.into_body(), 1024 * 1024) + .await + .unwrap(); + let got: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert_eq!(got.get("domain").unwrap().as_str().unwrap(), "routing"); + 
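+    // Sketch of the assumed response shape for GET /admin/v1/config/routing (field names
+    // mirror the UI client's ConfigGetResponse type; the exact `source` and `value` payloads
+    // are not pinned down by this test):
+    //   { "domain": "routing", "revision": 1, "source": { ... }, "value": { ... } }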
assert!(got.get("revision").unwrap().as_u64().unwrap_or(0) > 0); + + let rollback = app + .clone() + .oneshot( + Request::builder() + .uri("/admin/v1/jobs/config/rollback") + .method("POST") + .header(header::AUTHORIZATION, format!("Bearer {token}")) + .header("idempotency-key", format!("k-{}", Uuid::new_v4())) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from( + serde_json::json!({ + "domain": "routing", + "reason": "test rollback" + }) + .to_string(), + )) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(rollback.status(), StatusCode::OK); + let body = axum::body::to_bytes(rollback.into_body(), 1024 * 1024) + .await + .unwrap(); + let v: serde_json::Value = serde_json::from_slice(&body).unwrap(); + let rb_id = Uuid::parse_str(v.get("job_id").unwrap().as_str().unwrap()).unwrap(); + + let rb_job = wait_done(app.clone(), rb_id, &token).await; + assert_eq!( + rb_job.get("status").and_then(|v| v.as_str()), + Some("succeeded") + ); +} diff --git a/control/api/tests/control_api_smoke_env_gated.rs b/control/api/tests/control_api_smoke_env_gated.rs new file mode 100644 index 0000000..a099f37 --- /dev/null +++ b/control/api/tests/control_api_smoke_env_gated.rs @@ -0,0 +1,157 @@ +use jsonwebtoken::{EncodingKey, Header, encode}; +use reqwest::StatusCode; +use serde::Serialize; +use serde_json::json; +use std::time::Duration; +use uuid::Uuid; + +#[derive(Serialize)] +struct TestClaims { + sub: String, + session_id: String, + permissions: Vec, + exp: usize, +} + +fn make_token(secret: &[u8], perms: &[&str]) -> String { + let exp = (std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() + + 300) as usize; + encode( + &Header::default(), + &TestClaims { + sub: "smoke".to_string(), + session_id: "smoke".to_string(), + permissions: perms.iter().map(|p| (*p).to_string()).collect(), + exp, + }, + &EncodingKey::from_secret(secret), + ) + .unwrap() +} + +#[tokio::test] +async fn control_api_docs_smoke_is_env_gated() { + let enabled = std::env::var("CONTROL_TEST_SMOKE").ok(); + if enabled.as_deref() != Some("1") { + eprintln!("skipping: set CONTROL_TEST_SMOKE=1 to enable env smoke tests"); + return; + } + + let base_url = + std::env::var("CONTROL_TEST_BASE_URL").expect("CONTROL_TEST_BASE_URL is required"); + let base_url = base_url.trim_end_matches('/').to_string(); + + // Either provide a token directly, or provide secret+perms to mint one. + let token = if let Ok(t) = std::env::var("CONTROL_TEST_TOKEN") { + t + } else { + let secret = std::env::var("CONTROL_TEST_JWT_SECRET") + .expect("CONTROL_TEST_TOKEN or CONTROL_TEST_JWT_SECRET is required"); + make_token(secret.as_bytes(), &["control:read", "control:write"]) + }; + + let tenant_id = std::env::var("CONTROL_TEST_TENANT_ID") + .ok() + .unwrap_or_else(|| Uuid::new_v4().to_string()); + + let http = reqwest::Client::builder() + .timeout(Duration::from_secs(15)) + .build() + .unwrap(); + + // Health. + let health = http + .get(format!("{base_url}/health")) + .send() + .await + .expect("health request failed"); + assert!(health.status().is_success(), "health not ok"); + + // Presign upload. 
+ let doc_id = Uuid::new_v4().to_string(); + let filename = "smoke.txt"; + let presign_up = http + .post(format!( + "{base_url}/admin/v1/tenants/{tenant_id}/docs/presign/upload" + )) + .header("authorization", format!("Bearer {token}")) + .header("x-tenant-id", &tenant_id) + .json(&json!({ + "doc_type": "deployments", + "doc_id": doc_id, + "filename": filename, + "content_type": "text/plain", + })) + .send() + .await + .expect("presign upload failed"); + assert!( + presign_up.status().is_success(), + "presign upload not ok: {}", + presign_up.status() + ); + let up_json: serde_json::Value = presign_up.json().await.unwrap(); + let put_url = up_json.get("url").and_then(|v| v.as_str()).unwrap(); + let key = up_json + .get("key") + .and_then(|v| v.as_str()) + .unwrap() + .to_string(); + + // PUT bytes to S3 directly. + let payload = b"hello-smoke".to_vec(); + let put = http + .put(put_url) + .header("content-type", "text/plain") + .body(payload.clone()) + .send() + .await + .expect("s3 put failed"); + assert!(put.status().is_success(), "s3 put not ok: {}", put.status()); + + // List should include key. + let list = http + .get(format!( + "{base_url}/admin/v1/tenants/{tenant_id}/docs?prefix=deployments/" + )) + .header("authorization", format!("Bearer {token}")) + .header("x-tenant-id", &tenant_id) + .send() + .await + .expect("list failed"); + assert!(list.status().is_success(), "list not ok"); + let list_json: serde_json::Value = list.json().await.unwrap(); + let objects = list_json.get("objects").and_then(|v| v.as_array()).unwrap(); + assert!( + objects + .iter() + .any(|o| o.get("key").and_then(|k| k.as_str()) == Some(key.as_str())), + "expected list to include presigned upload key" + ); + + // Presign download and fetch bytes. + let presign_down = http + .post(format!( + "{base_url}/admin/v1/tenants/{tenant_id}/docs/presign/download" + )) + .header("authorization", format!("Bearer {token}")) + .header("x-tenant-id", &tenant_id) + .json(&json!({ "key": key })) + .send() + .await + .expect("presign download failed"); + assert!( + presign_down.status().is_success(), + "presign download not ok" + ); + let down_json: serde_json::Value = presign_down.json().await.unwrap(); + let get_url = down_json.get("url").and_then(|v| v.as_str()).unwrap(); + + let got = http.get(get_url).send().await.expect("s3 get failed"); + assert_eq!(got.status(), StatusCode::OK); + let got_bytes = got.bytes().await.unwrap().to_vec(); + assert_eq!(got_bytes, payload); +} diff --git a/control/api/tests/docker_config_validation.rs b/control/api/tests/docker_config_validation.rs index 5ace774..3972835 100644 --- a/control/api/tests/docker_config_validation.rs +++ b/control/api/tests/docker_config_validation.rs @@ -11,7 +11,7 @@ fn repo_root() -> PathBuf { #[test] fn docker_compose_files_parse_and_include_required_services() { let root = repo_root(); - let compose = fs::read_to_string(root.join("observability/docker-compose.yml")).unwrap(); + let compose = fs::read_to_string(root.join("docker-compose.yml")).unwrap(); let v: serde_yaml::Value = serde_yaml::from_str(&compose).unwrap(); let services = v @@ -19,7 +19,15 @@ fn docker_compose_files_parse_and_include_required_services() { .and_then(|x| x.as_mapping()) .expect("missing services"); - for required in ["grafana", "victoria-metrics", "vmagent", "loki", "tempo"] { + // Core + optional observability services are all declared in one compose file. 
+ for required in [ + "grafana", + "victoria-metrics", + "vmagent", + "loki", + "tempo", + "mailhog", + ] { assert!( services.contains_key(serde_yaml::Value::String(required.to_string())), "missing service {required}" @@ -28,17 +36,19 @@ fn docker_compose_files_parse_and_include_required_services() { } #[tokio::test] -#[ignore] async fn docker_compose_config_validation_is_gated_and_fast() { let enabled = std::env::var("CONTROL_TEST_DOCKER").ok(); - assert_eq!(enabled.as_deref(), Some("1")); + if enabled.as_deref() != Some("1") { + eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker compose validation"); + return; + } let root = repo_root(); - let compose = root.join("observability/docker-compose.yml"); + let compose = root.join("docker-compose.yml"); let cmd = tokio::process::Command::new("docker") .args(["compose", "-f"]) - .arg(compose) + .arg(&compose) .args(["config"]) .output(); @@ -52,4 +62,22 @@ async fn docker_compose_config_validation_is_gated_and_fast() { "docker compose config failed: {}", String::from_utf8_lossy(&out.stderr) ); + + // Validate full-stack profile wiring too. + let cmd = tokio::process::Command::new("docker") + .args(["compose", "-f"]) + .arg(&compose) + .args(["--profile", "observability", "config"]) + .output(); + + let out = tokio::time::timeout(Duration::from_secs(10), cmd) + .await + .expect("docker compose config (observability profile) timed out") + .expect("failed to run docker compose config (observability profile)"); + + assert!( + out.status.success(), + "docker compose config (observability profile) failed: {}", + String::from_utf8_lossy(&out.stderr) + ); } diff --git a/control/api/tests/docker_gated.rs b/control/api/tests/docker_gated.rs index 4a29179..c2bd33c 100644 --- a/control/api/tests/docker_gated.rs +++ b/control/api/tests/docker_gated.rs @@ -1,6 +1,9 @@ #[test] -#[ignore] fn docker_integration_tests_are_gated() { let enabled = std::env::var("CONTROL_TEST_DOCKER").ok(); + if enabled.as_deref() != Some("1") { + eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker integration tests"); + return; + } assert_eq!(enabled.as_deref(), Some("1")); } diff --git a/control/api/tests/docs_e2e_docker_gated.rs b/control/api/tests/docs_e2e_docker_gated.rs new file mode 100644 index 0000000..1827062 --- /dev/null +++ b/control/api/tests/docs_e2e_docker_gated.rs @@ -0,0 +1,169 @@ +use jsonwebtoken::{EncodingKey, Header, encode}; +use reqwest::header::{HeaderMap, HeaderValue}; +use serde::Serialize; +use std::{path::PathBuf, process::Command, time::Duration}; +use uuid::Uuid; + +fn repo_root() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .and_then(|p| p.parent()) + .expect("api crate should live under repo root") + .to_path_buf() +} + +fn docker_enabled() -> bool { + std::env::var("CONTROL_TEST_DOCKER") + .ok() + .is_some_and(|v| v.trim() == "1") +} + +fn compose_file() -> PathBuf { + repo_root().join("docker-compose.yml") +} + +#[derive(Serialize)] +struct TestClaims { + sub: String, + session_id: String, + permissions: Vec, + exp: usize, +} + +fn make_token(secret: &[u8], perms: &[&str]) -> String { + let exp = (std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() + + 300) as usize; + encode( + &Header::default(), + &TestClaims { + sub: "user_1".to_string(), + session_id: "sess_1".to_string(), + permissions: perms.iter().map(|p| (*p).to_string()).collect(), + exp, + }, + &EncodingKey::from_secret(secret), + ) + .unwrap() +} + +#[tokio::test] +async fn 
documents_upload_list_download_roundtrip_via_control_api_compose() { + if !docker_enabled() { + eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker compose tests"); + return; + } + + // Must match docker-compose.yml CONTROL_GATEWAY_JWT_HS256_SECRET. + let jwt_secret = b"dev_secret"; + let token = make_token(jwt_secret, &["control:read", "control:write"]); + + let compose = compose_file(); + + let up = Command::new("docker") + .args(["compose", "-f"]) + .arg(&compose) + .args(["up", "-d", "control-api"]) + .status() + .expect("failed to run docker compose up control-api"); + assert!(up.success(), "docker compose up control-api failed"); + + // Wait for control-api to be reachable (port publish is in compose). + let http = reqwest::Client::builder() + .timeout(Duration::from_secs(10)) + .build() + .unwrap(); + + let base = "http://127.0.0.1:38080"; + let health_deadline = tokio::time::Instant::now() + Duration::from_secs(30); + loop { + if tokio::time::Instant::now() > health_deadline { + panic!("control-api did not become healthy in time"); + } + match http.get(format!("{base}/health")).send().await { + Ok(res) if res.status().is_success() => break, + _ => tokio::time::sleep(Duration::from_millis(250)).await, + } + } + + let tenant_id = Uuid::new_v4().to_string(); + let doc_type = "deployments"; + let doc_id = Uuid::new_v4().to_string(); + let filename = "hello.txt"; + let bytes = b"hello-docs".to_vec(); + + let mut headers = HeaderMap::new(); + headers.insert( + "authorization", + HeaderValue::from_str(&format!("Bearer {token}")).unwrap(), + ); + headers.insert("x-tenant-id", HeaderValue::from_str(&tenant_id).unwrap()); + + // Upload (proxy endpoint). + let put_url = + format!("{base}/admin/v1/tenants/{tenant_id}/docs/{doc_type}/{doc_id}/{filename}"); + let put = http + .put(&put_url) + .headers(headers.clone()) + .header("content-type", "text/plain") + .body(bytes.clone()) + .send() + .await + .expect("upload request failed"); + assert!( + put.status().is_success(), + "upload failed: {}", + put.text().await.unwrap_or_default() + ); + let put_json: serde_json::Value = put.json().await.expect("invalid upload json"); + let key = put_json + .get("key") + .and_then(|v| v.as_str()) + .expect("missing key") + .to_string(); + + // List should include the key. + let list_url = format!("{base}/admin/v1/tenants/{tenant_id}/docs?prefix={doc_type}/"); + let list = http + .get(&list_url) + .headers(headers.clone()) + .send() + .await + .expect("list request failed"); + assert!(list.status().is_success(), "list failed"); + let list_json: serde_json::Value = list.json().await.expect("invalid list json"); + let objects = list_json + .get("objects") + .and_then(|v| v.as_array()) + .expect("missing objects"); + assert!( + objects + .iter() + .any(|o| o.get("key").and_then(|k| k.as_str()) == Some(key.as_str())), + "expected list to include uploaded key" + ); + + // Download (proxy endpoint) returns same bytes. + let get_url = format!( + "{base}/admin/v1/tenants/{tenant_id}/docs/object/{}", + urlencoding::encode(&key) + ); + let got = http + .get(&get_url) + .headers(headers.clone()) + .send() + .await + .expect("download request failed"); + assert!(got.status().is_success(), "download failed"); + let got_bytes = got.bytes().await.expect("download bytes failed").to_vec(); + assert_eq!(got_bytes, bytes); + + // Best-effort cleanup. 
+    let _ = Command::new("docker")
+        .args(["compose", "-f"])
+        .arg(&compose)
+        .args(["down", "-v"])
+        .status();
+}
diff --git a/control/api/tests/drift_classification.rs b/control/api/tests/drift_classification.rs
new file mode 100644
index 0000000..cf19354
--- /dev/null
+++ b/control/api/tests/drift_classification.rs
@@ -0,0 +1,123 @@
+use api::{
+    AppState, AuditStore, AuthConfig, ConfigLocks, ConfigRegistry, JobStore, PlacementStore,
+    SwarmStore, TenantLocks,
+};
+use axum::{
+    Router,
+    body::Body,
+    http::{Request, StatusCode, header},
+};
+use jsonwebtoken::{EncodingKey, Header, encode};
+use metrics_exporter_prometheus::{PrometheusBuilder, PrometheusHandle};
+use serde::Serialize;
+use std::{fs, path::PathBuf, sync::OnceLock};
+use tower::ServiceExt;
+
+static HANDLE: OnceLock<PrometheusHandle> = OnceLock::new();
+
+fn repo_root() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .and_then(|p| p.parent())
+        .expect("api crate should live under repo root")
+        .to_path_buf()
+}
+
+#[derive(Serialize)]
+struct TestClaims {
+    sub: String,
+    session_id: String,
+    permissions: Vec<String>,
+    exp: usize,
+}
+
+fn make_token(perms: &[&str]) -> String {
+    let exp = (std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap()
+        .as_secs()
+        + 60) as usize;
+    encode(
+        &Header::default(),
+        &TestClaims {
+            sub: "user_1".to_string(),
+            session_id: "sess_1".to_string(),
+            permissions: perms.iter().map(|p| (*p).to_string()).collect(),
+            exp,
+        },
+        &EncodingKey::from_secret(b"test_secret"),
+    )
+    .unwrap()
+}
+
+fn temp_swarm_file(raw: &str) -> PathBuf {
+    let mut dst = std::env::temp_dir();
+    dst.push(format!(
+        "cloudlysis-control-swarm-{}-{}.json",
+        std::process::id(),
+        uuid::Uuid::new_v4()
+    ));
+    fs::write(&dst, raw).expect("failed to write temp swarm file");
+    dst
+}
+
+fn test_app_with_swarm(swarm_path: PathBuf) -> Router {
+    let handle = HANDLE
+        .get_or_init(|| {
+            PrometheusBuilder::new()
+                .install_recorder()
+                .expect("failed to install prometheus recorder")
+        })
+        .clone();
+    api::build_app(AppState {
+        prometheus: handle,
+        auth: AuthConfig {
+            hs256_secret: Some(b"test_secret".to_vec()),
+        },
+        jobs: JobStore::default(),
+        audit: AuditStore::default(),
+        tenant_locks: TenantLocks::default(),
+        config_locks: ConfigLocks::default(),
+        http: reqwest::Client::new(),
+        placement: PlacementStore::new(repo_root().join("config/placement/dev.json")),
+        billing: api::billing::BillingStore::new(
+            std::env::temp_dir().join("billing-drift-test.json"),
+        ),
+        billing_provider: std::sync::Arc::new(api::billing::MockProvider),
+        billing_enforcement_enabled: false,
+        config: ConfigRegistry::new(None, None),
+        fleet_services: vec![],
+        swarm: SwarmStore::new(swarm_path),
+        docs: None,
+    })
+}
+
+#[tokio::test]
+async fn drift_marks_extra_services_vs_desired_observation_set() {
+    let swarm = temp_swarm_file(
+        r#"{ "services": [{"name":"extra-1","image":null,"mode":null,"replicas":null,"updated_at":null}], "tasks": [] }"#,
+    );
+    let app = test_app_with_swarm(swarm);
+    let token = make_token(&["control:read"]);
+
+    let res = app
+        .oneshot(
+            Request::builder()
+                .uri("/admin/v1/platform/drift")
+                .header(header::AUTHORIZATION, format!("Bearer {token}"))
+                .body(Body::empty())
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+    assert_eq!(res.status(), StatusCode::OK);
+    let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
+        .await
+        .unwrap();
+    let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
+    let items = v.get("items").and_then(|x| x.as_array()).unwrap();
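+    // Each drift item is assumed to carry `kind`, `service`, and `details` fields (mirroring
+    // the UI client's DriftResponse type); only `kind` and `service` are asserted below,
+    // e.g. { "kind": "extra", "service": "extra-1", "details": ... }.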
assert!(items.iter().any(|i| { + i.get("kind").and_then(|k| k.as_str()) == Some("extra") + && i.get("service").and_then(|s| s.as_str()) == Some("extra-1") + })); +} diff --git a/control/api/tests/drift_docker_gated.rs b/control/api/tests/drift_docker_gated.rs new file mode 100644 index 0000000..06dc1d2 --- /dev/null +++ b/control/api/tests/drift_docker_gated.rs @@ -0,0 +1,137 @@ +#[tokio::test] +async fn platform_drift_docker_test_is_gated() { + use tower::ServiceExt; + + let enabled = std::env::var("CONTROL_TEST_DOCKER").ok(); + if enabled.as_deref() != Some("1") { + eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker drift tests"); + return; + } + + // We only run the "real" drift check when Swarm is available locally. + // If Swarm isn't active, we skip to keep CI/dev machines happy. + let info = std::process::Command::new("docker") + .args(["info", "--format", "{{.Swarm.LocalNodeState}}"]) + .output(); + let Ok(info) = info else { + eprintln!("skipping: docker not available"); + return; + }; + if !info.status.success() { + eprintln!("skipping: docker info failed"); + return; + } + let state = String::from_utf8_lossy(&info.stdout).trim().to_string(); + if state != "active" { + eprintln!("skipping: docker swarm not active (LocalNodeState={state})"); + return; + } + + // Create a short-lived service so drift can see an "extra" observed service. + let name = format!("cloudlysis-drift-extra-{}", uuid::Uuid::new_v4()); + let create = std::process::Command::new("docker") + .args([ + "service", + "create", + "--name", + &name, + "--restart-condition", + "none", + "busybox:1.36", + "sh", + "-c", + "sleep 60", + ]) + .output() + .expect("docker service create"); + if !create.status.success() { + eprintln!("skipping: failed to create swarm service (maybe permissions?)"); + return; + } + + // Ensure cleanup even if assertion fails. + struct Cleanup(String); + impl Drop for Cleanup { + fn drop(&mut self) { + let _ = std::process::Command::new("docker") + .args(["service", "rm", &self.0]) + .output(); + } + } + let _cleanup = Cleanup(name.clone()); + + // Now call drift via a minimal in-process app configured for docker-cli swarm observation. + let handle = metrics_exporter_prometheus::PrometheusBuilder::new() + .install_recorder() + .expect("failed to install prometheus recorder"); + + let app = api::build_app(api::AppState { + prometheus: handle, + auth: api::AuthConfig { + hs256_secret: Some(b"test_secret".to_vec()), + }, + jobs: api::JobStore::default(), + audit: api::AuditStore::default(), + tenant_locks: api::TenantLocks::default(), + config_locks: api::ConfigLocks::default(), + http: reqwest::Client::new(), + placement: api::PlacementStore::new( + std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .and_then(|p| p.parent()) + .unwrap() + .join("config/placement/dev.json"), + ), + billing: api::billing::BillingStore::new( + std::env::temp_dir().join("billing-drift-test.json"), + ), + billing_provider: std::sync::Arc::new(api::billing::MockProvider), + billing_enforcement_enabled: false, + config: api::ConfigRegistry::new(None, None), + fleet_services: vec![], + swarm: api::SwarmStore::new_docker_cli(), + docs: None, + }); + + // Auth token (control:read). 
+ let exp = (std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() + + 60) as usize; + let token = jsonwebtoken::encode( + &jsonwebtoken::Header::default(), + &serde_json::json!({ + "sub": "user_1", + "session_id": "sess_1", + "permissions": ["control:read"], + "exp": exp + }), + &jsonwebtoken::EncodingKey::from_secret(b"test_secret"), + ) + .unwrap(); + + let res = app + .oneshot( + axum::http::Request::builder() + .uri("/admin/v1/platform/drift") + .header(axum::http::header::AUTHORIZATION, format!("Bearer {token}")) + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(res.status(), axum::http::StatusCode::OK); + let body = axum::body::to_bytes(res.into_body(), 1024 * 1024) + .await + .unwrap(); + let v: serde_json::Value = serde_json::from_slice(&body).unwrap(); + let items = v.get("items").and_then(|x| x.as_array()).unwrap(); + assert!( + items.iter().any(|i| { + i.get("kind").and_then(|k| k.as_str()) == Some("extra") + && i.get("service").and_then(|s| s.as_str()) == Some(name.as_str()) + }), + "expected drift to include extra service {name}, got: {v}" + ); +} diff --git a/control/api/tests/minio_compose_gated.rs b/control/api/tests/minio_compose_gated.rs new file mode 100644 index 0000000..ffc1031 --- /dev/null +++ b/control/api/tests/minio_compose_gated.rs @@ -0,0 +1,77 @@ +use std::{path::PathBuf, process::Command, time::Duration}; + +fn repo_root() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .and_then(|p| p.parent()) + .expect("api crate should live under repo root") + .to_path_buf() +} + +fn docker_enabled() -> bool { + std::env::var("CONTROL_TEST_DOCKER") + .ok() + .is_some_and(|v| v.trim() == "1") +} + +fn compose_file() -> PathBuf { + repo_root().join("docker-compose.yml") +} + +#[test] +fn minio_docs_bucket_exists_and_credentials_work_in_compose_network() { + if !docker_enabled() { + eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker compose tests"); + return; + } + + let compose = compose_file(); + + let up = Command::new("docker") + .args(["compose", "-f"]) + .arg(&compose) + .args(["up", "-d", "minio"]) + .status() + .expect("failed to run docker compose up minio"); + assert!(up.success(), "docker compose up minio failed"); + + // The `minio-init` service runs `mc` inside the compose network. + let out = Command::new("docker") + .args(["compose", "-f"]) + .arg(&compose) + .args([ + "run", + "--rm", + "minio-init", + "/bin/sh", + "-lc", + "mc alias set local http://minio:9000 minioadmin minioadmin && mc ls local/cloudlysis-docs-0 && mc ls local/cloudlysis-docs-1 && mc ls local/cloudlysis-docs-2", + ]) + .output() + .expect("failed to run docker compose run minio-init"); + + // Best-effort cleanup (keep it short; other docker tests may reuse this env). + let _ = Command::new("docker") + .args(["compose", "-f"]) + .arg(&compose) + .args(["down", "-v"]) + .status(); + + assert!( + out.status.success(), + "minio-init bucket check failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + + // `mc ls` prints at least one line when the bucket exists (even if empty it prints the bucket line). + let stdout = String::from_utf8_lossy(&out.stdout); + assert!( + stdout.contains("cloudlysis-docs-0") + && stdout.contains("cloudlysis-docs-1") + && stdout.contains("cloudlysis-docs-2"), + "expected mc ls output to mention bucket: {stdout}" + ); + + // Avoid tests hanging due to docker flakiness. 
+ std::thread::sleep(Duration::from_millis(10)); +} diff --git a/control/api/tests/observability_configs.rs b/control/api/tests/observability_configs.rs index b1a15d1..0ab88f4 100644 --- a/control/api/tests/observability_configs.rs +++ b/control/api/tests/observability_configs.rs @@ -8,6 +8,20 @@ fn repo_root() -> PathBuf { .to_path_buf() } +#[test] +fn loki_and_tempo_s3_config_variants_are_syntactically_valid() { + let root = repo_root(); + + for file in [ + root.join("observability/loki/config.s3.yml"), + root.join("observability/tempo/config.s3.yml"), + ] { + let raw = fs::read_to_string(&file).unwrap_or_else(|e| panic!("{file:?}: {e}")); + let _: serde_yaml::Value = + serde_yaml::from_str(&raw).unwrap_or_else(|e| panic!("{file:?}: {e}")); + } +} + #[test] fn grafana_provisioning_files_are_syntactically_valid() { let root = repo_root(); diff --git a/control/api/tests/observability_s3_docker_gated.rs b/control/api/tests/observability_s3_docker_gated.rs new file mode 100644 index 0000000..1142fc2 --- /dev/null +++ b/control/api/tests/observability_s3_docker_gated.rs @@ -0,0 +1,218 @@ +use reqwest::StatusCode; +use serde_json::json; +use std::{ + net::TcpStream, + path::PathBuf, + process::Command, + time::{Duration, Instant}, +}; + +fn repo_root() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .and_then(|p| p.parent()) + .expect("api crate should live under repo root") + .to_path_buf() +} + +fn docker_enabled() -> bool { + std::env::var("CONTROL_TEST_DOCKER") + .ok() + .is_some_and(|v| v.trim() == "1") +} + +fn wait_for_tcp(addr: &str, timeout: Duration) -> bool { + let start = Instant::now(); + while start.elapsed() < timeout { + if TcpStream::connect_timeout( + &addr.parse().expect("invalid socket addr"), + Duration::from_secs(1), + ) + .is_ok() + { + return true; + } + std::thread::sleep(Duration::from_millis(250)); + } + false +} + +fn mc_ls_bucket(compose: &PathBuf, bucket: &str) -> std::process::Output { + // Run inside compose network so it can reach `minio:9000`. + Command::new("docker") + .args(["compose", "-f"]) + .arg(compose) + .args([ + "run", + "--rm", + "minio-init", + "/bin/sh", + "-lc", + &format!( + "mc alias set local http://minio:9000 minioadmin minioadmin >/dev/null && mc ls --recursive local/{bucket}" + ), + ]) + .output() + .expect("failed to run mc ls") +} + +#[tokio::test] +async fn loki_and_tempo_write_objects_to_minio_in_s3_mode() { + if !docker_enabled() { + eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker tests"); + return; + } + + let root = repo_root(); + let base = root.join("docker-compose.yml"); + let obs = root.join("observability/docker-compose.yml"); + let obs_s3 = root.join("observability/docker-compose.s3.yml"); + + let up = Command::new("docker") + .args(["compose", "-f"]) + .arg(&base) + .args(["-f"]) + .arg(&obs) + .args(["-f"]) + .arg(&obs_s3) + .args(["up", "-d"]) + .status() + .expect("failed to run docker compose up"); + assert!(up.success(), "docker compose up failed"); + + let reachable = wait_for_tcp("127.0.0.1:3100", Duration::from_secs(45)) + && wait_for_tcp("127.0.0.1:3200", Duration::from_secs(45)) + && wait_for_tcp("127.0.0.1:9411", Duration::from_secs(45)) + && wait_for_tcp("127.0.0.1:9000", Duration::from_secs(45)); + assert!(reachable, "loki/tempo/minio ports not reachable in time"); + + let http = reqwest::Client::builder() + .timeout(Duration::from_secs(10)) + .build() + .unwrap(); + + // Push one log line into Loki. 
+ let ts_ns = (std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos()) + .to_string(); + + let push = http + .post("http://127.0.0.1:3100/loki/api/v1/push") + .json(&json!({ + "streams": [{ + "stream": { "app": "cloudlysis-test" }, + "values": [[ts_ns, "hello from test"]] + }] + })) + .send() + .await + .expect("loki push request failed"); + assert!( + push.status() == StatusCode::NO_CONTENT, + "unexpected loki push status: {}", + push.status() + ); + + // Emit one trace span via Zipkin v2. + let zipkin = http + .post("http://127.0.0.1:9411/api/v2/spans") + .json(&json!([{ + "traceId": "463ac35c9f6413ad48485a3953bb6124", + "id": "a2fb4a1d1a96d312", + "name": "test-span", + "timestamp": 1700000000000000u64, + "duration": 1000u64, + "localEndpoint": { "serviceName": "cloudlysis-test" } + }])) + .send() + .await + .expect("zipkin post failed"); + assert!( + zipkin.status().is_success(), + "zipkin ingest failed: {}", + zipkin.status() + ); + + // Query Loki back to ensure the line is retrievable (not just accepted). + // Loki may need a short delay to index. + let loki_deadline = Instant::now() + Duration::from_secs(30); + let mut loki_ok = false; + while Instant::now() < loki_deadline && !loki_ok { + let q = http + .get("http://127.0.0.1:3100/loki/api/v1/query") + .query(&[("query", r#"{app="cloudlysis-test"}"#)]) + .send() + .await + .expect("loki query failed"); + if q.status().is_success() { + let v: serde_json::Value = q.json().await.expect("invalid loki query json"); + // We only need to see any non-empty result. + let has = v + .get("data") + .and_then(|d| d.get("result")) + .and_then(|r| r.as_array()) + .is_some_and(|a| !a.is_empty()); + if has { + loki_ok = true; + break; + } + } + tokio::time::sleep(Duration::from_millis(500)).await; + } + + // Query Tempo back by trace id (Zipkin traceId used above). + let tempo_deadline = Instant::now() + Duration::from_secs(30); + let mut tempo_ok = false; + while Instant::now() < tempo_deadline && !tempo_ok { + let res = http + .get("http://127.0.0.1:3200/api/traces/463ac35c9f6413ad48485a3953bb6124") + .send() + .await + .expect("tempo get trace failed"); + if res.status().is_success() { + tempo_ok = true; + break; + } + tokio::time::sleep(Duration::from_millis(500)).await; + } + + // Poll buckets until at least one object appears. 
+ let deadline = Instant::now() + Duration::from_secs(45); + let mut loki_has_objects = false; + let mut tempo_has_objects = false; + while Instant::now() < deadline && (!loki_has_objects || !tempo_has_objects) { + let loki_out = mc_ls_bucket(&base, "cloudlysis-loki"); + if loki_out.status.success() && !loki_out.stdout.is_empty() { + loki_has_objects = true; + } + + let tempo_out = mc_ls_bucket(&base, "cloudlysis-tempo"); + if tempo_out.status.success() && !tempo_out.stdout.is_empty() { + tempo_has_objects = true; + } + + if !loki_has_objects || !tempo_has_objects { + tokio::time::sleep(Duration::from_millis(500)).await; + } + } + + let _ = Command::new("docker") + .args(["compose", "-f"]) + .arg(&base) + .args(["-f"]) + .arg(&obs) + .args(["-f"]) + .arg(&obs_s3) + .args(["down", "-v"]) + .status(); + + assert!(loki_has_objects, "expected Loki to write objects to MinIO"); + assert!( + tempo_has_objects, + "expected Tempo to write objects to MinIO" + ); + assert!(loki_ok, "expected Loki query to return a result"); + assert!(tempo_ok, "expected Tempo to return the ingested trace"); +} diff --git a/control/api/tests/observability_smoke_docker.rs b/control/api/tests/observability_smoke_docker.rs index a69dd34..e871abb 100644 --- a/control/api/tests/observability_smoke_docker.rs +++ b/control/api/tests/observability_smoke_docker.rs @@ -30,10 +30,12 @@ fn wait_for_tcp(addr: &str, timeout: Duration) -> bool { } #[test] -#[ignore] fn observability_stack_reaches_healthy_state_fast() { let enabled = std::env::var("CONTROL_TEST_DOCKER").ok(); - assert_eq!(enabled.as_deref(), Some("1")); + if enabled.as_deref() != Some("1") { + eprintln!("skipping: set CONTROL_TEST_DOCKER=1 to enable docker observability smoke test"); + return; + } let root = repo_root(); let compose = root.join("observability/docker-compose.yml"); diff --git a/control/api/tests/s3_docs_gated.rs b/control/api/tests/s3_docs_gated.rs new file mode 100644 index 0000000..8d2be65 --- /dev/null +++ b/control/api/tests/s3_docs_gated.rs @@ -0,0 +1,116 @@ +use api::s3_docs::{DocsConfig, DocsStore}; +use uuid::Uuid; + +fn s3_env_ready() -> bool { + // Gate integration tests without requiring `-- --ignored`. + // If CI/local wants these tests to run, it must provide S3 env vars. 
+ let required = [ + "CONTROL_S3_ENDPOINT", + "CONTROL_S3_ACCESS_KEY_ID", + "CONTROL_S3_SECRET_ACCESS_KEY", + "CONTROL_S3_BUCKET_DOCS", + ]; + required + .iter() + .all(|k| std::env::var(k).ok().is_some_and(|v| !v.trim().is_empty())) +} + +#[tokio::test] +async fn s3_docs_roundtrip_put_get_list_delete() { + if !s3_env_ready() { + eprintln!("skipping: missing S3 env (see S3_PLAN.md)"); + return; + } + let cfg = DocsConfig::from_env().expect("missing S3 env (see S3_PLAN.md)"); + let store = DocsStore::new(cfg) + .await + .expect("failed to init docs store"); + + let tenant_id = Uuid::new_v4().to_string(); + let doc_type = "test"; + let doc_id = Uuid::new_v4().to_string(); + let filename = "hello.txt"; + let key = store + .key_for(&tenant_id, doc_type, &doc_id, filename) + .expect("invalid key"); + + store + .put_for_tenant( + &tenant_id, + &key, + b"hello".to_vec(), + Some("text/plain".to_string()), + ) + .await + .expect("put failed"); + + let (bytes, _ct) = store + .get_bytes_for_tenant(&tenant_id, &key) + .await + .expect("get failed"); + assert_eq!(bytes, b"hello"); + + let prefix = format!("{}{}", store.prefix(), tenant_id); + let objects = store + .list_for_tenant(&tenant_id, &format!("{prefix}/")) + .await + .expect("list failed"); + assert!(objects.iter().any(|o| o.key == key)); + + store + .delete_for_tenant(&tenant_id, &key) + .await + .expect("delete failed"); +} + +#[tokio::test] +async fn s3_docs_tenant_prefix_isolation() { + if !s3_env_ready() { + eprintln!("skipping: missing S3 env (see S3_PLAN.md)"); + return; + } + let cfg = DocsConfig::from_env().expect("missing S3 env (see S3_PLAN.md)"); + let store = DocsStore::new(cfg) + .await + .expect("failed to init docs store"); + + let tenant_a = Uuid::new_v4().to_string(); + let tenant_b = Uuid::new_v4().to_string(); + + let doc_type = "test"; + let doc_id = Uuid::new_v4().to_string(); + let filename = "hello.txt"; + + let key_a = store + .key_for(&tenant_a, doc_type, &doc_id, filename) + .expect("invalid key"); + store + .put_for_tenant( + &tenant_a, + &key_a, + b"hello-a".to_vec(), + Some("text/plain".to_string()), + ) + .await + .expect("put failed"); + + let prefix_a = format!("{}{tenant_a}/", store.prefix()); + let prefix_b = format!("{}{tenant_b}/", store.prefix()); + + let objects_a = store + .list_for_tenant(&tenant_a, &prefix_a) + .await + .expect("list a failed"); + let objects_b = store + .list_for_tenant(&tenant_b, &prefix_b) + .await + .expect("list b failed"); + + assert!(objects_a.iter().any(|o| o.key == key_a)); + assert!(!objects_b.iter().any(|o| o.key == key_a)); + + store + .delete_for_tenant(&tenant_a, &key_a) + .await + .expect("delete failed"); +} diff --git a/control/api/tests/s3_permissions_awscli_env_gated.rs b/control/api/tests/s3_permissions_awscli_env_gated.rs new file mode 100644 index 0000000..9d35619 --- /dev/null +++ b/control/api/tests/s3_permissions_awscli_env_gated.rs @@ -0,0 +1,36 @@ +use std::{path::PathBuf, process::Command}; + +fn repo_root() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .and_then(|p| p.parent()) + .expect("api crate should live under repo root") + .to_path_buf() +} + +fn is_enabled() -> bool { + std::env::var("CONTROL_TEST_AWSCLI") + .ok() + .is_some_and(|v| v.trim() == "1") +} + +#[test] +fn s3_docs_permissions_can_be_verified_with_aws_cli() { + if !is_enabled() { + eprintln!("skipping: set CONTROL_TEST_AWSCLI=1 to enable aws-cli S3 permission checks"); + return; + } + + let script = repo_root().join("docker/scripts/s3_verify_docs.sh"); + let out = 
Command::new("sh") + .arg(script) + .output() + .expect("failed to run s3_verify_docs.sh (requires aws cli and S3_* env)"); + + assert!( + out.status.success(), + "s3 verify script failed: {}\n{}", + String::from_utf8_lossy(&out.stdout), + String::from_utf8_lossy(&out.stderr) + ); +} diff --git a/control/api/tests/swarm_stack_yaml.rs b/control/api/tests/swarm_stack_yaml.rs index 21e1325..a5ba261 100644 --- a/control/api/tests/swarm_stack_yaml.rs +++ b/control/api/tests/swarm_stack_yaml.rs @@ -13,6 +13,7 @@ fn stack_files_parse_as_yaml() { let root = repo_root(); for file in [ root.join("swarm/stacks/control-plane.yml"), + root.join("swarm/stacks/control-plane-prod.yml"), root.join("swarm/stacks/observability.yml"), ] { let raw = fs::read_to_string(&file).unwrap(); @@ -38,3 +39,36 @@ fn control_plane_stack_has_required_services() { ); } } + +#[test] +fn control_plane_prod_stack_has_control_api_and_external_s3_secrets() { + let root = repo_root(); + let raw = fs::read_to_string(root.join("swarm/stacks/control-plane-prod.yml")).unwrap(); + let v: serde_yaml::Value = serde_yaml::from_str(&raw).unwrap(); + + let services = v + .get("services") + .and_then(|x| x.as_mapping()) + .expect("missing services"); + assert!(services.contains_key(serde_yaml::Value::String("control-api".to_string()))); + assert!(services.contains_key(serde_yaml::Value::String("control-ui".to_string()))); + assert!( + !services.contains_key(serde_yaml::Value::String("minio".to_string())), + "prod stack must not bundle MinIO" + ); + + let secrets = v + .get("secrets") + .and_then(|x| x.as_mapping()) + .expect("missing secrets"); + for name in ["control_s3_access_key_id", "control_s3_secret_access_key"] { + let entry = secrets + .get(serde_yaml::Value::String(name.to_string())) + .unwrap_or_else(|| panic!("missing secret {name}")); + let external = entry + .get(serde_yaml::Value::String("external".to_string())) + .and_then(|x| x.as_bool()) + .unwrap_or(false); + assert!(external, "secret {name} must be external: true"); + } +} diff --git a/control/ui/src/api/control.ts b/control/ui/src/api/control.ts index ec9fbe4..80b62c4 100644 --- a/control/ui/src/api/control.ts +++ b/control/ui/src/api/control.ts @@ -26,6 +26,48 @@ async function apiJson(path: string): Promise { } } +async function apiJsonWithHeaders(path: string, extra: HeadersInit): Promise { + const controller = new AbortController() + const t = window.setTimeout(() => controller.abort(), 5000) + + const token = getAccessToken() + const headers: HeadersInit = { ...(token ? 
{ Authorization: `Bearer ${token}` } : {}), ...extra } + + try { + const res = await apiFetch(`${baseUrl()}${path}`, { + headers, + signal: controller.signal, + useLastCorrelationId: true, + useLastTraceparent: true, + }) + return (await res.json()) as T + } finally { + window.clearTimeout(t) + } +} + +async function apiFetchWithHeaders(path: string, init: RequestInit, extra: Record) { + const controller = new AbortController() + const t = window.setTimeout(() => controller.abort(), 15000) + + const token = getAccessToken() + const headers = new Headers(init.headers) + if (token) headers.set('authorization', `Bearer ${token}`) + for (const [k, v] of Object.entries(extra)) headers.set(k, v) + + try { + return await apiFetch(`${baseUrl()}${path}`, { + ...init, + headers, + signal: controller.signal, + useLastCorrelationId: true, + useLastTraceparent: true, + }) + } finally { + window.clearTimeout(t) + } +} + async function apiPostJson(path: string, body: unknown, idempotencyKey?: string): Promise { const controller = new AbortController() const t = window.setTimeout(() => controller.abort(), 2000) @@ -100,6 +142,65 @@ export function getFleetSnapshot(): Promise { return apiJson('/admin/v1/fleet/snapshot') } +export type DriftKind = 'missing' | 'extra' | 'unhealthy' | 'version_mismatch' + +export type DriftResponse = { + summary: Record + items: Array<{ kind: DriftKind; service: string; details: unknown }> +} + +export function getPlatformDrift(): Promise { + return apiJson('/admin/v1/platform/drift') +} + +export type ConfigDomain = 'routing' | 'placement' + +export type ConfigGetResponse = { + domain: ConfigDomain + revision: number + source: unknown + value: unknown +} + +export function listConfigDomains(): Promise<{ domains: ConfigDomain[] }> { + return apiJson('/admin/v1/config') +} + +export function getConfig(domain: ConfigDomain): Promise { + return apiJson(`/admin/v1/config/${domain}`) +} + +export function startConfigValidateJob(args: { + domain: ConfigDomain + reason: string + value: unknown + idempotencyKey: string +}): Promise<{ job_id: string }> { + return apiPostJson('/admin/v1/jobs/config/validate', { domain: args.domain, reason: args.reason, value: args.value }, args.idempotencyKey) +} + +export function startConfigApplyJob(args: { + domain: ConfigDomain + reason: string + expectedRevision?: number + value: unknown + idempotencyKey: string +}): Promise<{ job_id: string }> { + return apiPostJson( + '/admin/v1/jobs/config/apply', + { domain: args.domain, reason: args.reason, expected_revision: args.expectedRevision, value: args.value }, + args.idempotencyKey, + ) +} + +export function startConfigRollbackJob(args: { + domain: ConfigDomain + reason: string + idempotencyKey: string +}): Promise<{ job_id: string }> { + return apiPostJson('/admin/v1/jobs/config/rollback', { domain: args.domain, reason: args.reason }, args.idempotencyKey) +} + export function getPlacement(kind: 'aggregate' | 'projection' | 'runner'): Promise { return apiJson(`/admin/v1/placement/${kind}`) } @@ -177,3 +278,111 @@ export function getSwarmServices(): Promise<{ services: SwarmService[] }> { export function getSwarmTasks(serviceName: string): Promise<{ service: string; tasks: SwarmTask[] }> { return apiJson(`/admin/v1/swarm/services/${encodeURIComponent(serviceName)}/tasks`) } + +export type DocumentObject = { + key: string + size: number + last_modified?: string | null +} + +export function listDocuments(args: { tenantId: string; prefix?: string }): Promise<{ objects: DocumentObject[] }> { + const qs = 
args.prefix ? `?prefix=${encodeURIComponent(args.prefix)}` : '' + return apiJsonWithHeaders(`/admin/v1/tenants/${encodeURIComponent(args.tenantId)}/docs${qs}`, { + 'x-tenant-id': args.tenantId, + }) +} + +export async function uploadDocument(args: { + tenantId: string + docType: string + docId: string + filename: string + file: File +}): Promise<{ key: string; sha256: string }> { + const path = `/admin/v1/tenants/${encodeURIComponent(args.tenantId)}/docs/${encodeURIComponent( + args.docType, + )}/${encodeURIComponent(args.docId)}/${encodeURIComponent(args.filename)}` + + const res = await apiFetchWithHeaders( + path, + { + method: 'PUT', + headers: { 'content-type': args.file.type || 'application/octet-stream' }, + body: args.file, + }, + { 'x-tenant-id': args.tenantId }, + ) + return (await res.json()) as { key: string; sha256: string } +} + +export async function downloadDocument(args: { tenantId: string; key: string }): Promise { + const res = await apiFetchWithHeaders( + `/admin/v1/tenants/${encodeURIComponent(args.tenantId)}/docs/object/${encodeURIComponent(args.key)}`, + { method: 'GET' }, + { 'x-tenant-id': args.tenantId }, + ) + return await res.blob() +} + +export async function deleteDocument(args: { tenantId: string; key: string }): Promise { + await apiFetchWithHeaders( + `/admin/v1/tenants/${encodeURIComponent(args.tenantId)}/docs/object/${encodeURIComponent(args.key)}`, + { method: 'DELETE' }, + { 'x-tenant-id': args.tenantId }, + ) +} + +export type PresignResponse = { + method: 'PUT' | 'GET' + url: string + key: string +} + +export function presignUpload(args: { + tenantId: string + docType: string + docId?: string + filename: string + contentType?: string +}): Promise { + return apiPostJsonWithTenant(`/admin/v1/tenants/${encodeURIComponent(args.tenantId)}/docs/presign/upload`, args.tenantId, { + doc_type: args.docType, + doc_id: args.docId, + filename: args.filename, + content_type: args.contentType, + }) +} + +export function presignDownload(args: { tenantId: string; key: string }): Promise { + return apiPostJsonWithTenant( + `/admin/v1/tenants/${encodeURIComponent(args.tenantId)}/docs/presign/download`, + args.tenantId, + { key: args.key }, + ) +} + +async function apiPostJsonWithTenant(path: string, tenantId: string, body: unknown): Promise { + const controller = new AbortController() + const t = window.setTimeout(() => controller.abort(), 5000) + + const token = getAccessToken() + const headers: HeadersInit = { + 'content-type': 'application/json', + ...(token ? 
{ Authorization: `Bearer ${token}` } : {}), + 'x-tenant-id': tenantId, + } + + try { + const res = await apiFetch(`${baseUrl()}${path}`, { + method: 'POST', + headers, + body: JSON.stringify(body), + signal: controller.signal, + useLastCorrelationId: true, + useLastTraceparent: true, + }) + return (await res.json()) as T + } finally { + window.clearTimeout(t) + } +} diff --git a/control/ui/src/app/layout.tsx b/control/ui/src/app/layout.tsx index e2d2ef1..8447bc3 100644 --- a/control/ui/src/app/layout.tsx +++ b/control/ui/src/app/layout.tsx @@ -16,9 +16,11 @@ const navItems: NavItem[] = [ { label: 'Roles & Permissions', to: '/roles-permissions' }, { label: 'Config', to: '/config' }, { label: 'Definitions', to: '/definitions' }, + { label: 'Documents', to: '/documents' }, { label: 'Scale & Placement', to: '/scale-placement' }, { label: 'Deployments', to: '/deployments' }, { label: 'Observability', to: '/observability' }, + { label: 'Platform Drift', to: '/drift' }, { label: 'Audit Log', to: '/audit-log' }, { label: 'Settings', to: '/settings' }, ] diff --git a/control/ui/src/app/router.test.tsx b/control/ui/src/app/router.test.tsx index 085737a..b74f056 100644 --- a/control/ui/src/app/router.test.tsx +++ b/control/ui/src/app/router.test.tsx @@ -15,9 +15,11 @@ const paths = [ '/roles-permissions', '/config', '/definitions', + '/documents', '/scale-placement', '/deployments', '/observability', + '/drift', '/audit-log', '/settings', ] diff --git a/control/ui/src/app/router.tsx b/control/ui/src/app/router.tsx index af96a4b..06e75bf 100644 --- a/control/ui/src/app/router.tsx +++ b/control/ui/src/app/router.tsx @@ -6,10 +6,12 @@ import { DefinitionsPage, DeploymentDetailPage, DeploymentsPage, + DocumentsPage, JobPage, NotFoundPage, ObservabilityPage, OverviewPage, + PlatformDriftPage, RolesPermissionsPage, ScalePlacementPage, SessionsPage, @@ -30,10 +32,12 @@ export const routes: RouteObject[] = [ { path: 'roles-permissions', element: }, { path: 'config', element: }, { path: 'definitions', element: }, + { path: 'documents', element: }, { path: 'scale-placement', element: }, { path: 'deployments', element: }, { path: 'deployments/:serviceName', element: }, { path: 'observability', element: }, + { path: 'drift', element: }, { path: 'audit-log', element: }, { path: 'jobs/:jobId', element: }, { path: 'settings', element: }, diff --git a/control/ui/src/pages.tsx b/control/ui/src/pages.tsx index d0df2b1..3144781 100644 --- a/control/ui/src/pages.tsx +++ b/control/ui/src/pages.tsx @@ -9,6 +9,18 @@ import { listAudit, getSwarmServices, getSwarmTasks, + listConfigDomains, + getConfig, + startConfigValidateJob, + startConfigApplyJob, + startConfigRollbackJob, + getPlatformDrift, + listDocuments, + uploadDocument, + downloadDocument, + deleteDocument, + presignUpload, + presignDownload, startTenantDrainJob, startTenantMigrateJob, type FleetSnapshot, @@ -18,6 +30,10 @@ import { type AuditEvent, type SwarmService, type SwarmTask, + type DocumentObject, + type ConfigDomain, + type ConfigGetResponse, + type DriftResponse, } from './api/control' import { getAccessToken, setAccessToken } from './auth/token' import { Button, Code, ErrorText, Modal, MutedText, Table, TextInput } from './components/primitives' @@ -226,13 +242,443 @@ export function RolesPermissionsPage() { } export function ConfigPage() { - return + const [domains, setDomains] = useState(undefined) + const [selected, setSelected] = useState('routing') + const [cfg, setCfg] = useState(undefined) + const [draft, setDraft] = useState('') + const 
[reason, setReason] = useState('') + const [error, setError] = useState(undefined) + const [busy, setBusy] = useState(false) + const navigate = useNavigate() + + function newIdempotencyKey() { + if (typeof crypto !== 'undefined' && 'randomUUID' in crypto) return crypto.randomUUID() + return `${Date.now()}-${Math.random().toString(16).slice(2)}` + } + + useEffect(() => { + let cancelled = false + listConfigDomains() + .then((d) => { + if (cancelled) return + setDomains(d.domains) + if (d.domains.length > 0 && !d.domains.includes(selected)) { + setSelected(d.domains[0] ?? 'routing') + } + }) + .catch((e: unknown) => { + if (cancelled) return + setError(e instanceof Error ? e.message : 'failed to load domains') + }) + return () => { + cancelled = true + } + }, []) + + async function refresh(domain: ConfigDomain) { + setBusy(true) + try { + const c = await getConfig(domain) + setCfg(c) + setDraft(JSON.stringify(c.value ?? null, null, 2)) + setError(undefined) + } catch (e: unknown) { + setError(e instanceof Error ? e.message : 'failed to load config') + } finally { + setBusy(false) + } + } + + useEffect(() => { + void refresh(selected) + }, [selected]) + + return ( + + {error ? {error} : null} +
+
+
+ + +
+ + + +
+ + +
+
+ + + Current revision: {String(cfg?.revision ?? '')} + + +
+ +