Monorepo consolidation: workspace, shared types, transport plans, docker/swarm assets
Some checks failed
ci / rust (push) Failing after 2m34s
ci / ui (push) Failing after 30s

2026-03-30 11:40:42 +03:00
parent 7e7041cf8b
commit 1298d9a3df
246 changed files with 55434 additions and 0 deletions

control/api/Cargo.toml Normal file

@@ -0,0 +1,25 @@
[package]
name = "api"
version = "0.1.0"
edition = "2024"
publish = ["madapes"]
[dependencies]
axum = "0.8.6"
clap = { version = "4.5.48", features = ["derive", "env"] }
jsonwebtoken = "9.3.1"
metrics = "0.23.0"
metrics-exporter-prometheus = "0.16.0"
reqwest = { version = "0.12.23", default-features = false, features = ["json", "rustls-tls"] }
serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.149"
thiserror = "2.0.16"
tokio = { version = "1.45.0", features = ["macros", "net", "process", "rt-multi-thread", "signal", "time"] }
tower-http = { version = "0.6.6", features = ["trace"] }
tracing = "0.1.41"
tracing-subscriber = { version = "0.3.20", features = ["env-filter"] }
uuid = { version = "1.18.1", features = ["serde", "v4"] }
[dev-dependencies]
serde_yaml = "0.9.34"
tower = "0.5.2"

control/api/src/admin.rs Normal file

@@ -0,0 +1,417 @@
use crate::{
AppState, RequestIds,
auth::{Principal, has_permission},
fleet,
job_engine::{JobEngine, StartJobError},
jobs::{Job, JobStatus, JobStep},
placement::{PlacementResponse, ServiceKind},
swarm::{SwarmService, SwarmTask},
};
use axum::{
Json, Router,
extract::{Extension, Path, State},
http::{HeaderMap, StatusCode},
response::IntoResponse,
routing::{get, post},
};
use serde::Deserialize;
use std::time::{SystemTime, UNIX_EPOCH};
use uuid::Uuid;
const HEADER_IDEMPOTENCY_KEY: &str = "idempotency-key";
const HEADER_TENANT_ID: &str = "x-tenant-id";
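/// Admin surface mounted under /admin/v1 by lib.rs; every handler re-checks a
/// control:read or control:write permission on the authenticated principal.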
pub fn admin_router() -> Router<AppState> {
Router::new()
.route("/whoami", get(whoami))
.route("/platform/info", get(platform_info))
.route("/fleet/snapshot", get(fleet_snapshot))
.route("/tenants", get(list_tenants))
.route("/placement/{kind}", get(get_placement))
.route("/tenants/echo", get(tenant_echo))
.route("/jobs/echo", post(create_echo_job))
.route("/jobs/{job_id}", get(get_job))
.route("/jobs/{job_id}/cancel", post(cancel_job))
.route("/jobs/tenant/drain", post(start_tenant_drain))
.route("/jobs/tenant/migrate", post(start_tenant_migrate))
.route("/plan/tenant/migrate", post(plan_tenant_migrate))
.route("/audit", get(list_audit))
.route("/swarm/services", get(list_swarm_services))
.route("/swarm/services/{name}/tasks", get(list_swarm_tasks))
}
async fn whoami(Extension(principal): Extension<Principal>) -> impl IntoResponse {
if !has_permission(&principal, "control:read") {
return StatusCode::FORBIDDEN.into_response();
}
(
StatusCode::OK,
Json(serde_json::json!({
"sub": principal.sub,
"session_id": principal.session_id,
"permissions": principal.permissions,
})),
)
.into_response()
}
async fn platform_info(Extension(principal): Extension<Principal>) -> impl IntoResponse {
if !has_permission(&principal, "control:read") {
return StatusCode::FORBIDDEN.into_response();
}
(
StatusCode::OK,
Json(serde_json::json!({
"service": "control-api",
})),
)
.into_response()
}
async fn fleet_snapshot(
State(state): State<AppState>,
Extension(principal): Extension<Principal>,
Extension(request_ids): Extension<RequestIds>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:read") {
return StatusCode::FORBIDDEN.into_response();
}
let services =
fleet::snapshot_with_context(&state.http, &state.fleet_services, Some(&request_ids)).await;
(
StatusCode::OK,
Json(serde_json::json!({ "services": services })),
)
.into_response()
}
async fn get_placement(
State(state): State<AppState>,
Path(kind): Path<String>,
Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:read") {
return StatusCode::FORBIDDEN.into_response();
}
let kind = match kind.as_str() {
"aggregate" => ServiceKind::Aggregate,
"projection" => ServiceKind::Projection,
"runner" => ServiceKind::Runner,
_ => return StatusCode::NOT_FOUND.into_response(),
};
let resp: PlacementResponse = state.placement.get_for_kind(kind);
(StatusCode::OK, Json(resp)).into_response()
}
async fn list_tenants(
State(state): State<AppState>,
Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:read") {
return StatusCode::FORBIDDEN.into_response();
}
let tenants = state.placement.tenant_summaries();
(
StatusCode::OK,
Json(serde_json::json!({ "tenants": tenants })),
)
.into_response()
}
async fn tenant_echo(
headers: HeaderMap,
Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:read") {
return StatusCode::FORBIDDEN.into_response();
}
let tenant_id = headers
.get(HEADER_TENANT_ID)
.and_then(|v| v.to_str().ok())
.ok_or(StatusCode::BAD_REQUEST)
.and_then(|s| Uuid::parse_str(s).map_err(|_| StatusCode::BAD_REQUEST));
match tenant_id {
Ok(tenant_id) => (
StatusCode::OK,
Json(serde_json::json!({
"tenant_id": tenant_id,
})),
)
.into_response(),
Err(status) => status.into_response(),
}
}
async fn create_echo_job(
State(state): State<AppState>,
headers: HeaderMap,
Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:write") {
return StatusCode::FORBIDDEN.into_response();
}
let key = headers
.get(HEADER_IDEMPOTENCY_KEY)
.and_then(|v| v.to_str().ok())
.ok_or(StatusCode::BAD_REQUEST);
let key = match key {
Ok(k) if !k.is_empty() => k,
_ => return StatusCode::BAD_REQUEST.into_response(),
};
let now = now_ms();
let job_id = Uuid::new_v4();
let job = Job {
job_id,
status: JobStatus::Succeeded,
steps: vec![JobStep {
name: "echo".to_string(),
status: JobStatus::Succeeded,
attempts: 1,
error: None,
}],
error: None,
created_at_ms: now,
started_at_ms: Some(now),
finished_at_ms: Some(now),
};
let job_id = state.jobs.insert_idempotent(key, job);
state.audit.record(crate::audit::AuditEvent {
ts_ms: now,
principal_sub: principal.sub.clone(),
action: "job.echo".to_string(),
tenant_id: None,
reason: "echo".to_string(),
job_id: Some(job_id),
});
(
StatusCode::OK,
Json(serde_json::json!({
"job_id": job_id,
})),
)
.into_response()
}
async fn get_job(
State(state): State<AppState>,
Path(job_id): Path<Uuid>,
Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:read") {
return StatusCode::FORBIDDEN.into_response();
}
match state.jobs.get(job_id) {
Some(job) => (StatusCode::OK, Json(job)).into_response(),
None => StatusCode::NOT_FOUND.into_response(),
}
}
#[derive(Debug, Deserialize)]
struct TenantDrainRequest {
tenant_id: Uuid,
reason: String,
}
#[derive(Debug, Deserialize)]
struct TenantMigrateRequest {
tenant_id: Uuid,
runner_target: String,
reason: String,
}
async fn start_tenant_drain(
State(state): State<AppState>,
headers: HeaderMap,
Extension(principal): Extension<Principal>,
Json(body): Json<TenantDrainRequest>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:write") {
return StatusCode::FORBIDDEN.into_response();
}
let key = headers
.get(HEADER_IDEMPOTENCY_KEY)
.and_then(|v| v.to_str().ok())
.ok_or(StatusCode::BAD_REQUEST);
let key = match key {
Ok(k) if !k.is_empty() => k,
_ => return StatusCode::BAD_REQUEST.into_response(),
};
let engine = JobEngine::new(
state.jobs.clone(),
state.audit.clone(),
state.tenant_locks.clone(),
);
let job_id = match engine.start_tenant_drain(
state.clone(),
&principal,
body.tenant_id,
body.reason,
key,
) {
Ok(id) => id,
Err(StartJobError::TenantLocked) => return StatusCode::CONFLICT.into_response(),
};
(
StatusCode::OK,
Json(serde_json::json!({ "job_id": job_id })),
)
.into_response()
}
async fn start_tenant_migrate(
State(state): State<AppState>,
headers: HeaderMap,
Extension(principal): Extension<Principal>,
Json(body): Json<TenantMigrateRequest>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:write") {
return StatusCode::FORBIDDEN.into_response();
}
let key = headers
.get(HEADER_IDEMPOTENCY_KEY)
.and_then(|v| v.to_str().ok())
.ok_or(StatusCode::BAD_REQUEST);
let key = match key {
Ok(k) if !k.is_empty() => k,
_ => return StatusCode::BAD_REQUEST.into_response(),
};
let engine = JobEngine::new(
state.jobs.clone(),
state.audit.clone(),
state.tenant_locks.clone(),
);
let job_id = match engine.start_tenant_migrate(
state.clone(),
&principal,
body.tenant_id,
body.runner_target,
body.reason,
key,
) {
Ok(id) => id,
Err(StartJobError::TenantLocked) => return StatusCode::CONFLICT.into_response(),
};
(
StatusCode::OK,
Json(serde_json::json!({ "job_id": job_id })),
)
.into_response()
}
async fn cancel_job(
State(state): State<AppState>,
Path(job_id): Path<Uuid>,
Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:write") {
return StatusCode::FORBIDDEN.into_response();
}
if state.jobs.request_cancel(job_id) {
state.audit.record(crate::audit::AuditEvent {
ts_ms: now_ms(),
principal_sub: principal.sub.clone(),
action: "job.cancel".to_string(),
tenant_id: None,
reason: "cancel requested".to_string(),
job_id: Some(job_id),
});
StatusCode::OK.into_response()
} else {
StatusCode::NOT_FOUND.into_response()
}
}
fn now_ms() -> u64 {
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_millis() as u64
}
async fn list_audit(
State(state): State<AppState>,
Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:read") {
return StatusCode::FORBIDDEN.into_response();
}
let events = state.audit.list_recent(200);
(
StatusCode::OK,
Json(serde_json::json!({ "events": events })),
)
.into_response()
}
async fn plan_tenant_migrate(
Extension(principal): Extension<Principal>,
Json(body): Json<TenantMigrateRequest>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:write") {
return StatusCode::FORBIDDEN.into_response();
}
let _ = (body.tenant_id, body.runner_target, body.reason);
(
StatusCode::OK,
Json(serde_json::json!({
"steps": ["preflight", "drain", "update_placement", "reload", "verify"]
})),
)
.into_response()
}
async fn list_swarm_services(
State(state): State<AppState>,
Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:read") {
return StatusCode::FORBIDDEN.into_response();
}
let services: Vec<SwarmService> = state.swarm.list_services();
(
StatusCode::OK,
Json(serde_json::json!({ "services": services })),
)
.into_response()
}
async fn list_swarm_tasks(
State(state): State<AppState>,
Path(name): Path<String>,
Extension(principal): Extension<Principal>,
) -> impl IntoResponse {
if !has_permission(&principal, "control:read") {
return StatusCode::FORBIDDEN.into_response();
}
let tasks: Vec<SwarmTask> = state.swarm.list_tasks(&name);
(
StatusCode::OK,
Json(serde_json::json!({ "service": name, "tasks": tasks })),
)
.into_response()
}
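For orientation, a minimal client-side sketch of calling the drain endpoint above; the base URL, bearer token, and idempotency key are placeholders, and no such client is part of this commit.

// Hypothetical operator-side helper; mirrors the headers and body admin.rs expects.
async fn request_tenant_drain(
    base_url: &str,
    token: &str,
    tenant_id: uuid::Uuid,
) -> reqwest::Result<serde_json::Value> {
    reqwest::Client::new()
        .post(format!("{base_url}/admin/v1/jobs/tenant/drain"))
        .bearer_auth(token)
        .header("idempotency-key", format!("drain-{tenant_id}"))
        .json(&serde_json::json!({ "tenant_id": tenant_id, "reason": "operator request" }))
        .send()
        .await?
        // The handler replies with { "job_id": <uuid> }, or 409 if the tenant is locked.
        .json()
        .await
}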

control/api/src/audit.rs Normal file

@@ -0,0 +1,31 @@
use serde::{Deserialize, Serialize};
use std::sync::{Arc, Mutex};
use uuid::Uuid;
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AuditEvent {
pub ts_ms: u64,
pub principal_sub: String,
pub action: String,
pub tenant_id: Option<Uuid>,
pub reason: String,
pub job_id: Option<Uuid>,
}
#[derive(Clone, Default)]
pub struct AuditStore {
inner: Arc<Mutex<Vec<AuditEvent>>>,
}
impl AuditStore {
pub fn record(&self, event: AuditEvent) {
let mut events = self.inner.lock().expect("audit lock poisoned");
events.push(event);
}
pub fn list_recent(&self, limit: usize) -> Vec<AuditEvent> {
let events = self.inner.lock().expect("audit lock poisoned");
let start = events.len().saturating_sub(limit);
events[start..].to_vec()
}
}

control/api/src/auth.rs Normal file

@@ -0,0 +1,78 @@
use crate::AppState;
use axum::{
extract::State,
http::{Request, StatusCode},
middleware::Next,
response::{IntoResponse, Response},
};
use jsonwebtoken::{Algorithm, DecodingKey, Validation, decode};
use serde::{Deserialize, Serialize};
#[derive(Clone)]
pub struct AuthConfig {
pub hs256_secret: Option<Vec<u8>>,
}
#[derive(Clone, Debug)]
pub struct Principal {
pub sub: String,
pub session_id: String,
pub permissions: Vec<String>,
}
#[derive(Debug, Serialize, Deserialize)]
struct Claims {
sub: String,
session_id: String,
permissions: Vec<String>,
exp: usize,
}
pub async fn auth_middleware(
State(state): State<AppState>,
mut req: Request<axum::body::Body>,
next: Next,
) -> Response {
match authenticate(
&state.auth,
req.headers().get(axum::http::header::AUTHORIZATION),
) {
Ok(principal) => {
req.extensions_mut().insert(principal);
next.run(req).await
}
Err(status) => status.into_response(),
}
}
fn authenticate(
cfg: &AuthConfig,
auth_header: Option<&axum::http::HeaderValue>,
) -> Result<Principal, StatusCode> {
let secret = cfg
.hs256_secret
.as_ref()
.ok_or(StatusCode::SERVICE_UNAVAILABLE)?;
let header = auth_header.ok_or(StatusCode::UNAUTHORIZED)?;
let header_str = header.to_str().map_err(|_| StatusCode::UNAUTHORIZED)?;
let token = header_str
.strip_prefix("Bearer ")
.ok_or(StatusCode::UNAUTHORIZED)?;
let mut validation = Validation::new(Algorithm::HS256);
validation.required_spec_claims.insert("exp".to_string());
let data = decode::<Claims>(token, &DecodingKey::from_secret(secret), &validation)
.map_err(|_| StatusCode::UNAUTHORIZED)?;
Ok(Principal {
sub: data.claims.sub,
session_id: data.claims.session_id,
permissions: data.claims.permissions,
})
}
pub fn has_permission(principal: &Principal, permission: &str) -> bool {
principal.permissions.iter().any(|p| p == permission)
}

control/api/src/build_info.rs Normal file

@@ -0,0 +1,57 @@
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct BuildInfo {
pub service: String,
pub version: String,
pub git_sha: String,
}
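/// Scans Prometheus text-format output for `*_build_info{...} 1` samples and
/// collects their service, version and git_sha labels.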
pub fn extract_build_info(metrics: &str) -> Vec<BuildInfo> {
let mut out = Vec::new();
for line in metrics.lines() {
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
let Some((metric_and_labels, value)) = line.split_once(' ') else {
continue;
};
if value.trim() != "1" {
continue;
}
if !metric_and_labels.ends_with('}') {
continue;
}
let Some((name, labels)) = metric_and_labels.split_once('{') else {
continue;
};
if !name.ends_with("_build_info") {
continue;
}
let labels = labels.trim_end_matches('}');
let mut service = None;
let mut version = None;
let mut git_sha = None;
for part in labels.split(',') {
let Some((k, v)) = part.split_once('=') else {
continue;
};
let v = v.trim().trim_matches('"');
match k.trim() {
"service" => service = Some(v.to_string()),
"version" => version = Some(v.to_string()),
"git_sha" => git_sha = Some(v.to_string()),
_ => {}
}
}
if let (Some(service), Some(version), Some(git_sha)) = (service, version, git_sha) {
out.push(BuildInfo {
service,
version,
git_sha,
});
}
}
out
}

control/api/src/deployments.rs Normal file

@@ -0,0 +1,42 @@
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GrafanaAnnotation {
pub time: i64,
pub tags: Vec<String>,
pub text: String,
}
pub fn build_grafana_deploy_annotation(args: DeployAnnotationArgs) -> GrafanaAnnotation {
let mut tags = vec![
"cloudlysis".to_string(),
"deploy".to_string(),
format!("service:{}", args.service),
];
if let Some(v) = args.version {
tags.push(format!("version:{v}"));
}
if let Some(sha) = args.git_sha {
tags.push(format!("git_sha:{sha}"));
}
let text = match (args.version, args.git_sha) {
(Some(v), Some(sha)) => format!("deploy {} v={} git_sha={sha}", args.service, v),
(Some(v), None) => format!("deploy {} v={}", args.service, v),
(None, Some(sha)) => format!("deploy {} git_sha={sha}", args.service),
(None, None) => format!("deploy {}", args.service),
};
GrafanaAnnotation {
time: args.time_ms,
tags,
text,
}
}
pub struct DeployAnnotationArgs<'a> {
pub service: &'a str,
pub version: Option<&'a str>,
pub git_sha: Option<&'a str>,
pub time_ms: i64,
}
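A hedged sketch of pushing the annotation built above to Grafana's annotations HTTP API; the Grafana base URL and API token are assumptions, and this commit does not include such a client.

// Hypothetical publisher: GrafanaAnnotation serializes to { time, tags, text },
// which matches the payload accepted by Grafana's POST /api/annotations endpoint.
async fn post_deploy_annotation(
    grafana_url: &str,
    api_token: &str,
    annotation: &GrafanaAnnotation,
) -> reqwest::Result<()> {
    reqwest::Client::new()
        .post(format!("{grafana_url}/api/annotations"))
        .bearer_auth(api_token)
        .json(annotation)
        .send()
        .await?
        .error_for_status()?;
    Ok(())
}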

control/api/src/fleet.rs Normal file

@@ -0,0 +1,67 @@
use serde::{Deserialize, Serialize};
use std::time::Duration;
use crate::RequestIds;
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FleetService {
pub name: String,
pub base_url: String,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FleetServiceSnapshot {
pub name: String,
pub base_url: String,
pub health_ok: bool,
pub ready_ok: bool,
pub metrics_ok: bool,
}
pub async fn snapshot(
client: &reqwest::Client,
services: &[FleetService],
) -> Vec<FleetServiceSnapshot> {
snapshot_with_context(client, services, None).await
}
pub async fn snapshot_with_context(
client: &reqwest::Client,
services: &[FleetService],
ctx: Option<&RequestIds>,
) -> Vec<FleetServiceSnapshot> {
let mut out = Vec::with_capacity(services.len());
for svc in services {
let base = svc.base_url.trim_end_matches('/');
let health_ok = get_ok(client, &format!("{base}/health"), ctx).await;
let ready_ok = get_ok(client, &format!("{base}/ready"), ctx).await;
let metrics_ok = get_ok(client, &format!("{base}/metrics"), ctx).await;
out.push(FleetServiceSnapshot {
name: svc.name.clone(),
base_url: svc.base_url.clone(),
health_ok,
ready_ok,
metrics_ok,
});
}
out
}
async fn get_ok(client: &reqwest::Client, url: &str, ctx: Option<&RequestIds>) -> bool {
let mut req = client.get(url).timeout(Duration::from_secs(2));
if let Some(ctx) = ctx {
req = req.header("x-request-id", &ctx.request_id);
if let Some(cid) = &ctx.correlation_id {
req = req.header("x-correlation-id", cid);
}
if let Some(tp) = &ctx.traceparent {
req = req.header("traceparent", tp);
}
}
let res = req.send().await;
match res {
Ok(r) => r.status().is_success(),
Err(_) => false,
}
}

control/api/src/job_engine.rs Normal file

@@ -0,0 +1,348 @@
use crate::{
AppState, Principal,
audit::{AuditEvent, AuditStore},
fleet,
jobs::{Job, JobStatus, JobStep, JobStore},
};
use std::{
collections::HashMap,
sync::{Arc, Mutex},
time::{Duration, SystemTime, UNIX_EPOCH},
};
use uuid::Uuid;
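// At most one mutating job per tenant: try_lock records the owning job_id, and
// unlock releases the tenant only if that same job still holds it.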
#[derive(Clone, Default)]
pub struct TenantLocks {
inner: Arc<Mutex<HashMap<Uuid, Uuid>>>,
}
impl TenantLocks {
pub fn try_lock(&self, tenant_id: Uuid, job_id: Uuid) -> bool {
let mut map = self.inner.lock().expect("tenant locks poisoned");
if map.contains_key(&tenant_id) {
return false;
}
map.insert(tenant_id, job_id);
true
}
pub fn unlock(&self, tenant_id: Uuid, job_id: Uuid) {
let mut map = self.inner.lock().expect("tenant locks poisoned");
if map.get(&tenant_id).copied() == Some(job_id) {
map.remove(&tenant_id);
}
}
}
#[derive(Clone)]
pub struct JobEngine {
pub jobs: JobStore,
pub audit: AuditStore,
pub tenant_locks: TenantLocks,
pub step_timeout: Duration,
}
impl JobEngine {
pub fn new(jobs: JobStore, audit: AuditStore, tenant_locks: TenantLocks) -> Self {
Self {
jobs,
audit,
tenant_locks,
step_timeout: Duration::from_millis(500),
}
}
pub fn start_tenant_drain(
&self,
state: AppState,
principal: &Principal,
tenant_id: Uuid,
reason: String,
idempotency_key: &str,
) -> Result<Uuid, StartJobError> {
if let Some(existing) = self.jobs.get_idempotent(idempotency_key) {
return Ok(existing);
}
let job_id = Uuid::new_v4();
if !self.tenant_locks.try_lock(tenant_id, job_id) {
return Err(StartJobError::TenantLocked);
}
let now = now_ms();
let job = Job {
job_id,
status: JobStatus::Pending,
steps: vec![step("preflight"), step("drain"), step("verify")],
error: None,
created_at_ms: now,
started_at_ms: None,
finished_at_ms: None,
};
let inserted = self.jobs.insert_idempotent(idempotency_key, job);
self.audit.record(AuditEvent {
ts_ms: now,
principal_sub: principal.sub.clone(),
action: "tenant.drain".to_string(),
tenant_id: Some(tenant_id),
reason,
job_id: Some(inserted),
});
let engine = self.clone();
tokio::spawn(async move {
engine
.run_job(state, inserted, Some(tenant_id), RunSpec::Drain)
.await;
});
Ok(inserted)
}
pub fn start_tenant_migrate(
&self,
state: AppState,
principal: &Principal,
tenant_id: Uuid,
runner_target: String,
reason: String,
idempotency_key: &str,
) -> Result<Uuid, StartJobError> {
if let Some(existing) = self.jobs.get_idempotent(idempotency_key) {
return Ok(existing);
}
let job_id = Uuid::new_v4();
if !self.tenant_locks.try_lock(tenant_id, job_id) {
return Err(StartJobError::TenantLocked);
}
let now = now_ms();
let job = Job {
job_id,
status: JobStatus::Pending,
steps: vec![
step("preflight"),
step("drain"),
step("update_placement"),
step("reload"),
step("verify"),
],
error: None,
created_at_ms: now,
started_at_ms: None,
finished_at_ms: None,
};
let inserted = self.jobs.insert_idempotent(idempotency_key, job);
self.audit.record(AuditEvent {
ts_ms: now,
principal_sub: principal.sub.clone(),
action: "tenant.migrate".to_string(),
tenant_id: Some(tenant_id),
reason,
job_id: Some(inserted),
});
let engine = self.clone();
tokio::spawn(async move {
engine
.run_job(
state,
inserted,
Some(tenant_id),
RunSpec::Migrate { runner_target },
)
.await;
});
Ok(inserted)
}
async fn run_job(&self, state: AppState, job_id: Uuid, tenant_id: Option<Uuid>, spec: RunSpec) {
self.jobs.update(job_id, |j| {
j.status = JobStatus::Running;
j.started_at_ms = Some(now_ms());
});
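// Walk the steps in order; a cancel request, a step failure, or a per-step
// timeout ends the job early.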
let mut ok = true;
for idx in 0.. {
if self.jobs.cancel_requested(job_id) {
ok = false;
self.jobs.update(job_id, |j| {
j.status = JobStatus::Cancelled;
j.finished_at_ms = Some(now_ms());
j.error = Some("cancelled".to_string());
for step in &mut j.steps {
if step.status == JobStatus::Pending || step.status == JobStatus::Running {
step.status = JobStatus::Cancelled;
}
}
});
break;
}
let step_name = {
let Some(job) = self.jobs.get(job_id) else {
break;
};
let Some(step) = job.steps.get(idx) else {
break;
};
step.name.clone()
};
self.jobs.update(job_id, |j| {
if let Some(step) = j.steps.get_mut(idx) {
step.status = JobStatus::Running;
step.attempts += 1;
}
});
let r = tokio::time::timeout(
self.step_timeout,
run_step(&state, &spec, &step_name, tenant_id),
)
.await;
match r {
Ok(Ok(())) => {
self.jobs.update(job_id, |j| {
if let Some(step) = j.steps.get_mut(idx) {
step.status = JobStatus::Succeeded;
step.error = None;
}
});
}
Ok(Err(e)) => {
ok = false;
self.jobs.update(job_id, |j| {
if let Some(step) = j.steps.get_mut(idx) {
step.status = JobStatus::Failed;
step.error = Some(e.clone());
}
j.status = JobStatus::Failed;
j.error = Some(e);
j.finished_at_ms = Some(now_ms());
});
break;
}
Err(_) => {
ok = false;
self.jobs.update(job_id, |j| {
if let Some(step) = j.steps.get_mut(idx) {
step.status = JobStatus::Failed;
step.error = Some("step timeout".to_string());
}
j.status = JobStatus::Failed;
j.error = Some("step timeout".to_string());
j.finished_at_ms = Some(now_ms());
});
break;
}
}
if !ok {
break;
}
let done = match self.jobs.get(job_id) {
Some(job) => idx + 1 >= job.steps.len(),
None => true,
};
if done {
break;
}
}
if ok {
self.jobs.update(job_id, |j| {
j.status = JobStatus::Succeeded;
j.finished_at_ms = Some(now_ms());
});
}
if let Some(tid) = tenant_id {
self.tenant_locks.unlock(tid, job_id);
}
}
}
#[derive(Debug)]
pub enum StartJobError {
TenantLocked,
}
#[derive(Clone)]
enum RunSpec {
Drain,
Migrate { runner_target: String },
}
fn step(name: &str) -> JobStep {
JobStep {
name: name.to_string(),
status: JobStatus::Pending,
attempts: 0,
error: None,
}
}
fn now_ms() -> u64 {
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_millis() as u64
}
async fn run_step(
state: &AppState,
spec: &RunSpec,
step: &str,
tenant_id: Option<Uuid>,
) -> Result<(), String> {
match step {
"preflight" => {
let snapshots = fleet::snapshot(&state.http, &state.fleet_services).await;
if snapshots.iter().any(|s| !s.ready_ok) {
return Err("preflight failed: fleet not ready".to_string());
}
Ok(())
}
"drain" => {
tokio::time::sleep(Duration::from_millis(50)).await;
Ok(())
}
"update_placement" => match spec {
RunSpec::Migrate { runner_target } => {
let tenant_id = tenant_id.ok_or_else(|| "missing tenant_id".to_string())?;
state
.placement
.update_runner_target(tenant_id, runner_target.clone())
.map(|_| ())
}
_ => Ok(()),
},
"reload" => {
let _ = state.placement.tenant_summaries();
Ok(())
}
"verify" => match spec {
RunSpec::Migrate { runner_target } => {
let tenant_id = tenant_id.ok_or_else(|| "missing tenant_id".to_string())?;
let summaries = state.placement.tenant_summaries();
let found = summaries
.iter()
.find(|t| t.tenant_id == tenant_id)
.map(|t| t.runner_targets.iter().any(|x| x == runner_target))
.unwrap_or(false);
if !found {
return Err("verify failed: placement not updated".to_string());
}
Ok(())
}
_ => Ok(()),
},
_ => Ok(()),
}
}

control/api/src/jobs.rs Normal file

@@ -0,0 +1,122 @@
use serde::{Deserialize, Serialize};
use std::{
collections::HashMap,
sync::{
Arc, Mutex,
atomic::{AtomicBool, Ordering},
},
};
use uuid::Uuid;
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum JobStatus {
Pending,
Running,
Succeeded,
Failed,
Cancelled,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Job {
pub job_id: Uuid,
pub status: JobStatus,
pub steps: Vec<JobStep>,
pub error: Option<String>,
pub created_at_ms: u64,
pub started_at_ms: Option<u64>,
pub finished_at_ms: Option<u64>,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct JobStep {
pub name: String,
pub status: JobStatus,
pub attempts: u32,
pub error: Option<String>,
}
struct JobRecord {
job: Mutex<Job>,
cancel: AtomicBool,
}
#[derive(Clone, Default)]
pub struct JobStore {
inner: Arc<Inner>,
}
#[derive(Default)]
struct Inner {
jobs: Mutex<HashMap<Uuid, Arc<JobRecord>>>,
idempotency: Mutex<HashMap<String, Uuid>>,
}
impl JobStore {
pub fn get(&self, job_id: Uuid) -> Option<Job> {
let jobs = self.inner.jobs.lock().ok()?;
let rec = jobs.get(&job_id)?.clone();
rec.job.lock().ok().map(|j| j.clone())
}
pub fn get_idempotent(&self, key: &str) -> Option<Uuid> {
let map = self.inner.idempotency.lock().ok()?;
map.get(key).copied()
}
pub fn insert_idempotent(&self, key: &str, job: Job) -> Uuid {
let mut idempotency = self
.inner
.idempotency
.lock()
.expect("idempotency lock poisoned");
if let Some(existing) = idempotency.get(key) {
return *existing;
}
let job_id = job.job_id;
let rec = Arc::new(JobRecord {
job: Mutex::new(job),
cancel: AtomicBool::new(false),
});
self.inner
.jobs
.lock()
.expect("jobs lock poisoned")
.insert(job_id, rec);
idempotency.insert(key.to_string(), job_id);
job_id
}
pub fn request_cancel(&self, job_id: Uuid) -> bool {
let jobs = self.inner.jobs.lock().expect("jobs lock poisoned");
let Some(rec) = jobs.get(&job_id) else {
return false;
};
rec.cancel.store(true, Ordering::SeqCst);
true
}
pub fn cancel_requested(&self, job_id: Uuid) -> bool {
let jobs = self.inner.jobs.lock().expect("jobs lock poisoned");
let Some(rec) = jobs.get(&job_id) else {
return false;
};
rec.cancel.load(Ordering::SeqCst)
}
pub fn update<F>(&self, job_id: Uuid, f: F) -> bool
where
F: FnOnce(&mut Job),
{
let jobs = self.inner.jobs.lock().expect("jobs lock poisoned");
let Some(rec) = jobs.get(&job_id) else {
return false;
};
let mut job = rec.job.lock().expect("job lock poisoned");
f(&mut job);
true
}
}

control/api/src/lib.rs Normal file

@@ -0,0 +1,692 @@
mod admin;
mod audit;
mod auth;
mod build_info;
mod deployments;
mod fleet;
mod job_engine;
mod jobs;
mod placement;
mod swarm;
pub use audit::AuditStore;
pub use auth::{AuthConfig, Principal};
use axum::{
Router,
extract::State,
http::{HeaderName, HeaderValue, Request, StatusCode},
middleware::{Next, from_fn, from_fn_with_state},
response::{IntoResponse, Response},
routing::get,
};
pub use build_info::{BuildInfo, extract_build_info};
pub use deployments::{DeployAnnotationArgs, GrafanaAnnotation, build_grafana_deploy_annotation};
pub use fleet::FleetService;
pub use job_engine::TenantLocks;
pub use jobs::JobStore;
use metrics_exporter_prometheus::PrometheusHandle;
pub use placement::PlacementStore;
pub use placement::ServiceKind;
use std::time::Instant;
pub use swarm::SwarmStore;
use tower_http::trace::TraceLayer;
use tracing::{Span, field};
use uuid::Uuid;
#[derive(Clone)]
pub struct AppState {
pub prometheus: PrometheusHandle,
pub auth: AuthConfig,
pub jobs: JobStore,
pub audit: AuditStore,
pub tenant_locks: TenantLocks,
pub http: reqwest::Client,
pub placement: PlacementStore,
pub fleet_services: Vec<FleetService>,
pub swarm: SwarmStore,
}
#[derive(Clone, Debug)]
pub struct RequestIds {
pub request_id: String,
pub correlation_id: Option<String>,
pub traceparent: Option<String>,
}
const HEADER_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
const HEADER_CORRELATION_ID: HeaderName = HeaderName::from_static("x-correlation-id");
const HEADER_TRACEPARENT: HeaderName = HeaderName::from_static("traceparent");
pub fn build_app(state: AppState) -> Router {
let trace = TraceLayer::new_for_http()
.make_span_with(|req: &Request<_>| {
let request_id = req
.headers()
.get(&HEADER_REQUEST_ID)
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.to_owned();
let correlation_id = req
.headers()
.get(&HEADER_CORRELATION_ID)
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.to_owned();
tracing::info_span!(
"http_request",
request.method = %req.method(),
request.path = %req.uri().path(),
request_id = %request_id,
correlation_id = %correlation_id,
trace_id = "",
status = field::Empty,
duration_ms = field::Empty,
)
})
.on_response(
|res: &Response, latency: std::time::Duration, span: &Span| {
span.record("status", field::display(res.status()));
span.record("duration_ms", field::display(latency.as_millis()));
tracing::info!("response");
},
);
let admin =
admin::admin_router().layer(from_fn_with_state(state.clone(), auth::auth_middleware));
Router::new()
.route("/health", get(health))
.route("/ready", get(ready))
.route("/metrics", get(metrics))
.nest("/admin/v1", admin)
.with_state(state)
.layer(trace)
.layer(from_fn(request_id_middleware))
}
async fn health() -> impl IntoResponse {
(StatusCode::OK, "ok")
}
async fn ready() -> impl IntoResponse {
(StatusCode::OK, "ready")
}
async fn metrics(State(state): State<AppState>) -> impl IntoResponse {
(StatusCode::OK, state.prometheus.render())
}
async fn request_id_middleware(mut req: Request<axum::body::Body>, next: Next) -> Response {
let request_id = req
.headers()
.get(&HEADER_REQUEST_ID)
.and_then(|v| v.to_str().ok())
.map(|s| s.to_owned())
.unwrap_or_else(|| Uuid::new_v4().to_string());
let correlation_id = req
.headers()
.get(&HEADER_CORRELATION_ID)
.and_then(|v| v.to_str().ok())
.map(|s| s.to_owned());
let traceparent = req
.headers()
.get(&HEADER_TRACEPARENT)
.and_then(|v| v.to_str().ok())
.map(|s| s.to_owned());
if req.headers().get(&HEADER_REQUEST_ID).is_none()
&& let Ok(v) = HeaderValue::from_str(&request_id)
{
req.headers_mut().insert(HEADER_REQUEST_ID.clone(), v);
}
req.extensions_mut().insert(RequestIds {
request_id: request_id.clone(),
correlation_id: correlation_id.clone(),
traceparent: traceparent.clone(),
});
let start = Instant::now();
let mut res = next.run(req).await;
if let Ok(v) = HeaderValue::from_str(&request_id) {
res.headers_mut().insert(HEADER_REQUEST_ID.clone(), v);
}
if let Some(correlation_id) = correlation_id
&& let Ok(v) = HeaderValue::from_str(&correlation_id)
{
res.headers_mut().insert(HEADER_CORRELATION_ID.clone(), v);
}
metrics::histogram!("http_request_duration_ms").record(start.elapsed().as_millis() as f64);
res
}
#[cfg(test)]
mod tests {
use super::*;
use crate::jobs::JobStatus;
use axum::{
body::Body,
http::{Request, StatusCode, header},
};
use jsonwebtoken::{EncodingKey, Header, encode};
use metrics_exporter_prometheus::PrometheusBuilder;
use serde::Serialize;
use std::fs;
use std::path::PathBuf;
use std::sync::OnceLock;
use tower::ServiceExt;
use uuid::Uuid;
static HANDLE: OnceLock<PrometheusHandle> = OnceLock::new();
#[derive(Serialize)]
struct TestClaims {
sub: String,
session_id: String,
permissions: Vec<String>,
exp: usize,
}
fn test_app() -> Router {
test_app_with_fleet(vec![])
}
fn test_app_with_fleet(fleet_services: Vec<FleetService>) -> Router {
let handle = HANDLE
.get_or_init(|| {
PrometheusBuilder::new()
.install_recorder()
.expect("failed to install prometheus recorder")
})
.clone();
let placement_path = temp_placement_file();
build_app(AppState {
prometheus: handle,
auth: AuthConfig {
hs256_secret: Some(b"test_secret".to_vec()),
},
jobs: JobStore::default(),
audit: AuditStore::default(),
tenant_locks: TenantLocks::default(),
http: reqwest::Client::new(),
placement: PlacementStore::new(placement_path),
fleet_services,
swarm: SwarmStore::new(repo_root().join("swarm/dev.json")),
})
}
fn repo_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.and_then(|p| p.parent())
.expect("api crate should live under repo root")
.to_path_buf()
}
fn temp_placement_file() -> PathBuf {
let root = repo_root();
let src = root.join("placement/dev.json");
let mut dst = std::env::temp_dir();
dst.push(format!(
"cloudlysis-control-placement-{}-{}.json",
std::process::id(),
Uuid::new_v4()
));
let raw = fs::read_to_string(src).expect("missing placement/dev.json");
fs::write(&dst, raw).expect("failed to write temp placement file");
dst
}
fn assert_send_sync<T: Send + Sync>() {}
#[test]
fn core_state_types_are_send_sync() {
assert_send_sync::<AppState>();
assert_send_sync::<JobStore>();
assert_send_sync::<AuthConfig>();
}
#[tokio::test]
async fn health_returns_200() {
let res = test_app()
.oneshot(
Request::builder()
.uri("/health")
.body(Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(res.status(), StatusCode::OK);
}
#[tokio::test]
async fn ready_returns_200() {
let res = test_app()
.oneshot(
Request::builder()
.uri("/ready")
.body(Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(res.status(), StatusCode::OK);
}
#[tokio::test]
async fn metrics_returns_200() {
let res = test_app()
.oneshot(
Request::builder()
.uri("/metrics")
.body(Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(res.status(), StatusCode::OK);
}
fn make_token(perms: &[&str]) -> String {
let exp = (std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_secs()
+ 60) as usize;
encode(
&Header::default(),
&TestClaims {
sub: "user_1".to_string(),
session_id: "sess_1".to_string(),
permissions: perms.iter().map(|p| (*p).to_string()).collect(),
exp,
},
&EncodingKey::from_secret(b"test_secret"),
)
.unwrap()
}
#[tokio::test]
async fn unauthorized_admin_calls_return_401() {
let res = test_app()
.oneshot(
Request::builder()
.uri("/admin/v1/platform/info")
.body(Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(res.status(), StatusCode::UNAUTHORIZED);
}
#[tokio::test]
async fn forbidden_admin_calls_return_403() {
let token = make_token(&["control:read"]);
let res = test_app()
.oneshot(
Request::builder()
.uri("/admin/v1/jobs/echo")
.method("POST")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.header("idempotency-key", "k1")
.body(Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(res.status(), StatusCode::FORBIDDEN);
}
#[tokio::test]
async fn tenant_scoped_endpoints_require_x_tenant_id() {
let token = make_token(&["control:read"]);
let res = test_app()
.oneshot(
Request::builder()
.uri("/admin/v1/tenants/echo")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.body(Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(res.status(), StatusCode::BAD_REQUEST);
}
#[tokio::test]
async fn job_create_is_idempotent() {
let token = make_token(&["control:write"]);
let app = test_app();
let res1 = app
.clone()
.oneshot(
Request::builder()
.uri("/admin/v1/jobs/echo")
.method("POST")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.header("idempotency-key", "same-key")
.body(Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(res1.status(), StatusCode::OK);
let body1 = axum::body::to_bytes(res1.into_body(), 1024 * 1024)
.await
.unwrap();
let v1: serde_json::Value = serde_json::from_slice(&body1).unwrap();
let id1 = Uuid::parse_str(v1.get("job_id").unwrap().as_str().unwrap()).unwrap();
let res2 = app
.clone()
.oneshot(
Request::builder()
.uri("/admin/v1/jobs/echo")
.method("POST")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.header("idempotency-key", "same-key")
.body(Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(res2.status(), StatusCode::OK);
let body2 = axum::body::to_bytes(res2.into_body(), 1024 * 1024)
.await
.unwrap();
let v2: serde_json::Value = serde_json::from_slice(&body2).unwrap();
let id2 = Uuid::parse_str(v2.get("job_id").unwrap().as_str().unwrap()).unwrap();
assert_eq!(id1, id2);
}
async fn wait_for_terminal_status(app: Router, job_id: Uuid) -> JobStatus {
let start = tokio::time::Instant::now();
loop {
let res = app
.clone()
.oneshot(
Request::builder()
.uri(format!("/admin/v1/jobs/{job_id}"))
.header(
header::AUTHORIZATION,
format!("Bearer {}", make_token(&["control:read"])),
)
.body(Body::empty())
.unwrap(),
)
.await
.unwrap();
if res.status() == StatusCode::OK {
let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
.await
.unwrap();
let job: crate::jobs::Job = serde_json::from_slice(&body).unwrap();
if job.status != JobStatus::Pending && job.status != JobStatus::Running {
return job.status;
}
}
if start.elapsed() > std::time::Duration::from_millis(500) {
return JobStatus::Failed;
}
tokio::time::sleep(std::time::Duration::from_millis(10)).await;
}
}
#[tokio::test]
async fn tenant_job_idempotency_does_not_duplicate_effects() {
let token = make_token(&["control:write", "control:read"]);
let app = test_app();
let tenant_id = Uuid::new_v4();
let body = serde_json::json!({
"tenant_id": tenant_id,
"reason": "test",
});
let res1 = app
.clone()
.oneshot(
Request::builder()
.uri("/admin/v1/jobs/tenant/drain")
.method("POST")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.header("idempotency-key", "same-key")
.header(header::CONTENT_TYPE, "application/json")
.body(Body::from(body.to_string()))
.unwrap(),
)
.await
.unwrap();
assert_eq!(res1.status(), StatusCode::OK);
let res2 = app
.clone()
.oneshot(
Request::builder()
.uri("/admin/v1/jobs/tenant/drain")
.method("POST")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.header("idempotency-key", "same-key")
.header(header::CONTENT_TYPE, "application/json")
.body(Body::from(body.to_string()))
.unwrap(),
)
.await
.unwrap();
assert_eq!(res2.status(), StatusCode::OK);
let b1 = axum::body::to_bytes(res1.into_body(), 1024 * 1024)
.await
.unwrap();
let b2 = axum::body::to_bytes(res2.into_body(), 1024 * 1024)
.await
.unwrap();
let v1: serde_json::Value = serde_json::from_slice(&b1).unwrap();
let v2: serde_json::Value = serde_json::from_slice(&b2).unwrap();
assert_eq!(v1.get("job_id"), v2.get("job_id"));
}
#[tokio::test]
async fn tenant_lock_prevents_concurrent_mutations() {
let token = make_token(&["control:write", "control:read"]);
let app = test_app();
let tenant_id = Uuid::new_v4();
let res1 = app
.clone()
.oneshot(
Request::builder()
.uri("/admin/v1/jobs/tenant/drain")
.method("POST")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.header("idempotency-key", "k1")
.header(header::CONTENT_TYPE, "application/json")
.body(Body::from(
serde_json::json!({ "tenant_id": tenant_id, "reason": "r" }).to_string(),
))
.unwrap(),
)
.await
.unwrap();
assert_eq!(res1.status(), StatusCode::OK);
let res2 = app
.clone()
.oneshot(
Request::builder()
.uri("/admin/v1/jobs/tenant/migrate")
.method("POST")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.header("idempotency-key", "k2")
.header(header::CONTENT_TYPE, "application/json")
.body(Body::from(
serde_json::json!({
"tenant_id": tenant_id,
"runner_target": "node-2",
"reason": "r2"
})
.to_string(),
))
.unwrap(),
)
.await
.unwrap();
assert_eq!(res2.status(), StatusCode::CONFLICT);
}
#[tokio::test]
async fn migrate_preflight_fails_when_fleet_not_ready() {
let token = make_token(&["control:write", "control:read"]);
let app = test_app_with_fleet(vec![FleetService {
name: "unreachable".to_string(),
base_url: "http://127.0.0.1:1".to_string(),
}]);
let tenant_id = Uuid::new_v4();
let res = app
.clone()
.oneshot(
Request::builder()
.uri("/admin/v1/jobs/tenant/migrate")
.method("POST")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.header("idempotency-key", "k3")
.header(header::CONTENT_TYPE, "application/json")
.body(Body::from(
serde_json::json!({
"tenant_id": tenant_id,
"runner_target": "node-2",
"reason": "r"
})
.to_string(),
))
.unwrap(),
)
.await
.unwrap();
assert_eq!(res.status(), StatusCode::OK);
let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
.await
.unwrap();
let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
let job_id = Uuid::parse_str(v.get("job_id").unwrap().as_str().unwrap()).unwrap();
let status = wait_for_terminal_status(app, job_id).await;
assert_eq!(status, JobStatus::Failed);
}
#[tokio::test]
async fn cancel_marks_job_cancelled() {
let token = make_token(&["control:write", "control:read"]);
let app = test_app();
let tenant_id = Uuid::new_v4();
let res = app
.clone()
.oneshot(
Request::builder()
.uri("/admin/v1/jobs/tenant/migrate")
.method("POST")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.header("idempotency-key", "k4")
.header(header::CONTENT_TYPE, "application/json")
.body(Body::from(
serde_json::json!({
"tenant_id": tenant_id,
"runner_target": "node-2",
"reason": "r"
})
.to_string(),
))
.unwrap(),
)
.await
.unwrap();
assert_eq!(res.status(), StatusCode::OK);
let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
.await
.unwrap();
let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
let job_id = Uuid::parse_str(v.get("job_id").unwrap().as_str().unwrap()).unwrap();
let res = app
.clone()
.oneshot(
Request::builder()
.uri(format!("/admin/v1/jobs/{job_id}/cancel"))
.method("POST")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.body(Body::empty())
.unwrap(),
)
.await
.unwrap();
assert_eq!(res.status(), StatusCode::OK);
let status = wait_for_terminal_status(app, job_id).await;
assert_eq!(status, JobStatus::Cancelled);
}
#[tokio::test]
async fn migration_plan_is_deterministic() {
let token = make_token(&["control:write"]);
let app = test_app();
let tenant_id = Uuid::new_v4();
let res = app
.oneshot(
Request::builder()
.uri("/admin/v1/plan/tenant/migrate")
.method("POST")
.header(header::AUTHORIZATION, format!("Bearer {token}"))
.header(header::CONTENT_TYPE, "application/json")
.body(Body::from(
serde_json::json!({
"tenant_id": tenant_id,
"runner_target": "node-2",
"reason": "r"
})
.to_string(),
))
.unwrap(),
)
.await
.unwrap();
assert_eq!(res.status(), StatusCode::OK);
let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
.await
.unwrap();
let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
assert_eq!(
v.get("steps").unwrap(),
&serde_json::json!(["preflight", "drain", "update_placement", "reload", "verify"])
);
}
}

control/api/src/main.rs Normal file

@@ -0,0 +1,109 @@
use clap::Parser;
use metrics_exporter_prometheus::PrometheusBuilder;
use std::net::SocketAddr;
use tracing_subscriber::EnvFilter;
#[derive(Parser, Debug)]
#[command(name = "control-api")]
struct Args {
#[arg(long, env = "CONTROL_API_ADDR", default_value = "127.0.0.1:8080")]
addr: SocketAddr,
}
#[tokio::main]
async fn main() {
let args = Args::parse();
tracing_subscriber::fmt()
.with_env_filter(
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
)
.init();
let recorder = PrometheusBuilder::new()
.set_buckets(&[
1.0, 2.5, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, 2500.0, 5000.0,
])
.expect("invalid prometheus buckets")
.install_recorder()
.expect("failed to install prometheus recorder");
let http = reqwest::Client::builder()
.user_agent("cloudlysis-control-api")
.build()
.expect("failed to build http client");
let placement_path = std::env::var("CONTROL_PLACEMENT_PATH")
.ok()
.unwrap_or_else(|| "placement/dev.json".to_string())
.into();
let swarm_path = std::env::var("CONTROL_SWARM_STATE_PATH")
.ok()
.unwrap_or_else(|| "swarm/dev.json".to_string())
.into();
let self_url = std::env::var("CONTROL_SELF_URL")
.ok()
.unwrap_or_else(|| "http://127.0.0.1:8080".to_string());
let mut fleet_services = vec![api::FleetService {
name: "control-api".to_string(),
base_url: self_url,
}];
if let Ok(spec) = std::env::var("CONTROL_FLEET_SERVICES") {
fleet_services.extend(parse_fleet_services(&spec));
}
let app = api::build_app(api::AppState {
prometheus: recorder,
auth: api::AuthConfig {
hs256_secret: std::env::var("CONTROL_GATEWAY_JWT_HS256_SECRET")
.ok()
.map(|s| s.into_bytes()),
},
jobs: api::JobStore::default(),
audit: api::AuditStore::default(),
tenant_locks: api::TenantLocks::default(),
http,
placement: api::PlacementStore::new(placement_path),
fleet_services,
swarm: api::SwarmStore::new(swarm_path),
});
let listener = tokio::net::TcpListener::bind(args.addr)
.await
.expect("failed to bind");
tracing::info!(addr = %args.addr, "control api listening");
axum::serve(listener, app)
.with_graceful_shutdown(shutdown_signal())
.await
.expect("server failed");
}
async fn shutdown_signal() {
let _ = tokio::signal::ctrl_c().await;
}
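// CONTROL_FLEET_SERVICES is a comma-separated list of name=url pairs, e.g.
// "gateway=http://127.0.0.1:8081,runner=http://127.0.0.1:8082"; malformed or
// empty pairs are skipped.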
fn parse_fleet_services(spec: &str) -> Vec<api::FleetService> {
spec.split(',')
.filter_map(|pair| {
let pair = pair.trim();
if pair.is_empty() {
return None;
}
let (name, url) = pair.split_once('=')?;
let name = name.trim();
let url = url.trim();
if name.is_empty() || url.is_empty() {
return None;
}
Some(api::FleetService {
name: name.to_string(),
base_url: url.to_string(),
})
})
.collect()
}

control/api/src/placement.rs Normal file

@@ -0,0 +1,227 @@
use serde::{Deserialize, Serialize};
use std::{
collections::BTreeMap,
fs,
path::{Path, PathBuf},
sync::{Arc, RwLock},
time::SystemTime,
};
use uuid::Uuid;
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ServiceKind {
Aggregate,
Projection,
Runner,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PlacementFile {
pub revision: Option<String>,
pub aggregate_placement: Option<PlacementKind>,
pub projection_placement: Option<PlacementKind>,
pub runner_placement: Option<PlacementKind>,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PlacementKind {
pub placements: Vec<TenantPlacement>,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TenantPlacement {
pub tenant_id: Uuid,
pub targets: Vec<String>,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PlacementResponse {
pub kind: ServiceKind,
pub revision: String,
pub placements: Vec<TenantPlacement>,
}
impl PlacementFile {
pub fn load(path: &Path) -> Option<Self> {
let raw = fs::read_to_string(path).ok()?;
serde_json::from_str(&raw).ok()
}
pub fn for_kind(&self, kind: ServiceKind) -> PlacementResponse {
let revision = self.revision.clone().unwrap_or_else(|| "dev".to_string());
let placements = match kind {
ServiceKind::Aggregate => self
.aggregate_placement
.as_ref()
.map(|p| p.placements.clone())
.unwrap_or_default(),
ServiceKind::Projection => self
.projection_placement
.as_ref()
.map(|p| p.placements.clone())
.unwrap_or_default(),
ServiceKind::Runner => self
.runner_placement
.as_ref()
.map(|p| p.placements.clone())
.unwrap_or_default(),
};
PlacementResponse {
kind,
revision,
placements,
}
}
}
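// Placement state lives in a JSON file: reads re-load it when the file's mtime
// changes (or on first access), and updates are written to a temp file and then
// renamed into place.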
#[derive(Clone)]
pub struct PlacementStore {
inner: Arc<RwLock<Inner>>,
}
struct Inner {
path: PathBuf,
last_modified: Option<SystemTime>,
cached: Option<PlacementFile>,
}
impl PlacementStore {
pub fn new(path: PathBuf) -> Self {
Self {
inner: Arc::new(RwLock::new(Inner {
path,
last_modified: None,
cached: None,
})),
}
}
pub fn get_for_kind(&self, kind: ServiceKind) -> PlacementResponse {
let mut inner = self.inner.write().expect("placement lock poisoned");
inner.reload_if_changed();
match inner.cached.as_ref() {
Some(p) => p.for_kind(kind),
None => PlacementResponse {
kind,
revision: "dev".to_string(),
placements: vec![],
},
}
}
pub fn tenant_summaries(&self) -> Vec<TenantSummary> {
let mut inner = self.inner.write().expect("placement lock poisoned");
inner.reload_if_changed();
let Some(p) = inner.cached.as_ref() else {
return vec![];
};
let mut map: BTreeMap<Uuid, TenantSummary> = BTreeMap::new();
for (kind, placements) in [
(
ServiceKind::Aggregate,
p.for_kind(ServiceKind::Aggregate).placements,
),
(
ServiceKind::Projection,
p.for_kind(ServiceKind::Projection).placements,
),
(
ServiceKind::Runner,
p.for_kind(ServiceKind::Runner).placements,
),
] {
for tp in placements {
let entry = map.entry(tp.tenant_id).or_insert_with(|| TenantSummary {
tenant_id: tp.tenant_id,
aggregate_targets: vec![],
projection_targets: vec![],
runner_targets: vec![],
});
match kind {
ServiceKind::Aggregate => entry.aggregate_targets = tp.targets,
ServiceKind::Projection => entry.projection_targets = tp.targets,
ServiceKind::Runner => entry.runner_targets = tp.targets,
}
}
}
map.into_values().collect()
}
pub fn update_runner_target(
&self,
tenant_id: Uuid,
runner_target: String,
) -> Result<String, String> {
let mut inner = self.inner.write().expect("placement lock poisoned");
inner.reload_if_changed();
let mut file = inner.cached.clone().unwrap_or(PlacementFile {
revision: Some("dev".to_string()),
aggregate_placement: Some(PlacementKind { placements: vec![] }),
projection_placement: Some(PlacementKind { placements: vec![] }),
runner_placement: Some(PlacementKind { placements: vec![] }),
});
let mut runner = file
.runner_placement
.take()
.unwrap_or(PlacementKind { placements: vec![] });
if let Some(existing) = runner
.placements
.iter_mut()
.find(|p| p.tenant_id == tenant_id)
{
existing.targets = vec![runner_target];
} else {
runner.placements.push(TenantPlacement {
tenant_id,
targets: vec![runner_target],
});
}
runner.placements.sort_by_key(|p| p.tenant_id);
file.runner_placement = Some(runner);
let revision = format!("rev-{}", Uuid::new_v4());
file.revision = Some(revision.clone());
let raw = serde_json::to_string_pretty(&file).map_err(|e| e.to_string())?;
let tmp = inner.path.with_extension("json.tmp");
fs::write(&tmp, raw).map_err(|e| e.to_string())?;
fs::rename(&tmp, &inner.path).map_err(|e| e.to_string())?;
inner.last_modified = None;
inner.cached = Some(file);
Ok(revision)
}
}
impl Inner {
fn reload_if_changed(&mut self) {
let meta = fs::metadata(&self.path).ok();
let modified = meta.and_then(|m| m.modified().ok());
if self.cached.is_some() && modified.is_some() && modified == self.last_modified {
return;
}
self.last_modified = modified;
self.cached = PlacementFile::load(&self.path);
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TenantSummary {
pub tenant_id: Uuid,
pub aggregate_targets: Vec<String>,
pub projection_targets: Vec<String>,
pub runner_targets: Vec<String>,
}
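For reference, a small example of the JSON shape PlacementFile::load expects; the tenant id and node names below are placeholders.

// Hypothetical placement/dev.json contents; keys follow the serde field names above.
const EXAMPLE_PLACEMENT_JSON: &str = r#"{
  "revision": "dev",
  "aggregate_placement": { "placements": [ { "tenant_id": "11111111-1111-1111-1111-111111111111", "targets": ["agg-node-1"] } ] },
  "projection_placement": { "placements": [] },
  "runner_placement": { "placements": [ { "tenant_id": "11111111-1111-1111-1111-111111111111", "targets": ["node-1"] } ] }
}"#;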

control/api/src/swarm.rs Normal file

@@ -0,0 +1,62 @@
use serde::{Deserialize, Serialize};
use std::{fs, path::Path};
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmService {
pub name: String,
pub image: Option<String>,
pub mode: Option<String>,
pub replicas: Option<String>,
pub updated_at: Option<String>,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmTask {
pub id: String,
pub service: String,
pub node: Option<String>,
pub desired_state: Option<String>,
pub current_state: Option<String>,
pub error: Option<String>,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmStateFile {
pub services: Vec<SwarmService>,
pub tasks: Vec<SwarmTask>,
}
#[derive(Clone)]
pub struct SwarmStore {
path: std::path::PathBuf,
}
impl SwarmStore {
pub fn new(path: std::path::PathBuf) -> Self {
Self { path }
}
pub fn list_services(&self) -> Vec<SwarmService> {
self.load().map(|s| s.services).unwrap_or_default()
}
pub fn list_tasks(&self, service_name: &str) -> Vec<SwarmTask> {
self.load()
.map(|s| {
s.tasks
.into_iter()
.filter(|t| t.service == service_name)
.collect()
})
.unwrap_or_default()
}
fn load(&self) -> Option<SwarmStateFile> {
load_state(&self.path)
}
}
fn load_state(path: &Path) -> Option<SwarmStateFile> {
let raw = fs::read_to_string(path).ok()?;
serde_json::from_str(&raw).ok()
}
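Similarly, a sketch of the swarm state file SwarmStore reads (swarm/dev.json in the test setup); service and node names are placeholders.

// Hypothetical swarm state; optional fields may be null.
const EXAMPLE_SWARM_STATE_JSON: &str = r#"{
  "services": [ { "name": "gateway", "image": "gateway:dev", "mode": "replicated", "replicas": "1/1", "updated_at": null } ],
  "tasks": [ { "id": "task-1", "service": "gateway", "node": "node-1", "desired_state": "running", "current_state": "running", "error": null } ]
}"#;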


@@ -0,0 +1,16 @@
#[test]
fn annotation_writer_produces_expected_grafana_payload() {
let a = api::build_grafana_deploy_annotation(api::DeployAnnotationArgs {
service: "gateway",
version: Some("1.2.3"),
git_sha: Some("abc123"),
time_ms: 1234567890,
});
assert_eq!(a.time, 1234567890);
assert!(a.tags.iter().any(|t| t == "deploy"));
assert!(a.tags.iter().any(|t| t == "service:gateway"));
assert!(a.tags.iter().any(|t| t == "version:1.2.3"));
assert!(a.tags.iter().any(|t| t == "git_sha:abc123"));
assert!(a.text.contains("deploy gateway"));
}


@@ -0,0 +1,39 @@
#[test]
fn build_info_parser_extracts_expected_labels() {
let metrics = r#"
# HELP gateway_build_info build info
# TYPE gateway_build_info gauge
gateway_build_info{service="gateway",version="1.2.3",git_sha="abc"} 1
runner_build_info{service="runner",version="2.0.0",git_sha="def"} 1
unrelated_metric 5
"#;
let info = api::extract_build_info(metrics);
assert_eq!(info.len(), 2);
assert!(
info.iter()
.any(|i| i.service == "gateway" && i.version == "1.2.3" && i.git_sha == "abc")
);
assert!(
info.iter()
.any(|i| i.service == "runner" && i.version == "2.0.0" && i.git_sha == "def")
);
}
#[test]
fn build_info_snapshot_has_required_services() {
let metrics = r#"
gateway_build_info{service="gateway",version="1.2.3",git_sha="abc"} 1
aggregate_build_info{service="aggregate",version="1.0.0",git_sha="aaa"} 1
projection_build_info{service="projection",version="1.0.0",git_sha="bbb"} 1
runner_build_info{service="runner",version="2.0.0",git_sha="ccc"} 1
"#;
let info = api::extract_build_info(metrics);
for required in ["gateway", "aggregate", "projection", "runner"] {
assert!(
info.iter().any(|i| i.service == required),
"missing build_info for service={required}"
);
}
}


@@ -0,0 +1,55 @@
use std::{fs, path::PathBuf, time::Duration};
fn repo_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.and_then(|p| p.parent())
.expect("api crate should live under repo root")
.to_path_buf()
}
#[test]
fn docker_compose_files_parse_and_include_required_services() {
let root = repo_root();
let compose = fs::read_to_string(root.join("observability/docker-compose.yml")).unwrap();
let v: serde_yaml::Value = serde_yaml::from_str(&compose).unwrap();
let services = v
.get("services")
.and_then(|x| x.as_mapping())
.expect("missing services");
for required in ["grafana", "victoria-metrics", "vmagent", "loki", "tempo"] {
assert!(
services.contains_key(serde_yaml::Value::String(required.to_string())),
"missing service {required}"
);
}
}
#[tokio::test]
#[ignore]
async fn docker_compose_config_validation_is_gated_and_fast() {
let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
assert_eq!(enabled.as_deref(), Some("1"));
let root = repo_root();
let compose = root.join("observability/docker-compose.yml");
let mut docker = tokio::process::Command::new("docker");
docker.args(["compose", "-f"]).arg(compose).args(["config"]);
let cmd = docker.output();
let out = tokio::time::timeout(Duration::from_secs(10), cmd)
.await
.expect("docker compose config timed out")
.expect("failed to run docker compose config");
assert!(
out.status.success(),
"docker compose config failed: {}",
String::from_utf8_lossy(&out.stderr)
);
}


@@ -0,0 +1,6 @@
#[test]
#[ignore]
fn docker_integration_tests_are_gated() {
let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
assert_eq!(enabled.as_deref(), Some("1"));
}


@@ -0,0 +1,183 @@
use jsonwebtoken::{EncodingKey, Header, encode};
use serde::Serialize;
use std::{fs, net::TcpListener, time::Duration};
#[derive(Serialize)]
struct Claims {
sub: String,
session_id: String,
permissions: Vec<String>,
exp: usize,
}
fn free_port() -> u16 {
TcpListener::bind("127.0.0.1:0")
.unwrap()
.local_addr()
.unwrap()
.port()
}
fn token(secret: &[u8], perms: &[&str]) -> String {
let exp = (std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_secs()
+ 60) as usize;
encode(
&Header::default(),
&Claims {
sub: "op_1".to_string(),
session_id: "sess_1".to_string(),
permissions: perms.iter().map(|p| (*p).to_string()).collect(),
exp,
},
&EncodingKey::from_secret(secret),
)
.unwrap()
}
async fn wait_ready(url: &str) {
let client = reqwest::Client::new();
let start = tokio::time::Instant::now();
loop {
let ok = client
.get(format!("{url}/ready"))
.send()
.await
.map(|r| r.status().is_success())
.unwrap_or(false);
if ok {
return;
}
if start.elapsed() > Duration::from_secs(10) {
panic!("control-api did not become ready");
}
tokio::time::sleep(Duration::from_millis(100)).await;
}
}
#[tokio::test]
#[ignore]
async fn control_plane_can_see_the_fleet_via_docker_stubs() {
let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
assert_eq!(enabled.as_deref(), Some("1"));
let nginx_conf = r#"
server {
listen 80;
server_name _;
location = /health { return 200 "ok\n"; }
location = /ready { return 200 "ready\n"; }
location = /metrics { return 200 "stub_build_info{service=\"stub\",version=\"dev\",git_sha=\"000\"} 1\n"; }
}
"#;
let mut conf_path = std::env::temp_dir();
conf_path.push(format!(
"cloudlysis-control-nginx-{}.conf",
uuid::Uuid::new_v4()
));
fs::write(&conf_path, nginx_conf).unwrap();
let gateway_port = free_port();
let runner_port = free_port();
let aggregate_port = free_port();
let projection_port = free_port();
async fn run_stub(name: &str, port: u16, conf: &std::path::Path) -> String {
let out = tokio::process::Command::new("docker")
.args(["run", "-d", "--rm"])
.args(["-p", &format!("{port}:80")])
.args([
"-v",
&format!("{}:/etc/nginx/conf.d/default.conf:ro", conf.display()),
])
.arg("nginx:1.29-alpine")
.output()
.await
.expect("failed to run docker");
assert!(
out.status.success(),
"{name} stub failed: {}",
String::from_utf8_lossy(&out.stderr)
);
String::from_utf8_lossy(&out.stdout).trim().to_string()
}
let gateway_id = run_stub("gateway", gateway_port, &conf_path).await;
let runner_id = run_stub("runner", runner_port, &conf_path).await;
let aggregate_id = run_stub("aggregate", aggregate_port, &conf_path).await;
let projection_id = run_stub("projection", projection_port, &conf_path).await;
let secret = b"e2e_secret";
let api_port = free_port();
let api_url = format!("http://127.0.0.1:{api_port}");
let mut placement_path = std::env::temp_dir();
placement_path.push(format!(
"cloudlysis-control-placement-{}.json",
uuid::Uuid::new_v4()
));
fs::write(
&placement_path,
r#"{"revision":"e2e","aggregate_placement":{"placements":[]},"projection_placement":{"placements":[]},"runner_placement":{"placements":[]}}"#,
)
.unwrap();
let mut child = tokio::process::Command::new(env!("CARGO_BIN_EXE_api"))
.env("CONTROL_API_ADDR", format!("127.0.0.1:{api_port}"))
.env("CONTROL_GATEWAY_JWT_HS256_SECRET", "e2e_secret")
.env("CONTROL_PLACEMENT_PATH", placement_path.to_string_lossy().to_string())
.env(
"CONTROL_FLEET_SERVICES",
format!(
"gateway=http://127.0.0.1:{gateway_port},aggregate=http://127.0.0.1:{aggregate_port},projection=http://127.0.0.1:{projection_port},runner=http://127.0.0.1:{runner_port}"
),
)
.spawn()
.expect("failed to spawn control-api");
wait_ready(&api_url).await;
let client = reqwest::Client::new();
let t = token(secret, &["control:read"]);
let res = client
.get(format!("{api_url}/admin/v1/fleet/snapshot"))
.header(reqwest::header::AUTHORIZATION, format!("Bearer {t}"))
.send()
.await
.unwrap();
assert!(res.status().is_success());
let v: serde_json::Value = res.json().await.unwrap();
let services = v.get("services").and_then(|x| x.as_array()).unwrap();
assert!(
services.len() >= 5,
"expected at least 5 services (including control-api), got {}",
services.len()
);
let res = client
.get(format!("{api_url}/admin/v1/tenants"))
.header(reqwest::header::AUTHORIZATION, format!("Bearer {t}"))
.send()
.await
.unwrap();
assert!(res.status().is_success());
let _ = child.kill().await;
for id in [gateway_id, runner_id, aggregate_id, projection_id] {
let _ = tokio::process::Command::new("docker")
.args(["stop", &id])
.output()
.await;
}
let _ = fs::remove_file(&conf_path);
let _ = fs::remove_file(&placement_path);
}

View File

@@ -0,0 +1,30 @@
#[test]
fn fleet_services_env_parser_is_lenient() {
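// Documents the expected CONTROL_FLEET_SERVICES format ("name=url,name=url") with an
// inline reimplementation of the lenient parsing; it presumably mirrors the crate's
// own env handling, so keep the two in sync if that changes.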
fn parse(spec: &str) -> Vec<api::FleetService> {
spec.split(',')
.filter_map(|pair| {
let pair = pair.trim();
if pair.is_empty() {
return None;
}
let (name, url) = pair.split_once('=')?;
let name = name.trim();
let url = url.trim();
if name.is_empty() || url.is_empty() {
return None;
}
Some(api::FleetService {
name: name.to_string(),
base_url: url.to_string(),
})
})
.collect()
}
let services = parse(" gateway=http://x , ,runner=http://y,broken, =http://z ");
assert_eq!(services.len(), 2);
assert_eq!(services[0].name, "gateway");
assert_eq!(services[1].name, "runner");
}

View File

@@ -0,0 +1,23 @@
use std::time::Duration;
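// Opt-in connectivity probe: export CONTROL_TEST_NATS_URL (for example
// nats://127.0.0.1:4222) and run with --ignored; the test only checks that a
// TCP connection succeeds within two seconds, it does not speak the NATS protocol.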
#[tokio::test]
#[ignore]
async fn nats_integration_tests_are_gated_and_fast_fail() {
let url = std::env::var("CONTROL_TEST_NATS_URL").expect("CONTROL_TEST_NATS_URL is required");
let without_scheme = url.strip_prefix("nats://").unwrap_or(url.as_str());
let hostport = without_scheme.split('/').next().unwrap_or(without_scheme);
let mut parts = hostport.split(':');
let host = parts.next().unwrap_or("127.0.0.1");
let port: u16 = parts
.next()
.unwrap_or("4222")
.parse()
.expect("invalid port in CONTROL_TEST_NATS_URL");
let connect = tokio::net::TcpStream::connect((host, port));
tokio::time::timeout(Duration::from_secs(2), connect)
.await
.expect("tcp connect to NATS timed out")
.expect("failed to connect to NATS");
}

View File

@@ -0,0 +1,75 @@
use std::{collections::BTreeSet, fs, path::PathBuf};
fn repo_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.and_then(|p| p.parent())
.expect("api crate should live under repo root")
.to_path_buf()
}
#[test]
fn grafana_provisioning_files_are_syntactically_valid() {
let root = repo_root();
let datasources = fs::read_to_string(
root.join("observability/grafana/provisioning/datasources/datasources.yml"),
)
.expect("missing grafana datasources provisioning file");
let dashboards = fs::read_to_string(
root.join("observability/grafana/provisioning/dashboards/dashboards.yml"),
)
.expect("missing grafana dashboards provisioning file");
let _datasources_yaml: serde_yaml::Value =
serde_yaml::from_str(&datasources).expect("invalid grafana datasources yaml");
let _dashboards_yaml: serde_yaml::Value =
serde_yaml::from_str(&dashboards).expect("invalid grafana dashboards yaml");
}
#[test]
fn grafana_dashboards_are_syntactically_valid_json() {
let root = repo_root();
let dashboards_dir = root.join("observability/grafana/dashboards");
let mut found = 0usize;
for entry in fs::read_dir(&dashboards_dir).expect("missing dashboards dir") {
let entry = entry.expect("failed to read dashboards dir entry");
let path = entry.path();
if path.extension().and_then(|e| e.to_str()) != Some("json") {
continue;
}
found += 1;
let raw = fs::read_to_string(&path).expect("failed to read dashboard json");
let _: serde_json::Value =
serde_json::from_str(&raw).unwrap_or_else(|e| panic!("{path:?}: {e}"));
}
assert!(found > 0, "expected at least one dashboard json file");
}
#[test]
fn vmagent_config_parses_and_includes_required_jobs() {
let root = repo_root();
let scrape = fs::read_to_string(root.join("observability/vmagent/scrape.yml"))
.expect("missing vmagent scrape config");
let value: serde_yaml::Value =
serde_yaml::from_str(&scrape).expect("invalid vmagent scrape yaml");
let mut job_names = BTreeSet::<String>::new();
if let Some(scrape_configs) = value.get("scrape_configs").and_then(|v| v.as_sequence()) {
for cfg in scrape_configs {
if let Some(job) = cfg.get("job_name").and_then(|v| v.as_str()) {
job_names.insert(job.to_string());
}
}
}
for required in ["victoria-metrics", "vmagent", "control-api"] {
assert!(
job_names.contains(required),
"vmagent scrape config missing required job_name={required}"
);
}
}

View File

@@ -0,0 +1,61 @@
use std::{
net::TcpStream,
path::PathBuf,
process::Command,
time::{Duration, Instant},
};
fn repo_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.and_then(|p| p.parent())
.expect("api crate should live under repo root")
.to_path_buf()
}
fn wait_for_tcp(addr: &str, timeout: Duration) -> bool {
let start = Instant::now();
while start.elapsed() < timeout {
if TcpStream::connect_timeout(
&addr.parse().expect("invalid socket addr"),
Duration::from_secs(1),
)
.is_ok()
{
return true;
}
std::thread::sleep(Duration::from_millis(250));
}
false
}
#[test]
#[ignore]
fn observability_stack_reaches_healthy_state_fast() {
let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
assert_eq!(enabled.as_deref(), Some("1"));
let root = repo_root();
let compose = root.join("observability/docker-compose.yml");
let up = Command::new("docker")
.args(["compose", "-f"])
.arg(&compose)
.args(["up", "-d"])
.status()
.expect("failed to run docker compose up");
assert!(up.success(), "docker compose up failed");
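// The ports below assume the compose file publishes the upstream defaults:
// Grafana 3000, VictoriaMetrics 8428, Loki 3100, Tempo 3200.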
let ok = wait_for_tcp("127.0.0.1:3000", Duration::from_secs(30))
&& wait_for_tcp("127.0.0.1:8428", Duration::from_secs(30))
&& wait_for_tcp("127.0.0.1:3100", Duration::from_secs(30))
&& wait_for_tcp("127.0.0.1:3200", Duration::from_secs(30));
let _ = Command::new("docker")
.args(["compose", "-f"])
.arg(&compose)
.args(["down", "-v"])
.status();
assert!(ok, "observability stack did not become reachable in time");
}

View File

@@ -0,0 +1,43 @@
use std::{fs, path::PathBuf, thread, time::Duration};
use api::PlacementStore;
fn tmp_file(name: &str) -> PathBuf {
let mut p = std::env::temp_dir();
p.push(format!(
"cloudlysis-control-{name}-{}-{}.json",
std::process::id(),
std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_nanos()
));
p
}
#[test]
fn placement_store_hot_reload_swaps_atomically() {
let path = tmp_file("placement");
fs::write(
&path,
r#"{"revision":"r1","aggregate_placement":{"placements":[]},"projection_placement":{"placements":[]},"runner_placement":{"placements":[]}}"#,
)
.unwrap();
let store = PlacementStore::new(path.clone());
let a1 = store.get_for_kind(api::ServiceKind::Aggregate);
assert_eq!(a1.revision, "r1");
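// The short sleep gives the rewritten file a later timestamp, assuming the
// store decides whether to reload based on file modification time.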
thread::sleep(Duration::from_millis(5));
fs::write(
&path,
r#"{"revision":"r2","aggregate_placement":{"placements":[]},"projection_placement":{"placements":[]},"runner_placement":{"placements":[]}}"#,
)
.unwrap();
let a2 = store.get_for_kind(api::ServiceKind::Aggregate);
assert_eq!(a2.revision, "r2");
let _ = fs::remove_file(&path);
}

View File

@@ -0,0 +1,31 @@
use std::{fs, path::PathBuf};
#[test]
fn swarm_store_is_deterministic_from_file() {
let mut path = std::env::temp_dir();
path.push(format!(
"cloudlysis-control-swarm-{}-{}.json",
std::process::id(),
uuid::Uuid::new_v4()
));
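// Fixture written in the JSON shape the store is expected to deserialize into its
// service and task listings (one replicated service with a single running task).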
fs::write(
&path,
r#"{"services":[{"name":"gateway","image":"x","mode":"replicated","replicas":"1/1","updated_at":null}],"tasks":[{"id":"t1","service":"gateway","node":"n1","desired_state":"running","current_state":"running","error":null}]}"#,
)
.unwrap();
let store = api::SwarmStore::new(PathBuf::from(&path));
let services = store.list_services();
assert_eq!(services.len(), 1);
assert_eq!(services[0].name, "gateway");
let tasks = store.list_tasks("gateway");
assert_eq!(tasks.len(), 1);
assert_eq!(tasks[0].id, "t1");
let none = store.list_tasks("missing");
assert_eq!(none.len(), 0);
let _ = fs::remove_file(&path);
}

View File

@@ -0,0 +1,42 @@
use std::time::Duration;
#[tokio::test]
#[ignore]
async fn docker_swarm_smoke_test_is_gated_and_times_out() {
let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
assert_eq!(enabled.as_deref(), Some("1"));
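// Also requires the local Docker engine to already be a swarm manager
// (e.g. via `docker swarm init`); otherwise `docker stack deploy` fails.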
let stack = "cloudlysis_control_test";
let compose = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.and_then(|p| p.parent())
.unwrap()
.join("swarm/stacks/control-plane.yml");
let deploy = tokio::process::Command::new("docker")
.args(["stack", "deploy", "-c"])
.arg(&compose)
.arg(stack)
.output();
let out = tokio::time::timeout(Duration::from_secs(30), deploy)
.await
.expect("docker stack deploy timed out")
.expect("failed to run docker stack deploy");
assert!(
out.status.success(),
"docker stack deploy failed: {}",
String::from_utf8_lossy(&out.stderr)
);
let ls = tokio::process::Command::new("docker")
.args(["service", "ls"])
.output();
let _ = tokio::time::timeout(Duration::from_secs(10), ls).await;
let rm = tokio::process::Command::new("docker")
.args(["stack", "rm"])
.arg(stack)
.output();
let _ = tokio::time::timeout(Duration::from_secs(10), rm).await;
}

View File

@@ -0,0 +1,40 @@
use std::{fs, path::PathBuf};
fn repo_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.and_then(|p| p.parent())
.expect("api crate should live under repo root")
.to_path_buf()
}
#[test]
fn stack_files_parse_as_yaml() {
let root = repo_root();
for file in [
root.join("swarm/stacks/control-plane.yml"),
root.join("swarm/stacks/observability.yml"),
] {
let raw = fs::read_to_string(&file).unwrap();
let _: serde_yaml::Value = serde_yaml::from_str(&raw).unwrap();
}
}
#[test]
fn control_plane_stack_has_required_services() {
let root = repo_root();
let raw = fs::read_to_string(root.join("swarm/stacks/control-plane.yml")).unwrap();
let v: serde_yaml::Value = serde_yaml::from_str(&raw).unwrap();
let services = v
.get("services")
.and_then(|x| x.as_mapping())
.expect("missing services");
for required in ["control-api", "control-ui"] {
assert!(
services.contains_key(serde_yaml::Value::String(required.to_string())),
"missing service {required}"
);
}
}