Monorepo consolidation: workspace, shared types, transport plans, docker/swarm assets
This commit is contained in:
25
control/api/Cargo.toml
Normal file
25
control/api/Cargo.toml
Normal file
@@ -0,0 +1,25 @@
|
||||
[package]
|
||||
name = "api"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
publish = ["madapes"]
|
||||
|
||||
[dependencies]
|
||||
axum = "0.8.6"
|
||||
clap = { version = "4.5.48", features = ["derive", "env"] }
|
||||
jsonwebtoken = "9.3.1"
|
||||
metrics = "0.23.0"
|
||||
metrics-exporter-prometheus = "0.16.0"
|
||||
reqwest = { version = "0.12.23", default-features = false, features = ["json", "rustls-tls"] }
|
||||
serde = { version = "1.0.228", features = ["derive"] }
|
||||
serde_json = "1.0.149"
|
||||
thiserror = "2.0.16"
|
||||
tokio = { version = "1.45.0", features = ["macros", "net", "process", "rt-multi-thread", "signal", "time"] }
|
||||
tower-http = { version = "0.6.6", features = ["trace"] }
|
||||
tracing = "0.1.41"
|
||||
tracing-subscriber = { version = "0.3.20", features = ["env-filter"] }
|
||||
uuid = { version = "1.18.1", features = ["serde", "v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
serde_yaml = "0.9.34"
|
||||
tower = "0.5.2"
|
||||
417
control/api/src/admin.rs
Normal file
417
control/api/src/admin.rs
Normal file
@@ -0,0 +1,417 @@
|
||||
use crate::{
|
||||
AppState, RequestIds,
|
||||
auth::{Principal, has_permission},
|
||||
fleet,
|
||||
job_engine::{JobEngine, StartJobError},
|
||||
jobs::{Job, JobStatus, JobStep},
|
||||
placement::{PlacementResponse, ServiceKind},
|
||||
swarm::{SwarmService, SwarmTask},
|
||||
};
|
||||
use axum::{
|
||||
Json, Router,
|
||||
extract::{Extension, Path, State},
|
||||
http::{HeaderMap, StatusCode},
|
||||
response::IntoResponse,
|
||||
routing::{get, post},
|
||||
};
|
||||
use serde::Deserialize;
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
use uuid::Uuid;
|
||||
|
||||
// Request header carrying the caller-supplied idempotency key; required (non-empty)
// by every job-creating endpoint in this module.
const HEADER_IDEMPOTENCY_KEY: &str = "idempotency-key";
// Request header carrying the tenant UUID for tenant-scoped endpoints.
const HEADER_TENANT_ID: &str = "x-tenant-id";
|
||||
|
||||
/// Builds the admin API router.
///
/// Authorization is not applied here: every handler performs its own
/// `has_permission` check (read vs. write) before doing any work.
pub fn admin_router() -> Router<AppState> {
    Router::new()
        .route("/whoami", get(whoami))
        .route("/platform/info", get(platform_info))
        .route("/fleet/snapshot", get(fleet_snapshot))
        .route("/tenants", get(list_tenants))
        .route("/placement/{kind}", get(get_placement))
        .route("/tenants/echo", get(tenant_echo))
        .route("/jobs/echo", post(create_echo_job))
        .route("/jobs/{job_id}", get(get_job))
        .route("/jobs/{job_id}/cancel", post(cancel_job))
        .route("/jobs/tenant/drain", post(start_tenant_drain))
        .route("/jobs/tenant/migrate", post(start_tenant_migrate))
        .route("/plan/tenant/migrate", post(plan_tenant_migrate))
        .route("/audit", get(list_audit))
        .route("/swarm/services", get(list_swarm_services))
        .route("/swarm/services/{name}/tasks", get(list_swarm_tasks))
}
|
||||
|
||||
async fn whoami(Extension(principal): Extension<Principal>) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({
|
||||
"sub": principal.sub,
|
||||
"session_id": principal.session_id,
|
||||
"permissions": principal.permissions,
|
||||
})),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn platform_info(Extension(principal): Extension<Principal>) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({
|
||||
"service": "control-api",
|
||||
})),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn fleet_snapshot(
|
||||
State(state): State<AppState>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
Extension(request_ids): Extension<RequestIds>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let services =
|
||||
fleet::snapshot_with_context(&state.http, &state.fleet_services, Some(&request_ids)).await;
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "services": services })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn get_placement(
|
||||
State(state): State<AppState>,
|
||||
Path(kind): Path<String>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let kind = match kind.as_str() {
|
||||
"aggregate" => ServiceKind::Aggregate,
|
||||
"projection" => ServiceKind::Projection,
|
||||
"runner" => ServiceKind::Runner,
|
||||
_ => return StatusCode::NOT_FOUND.into_response(),
|
||||
};
|
||||
|
||||
let resp: PlacementResponse = state.placement.get_for_kind(kind);
|
||||
|
||||
(StatusCode::OK, Json(resp)).into_response()
|
||||
}
|
||||
|
||||
async fn list_tenants(
|
||||
State(state): State<AppState>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let tenants = state.placement.tenant_summaries();
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "tenants": tenants })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn tenant_echo(
|
||||
headers: HeaderMap,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let tenant_id = headers
|
||||
.get(HEADER_TENANT_ID)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.ok_or(StatusCode::BAD_REQUEST)
|
||||
.and_then(|s| Uuid::parse_str(s).map_err(|_| StatusCode::BAD_REQUEST));
|
||||
|
||||
match tenant_id {
|
||||
Ok(tenant_id) => (
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({
|
||||
"tenant_id": tenant_id,
|
||||
})),
|
||||
)
|
||||
.into_response(),
|
||||
Err(status) => status.into_response(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn create_echo_job(
|
||||
State(state): State<AppState>,
|
||||
headers: HeaderMap,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:write") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let key = headers
|
||||
.get(HEADER_IDEMPOTENCY_KEY)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.ok_or(StatusCode::BAD_REQUEST);
|
||||
|
||||
let key = match key {
|
||||
Ok(k) if !k.is_empty() => k,
|
||||
_ => return StatusCode::BAD_REQUEST.into_response(),
|
||||
};
|
||||
|
||||
let now = now_ms();
|
||||
let job_id = Uuid::new_v4();
|
||||
let job = Job {
|
||||
job_id,
|
||||
status: JobStatus::Succeeded,
|
||||
steps: vec![JobStep {
|
||||
name: "echo".to_string(),
|
||||
status: JobStatus::Succeeded,
|
||||
attempts: 1,
|
||||
error: None,
|
||||
}],
|
||||
error: None,
|
||||
created_at_ms: now,
|
||||
started_at_ms: Some(now),
|
||||
finished_at_ms: Some(now),
|
||||
};
|
||||
|
||||
let job_id = state.jobs.insert_idempotent(key, job);
|
||||
state.audit.record(crate::audit::AuditEvent {
|
||||
ts_ms: now,
|
||||
principal_sub: principal.sub.clone(),
|
||||
action: "job.echo".to_string(),
|
||||
tenant_id: None,
|
||||
reason: "echo".to_string(),
|
||||
job_id: Some(job_id),
|
||||
});
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({
|
||||
"job_id": job_id,
|
||||
})),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn get_job(
|
||||
State(state): State<AppState>,
|
||||
Path(job_id): Path<Uuid>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
match state.jobs.get(job_id) {
|
||||
Some(job) => (StatusCode::OK, Json(job)).into_response(),
|
||||
None => StatusCode::NOT_FOUND.into_response(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Request body for `POST /jobs/tenant/drain`.
#[derive(Debug, Deserialize)]
struct TenantDrainRequest {
    /// Tenant to drain.
    tenant_id: Uuid,
    /// Free-form operator-supplied reason; recorded in the audit log.
    reason: String,
}

/// Request body for `POST /jobs/tenant/migrate` and `POST /plan/tenant/migrate`.
#[derive(Debug, Deserialize)]
struct TenantMigrateRequest {
    /// Tenant to migrate.
    tenant_id: Uuid,
    /// Runner the tenant should be moved to — assumed to be a runner name;
    /// TODO confirm expected format against the placement module.
    runner_target: String,
    /// Free-form operator-supplied reason; recorded in the audit log.
    reason: String,
}
|
||||
|
||||
async fn start_tenant_drain(
|
||||
State(state): State<AppState>,
|
||||
headers: HeaderMap,
|
||||
Extension(principal): Extension<Principal>,
|
||||
Json(body): Json<TenantDrainRequest>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:write") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let key = headers
|
||||
.get(HEADER_IDEMPOTENCY_KEY)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.ok_or(StatusCode::BAD_REQUEST);
|
||||
let key = match key {
|
||||
Ok(k) if !k.is_empty() => k,
|
||||
_ => return StatusCode::BAD_REQUEST.into_response(),
|
||||
};
|
||||
|
||||
let engine = JobEngine::new(
|
||||
state.jobs.clone(),
|
||||
state.audit.clone(),
|
||||
state.tenant_locks.clone(),
|
||||
);
|
||||
let job_id = match engine.start_tenant_drain(
|
||||
state.clone(),
|
||||
&principal,
|
||||
body.tenant_id,
|
||||
body.reason,
|
||||
key,
|
||||
) {
|
||||
Ok(id) => id,
|
||||
Err(StartJobError::TenantLocked) => return StatusCode::CONFLICT.into_response(),
|
||||
};
|
||||
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "job_id": job_id })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn start_tenant_migrate(
|
||||
State(state): State<AppState>,
|
||||
headers: HeaderMap,
|
||||
Extension(principal): Extension<Principal>,
|
||||
Json(body): Json<TenantMigrateRequest>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:write") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let key = headers
|
||||
.get(HEADER_IDEMPOTENCY_KEY)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.ok_or(StatusCode::BAD_REQUEST);
|
||||
let key = match key {
|
||||
Ok(k) if !k.is_empty() => k,
|
||||
_ => return StatusCode::BAD_REQUEST.into_response(),
|
||||
};
|
||||
|
||||
let engine = JobEngine::new(
|
||||
state.jobs.clone(),
|
||||
state.audit.clone(),
|
||||
state.tenant_locks.clone(),
|
||||
);
|
||||
let job_id = match engine.start_tenant_migrate(
|
||||
state.clone(),
|
||||
&principal,
|
||||
body.tenant_id,
|
||||
body.runner_target,
|
||||
body.reason,
|
||||
key,
|
||||
) {
|
||||
Ok(id) => id,
|
||||
Err(StartJobError::TenantLocked) => return StatusCode::CONFLICT.into_response(),
|
||||
};
|
||||
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "job_id": job_id })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn cancel_job(
|
||||
State(state): State<AppState>,
|
||||
Path(job_id): Path<Uuid>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:write") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
if state.jobs.request_cancel(job_id) {
|
||||
state.audit.record(crate::audit::AuditEvent {
|
||||
ts_ms: now_ms(),
|
||||
principal_sub: principal.sub.clone(),
|
||||
action: "job.cancel".to_string(),
|
||||
tenant_id: None,
|
||||
reason: "cancel requested".to_string(),
|
||||
job_id: Some(job_id),
|
||||
});
|
||||
StatusCode::OK.into_response()
|
||||
} else {
|
||||
StatusCode::NOT_FOUND.into_response()
|
||||
}
|
||||
}
|
||||
|
||||
/// Current wall-clock time as milliseconds since the Unix epoch.
/// Falls back to 0 if the system clock is before the epoch.
fn now_ms() -> u64 {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default();
    since_epoch.as_millis() as u64
}
|
||||
|
||||
async fn list_audit(
|
||||
State(state): State<AppState>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let events = state.audit.list_recent(200);
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "events": events })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn plan_tenant_migrate(
|
||||
Extension(principal): Extension<Principal>,
|
||||
Json(body): Json<TenantMigrateRequest>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:write") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let _ = (body.tenant_id, body.runner_target, body.reason);
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({
|
||||
"steps": ["preflight", "drain", "update_placement", "reload", "verify"]
|
||||
})),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn list_swarm_services(
|
||||
State(state): State<AppState>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let services: Vec<SwarmService> = state.swarm.list_services();
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "services": services })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn list_swarm_tasks(
|
||||
State(state): State<AppState>,
|
||||
Path(name): Path<String>,
|
||||
Extension(principal): Extension<Principal>,
|
||||
) -> impl IntoResponse {
|
||||
if !has_permission(&principal, "control:read") {
|
||||
return StatusCode::FORBIDDEN.into_response();
|
||||
}
|
||||
|
||||
let tasks: Vec<SwarmTask> = state.swarm.list_tasks(&name);
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(serde_json::json!({ "service": name, "tasks": tasks })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
31
control/api/src/audit.rs
Normal file
31
control/api/src/audit.rs
Normal file
@@ -0,0 +1,31 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// One audit-log entry recording a privileged action.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct AuditEvent {
    /// Event timestamp, milliseconds since the Unix epoch.
    pub ts_ms: u64,
    /// `sub` claim of the principal that performed the action.
    pub principal_sub: String,
    /// Action identifier, e.g. "job.echo", "tenant.drain", "job.cancel".
    pub action: String,
    /// Tenant the action targeted, when tenant-scoped.
    pub tenant_id: Option<Uuid>,
    /// Operator-supplied reason for the action.
    pub reason: String,
    /// Job created or affected by the action, when applicable.
    pub job_id: Option<Uuid>,
}
|
||||
|
||||
/// In-memory, append-only audit log shared across handlers.
/// Cloning is cheap: clones share the same underlying `Arc`'d vector.
/// NOTE(review): events are held in memory only — lost on restart, unbounded growth.
#[derive(Clone, Default)]
pub struct AuditStore {
    inner: Arc<Mutex<Vec<AuditEvent>>>,
}
|
||||
|
||||
impl AuditStore {
|
||||
pub fn record(&self, event: AuditEvent) {
|
||||
let mut events = self.inner.lock().expect("audit lock poisoned");
|
||||
events.push(event);
|
||||
}
|
||||
|
||||
pub fn list_recent(&self, limit: usize) -> Vec<AuditEvent> {
|
||||
let events = self.inner.lock().expect("audit lock poisoned");
|
||||
let start = events.len().saturating_sub(limit);
|
||||
events[start..].to_vec()
|
||||
}
|
||||
}
|
||||
78
control/api/src/auth.rs
Normal file
78
control/api/src/auth.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
use crate::AppState;
|
||||
use axum::{
|
||||
extract::State,
|
||||
http::{Request, StatusCode},
|
||||
middleware::Next,
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use jsonwebtoken::{Algorithm, DecodingKey, Validation, decode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Authentication configuration for the HTTP layer.
#[derive(Clone)]
pub struct AuthConfig {
    /// HS256 signing secret for JWT verification. When `None`, `authenticate`
    /// rejects every request with 503 (service not yet configured).
    pub hs256_secret: Option<Vec<u8>>,
}
|
||||
|
||||
/// Authenticated caller identity, extracted from a verified JWT and inserted
/// into request extensions by `auth_middleware`.
#[derive(Clone, Debug)]
pub struct Principal {
    /// Subject (`sub`) claim.
    pub sub: String,
    /// Session identifier claim.
    pub session_id: String,
    /// Permission strings checked by handlers via `has_permission`.
    pub permissions: Vec<String>,
}
|
||||
|
||||
/// JWT claim set this service expects; mirrored into `Principal` after verification.
#[derive(Debug, Serialize, Deserialize)]
struct Claims {
    sub: String,
    session_id: String,
    permissions: Vec<String>,
    /// Expiry (seconds since epoch); required by the validation setup below.
    exp: usize,
}
|
||||
|
||||
pub async fn auth_middleware(
|
||||
State(state): State<AppState>,
|
||||
mut req: Request<axum::body::Body>,
|
||||
next: Next,
|
||||
) -> Response {
|
||||
match authenticate(
|
||||
&state.auth,
|
||||
req.headers().get(axum::http::header::AUTHORIZATION),
|
||||
) {
|
||||
Ok(principal) => {
|
||||
req.extensions_mut().insert(principal);
|
||||
next.run(req).await
|
||||
}
|
||||
Err(status) => status.into_response(),
|
||||
}
|
||||
}
|
||||
|
||||
fn authenticate(
|
||||
cfg: &AuthConfig,
|
||||
auth_header: Option<&axum::http::HeaderValue>,
|
||||
) -> Result<Principal, StatusCode> {
|
||||
let secret = cfg
|
||||
.hs256_secret
|
||||
.as_ref()
|
||||
.ok_or(StatusCode::SERVICE_UNAVAILABLE)?;
|
||||
let header = auth_header.ok_or(StatusCode::UNAUTHORIZED)?;
|
||||
let header_str = header.to_str().map_err(|_| StatusCode::UNAUTHORIZED)?;
|
||||
|
||||
let token = header_str
|
||||
.strip_prefix("Bearer ")
|
||||
.ok_or(StatusCode::UNAUTHORIZED)?;
|
||||
|
||||
let mut validation = Validation::new(Algorithm::HS256);
|
||||
validation.required_spec_claims.insert("exp".to_string());
|
||||
|
||||
let data = decode::<Claims>(token, &DecodingKey::from_secret(secret), &validation)
|
||||
.map_err(|_| StatusCode::UNAUTHORIZED)?;
|
||||
|
||||
Ok(Principal {
|
||||
sub: data.claims.sub,
|
||||
session_id: data.claims.session_id,
|
||||
permissions: data.claims.permissions,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn has_permission(principal: &Principal, permission: &str) -> bool {
|
||||
principal.permissions.iter().any(|p| p == permission)
|
||||
}
|
||||
57
control/api/src/build_info.rs
Normal file
57
control/api/src/build_info.rs
Normal file
@@ -0,0 +1,57 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Build metadata scraped from a Prometheus `*_build_info` gauge's labels.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct BuildInfo {
    /// `service` label value.
    pub service: String,
    /// `version` label value.
    pub version: String,
    /// `git_sha` label value.
    pub git_sha: String,
}
|
||||
|
||||
pub fn extract_build_info(metrics: &str) -> Vec<BuildInfo> {
|
||||
let mut out = Vec::new();
|
||||
for line in metrics.lines() {
|
||||
let line = line.trim();
|
||||
if line.is_empty() || line.starts_with('#') {
|
||||
continue;
|
||||
}
|
||||
let Some((metric_and_labels, value)) = line.split_once(' ') else {
|
||||
continue;
|
||||
};
|
||||
if value.trim() != "1" {
|
||||
continue;
|
||||
}
|
||||
if !metric_and_labels.ends_with('}') {
|
||||
continue;
|
||||
}
|
||||
let Some((name, labels)) = metric_and_labels.split_once('{') else {
|
||||
continue;
|
||||
};
|
||||
if !name.ends_with("_build_info") {
|
||||
continue;
|
||||
}
|
||||
let labels = labels.trim_end_matches('}');
|
||||
let mut service = None;
|
||||
let mut version = None;
|
||||
let mut git_sha = None;
|
||||
for part in labels.split(',') {
|
||||
let Some((k, v)) = part.split_once('=') else {
|
||||
continue;
|
||||
};
|
||||
let v = v.trim().trim_matches('"');
|
||||
match k.trim() {
|
||||
"service" => service = Some(v.to_string()),
|
||||
"version" => version = Some(v.to_string()),
|
||||
"git_sha" => git_sha = Some(v.to_string()),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
if let (Some(service), Some(version), Some(git_sha)) = (service, version, git_sha) {
|
||||
out.push(BuildInfo {
|
||||
service,
|
||||
version,
|
||||
git_sha,
|
||||
});
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
42
control/api/src/deployments.rs
Normal file
42
control/api/src/deployments.rs
Normal file
@@ -0,0 +1,42 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Payload shape for Grafana's annotation-creation API.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GrafanaAnnotation {
    /// Annotation time in epoch milliseconds.
    pub time: i64,
    /// Tags shown/filterable in Grafana.
    pub tags: Vec<String>,
    /// Human-readable annotation text.
    pub text: String,
}
|
||||
|
||||
pub fn build_grafana_deploy_annotation(args: DeployAnnotationArgs) -> GrafanaAnnotation {
|
||||
let mut tags = vec![
|
||||
"cloudlysis".to_string(),
|
||||
"deploy".to_string(),
|
||||
format!("service:{}", args.service),
|
||||
];
|
||||
if let Some(v) = args.version {
|
||||
tags.push(format!("version:{v}"));
|
||||
}
|
||||
if let Some(sha) = args.git_sha {
|
||||
tags.push(format!("git_sha:{sha}"));
|
||||
}
|
||||
|
||||
let text = match (args.version, args.git_sha) {
|
||||
(Some(v), Some(sha)) => format!("deploy {} v={} git_sha={sha}", args.service, v),
|
||||
(Some(v), None) => format!("deploy {} v={}", args.service, v),
|
||||
(None, Some(sha)) => format!("deploy {} git_sha={sha}", args.service),
|
||||
(None, None) => format!("deploy {}", args.service),
|
||||
};
|
||||
|
||||
GrafanaAnnotation {
|
||||
time: args.time_ms,
|
||||
tags,
|
||||
text,
|
||||
}
|
||||
}
|
||||
|
||||
/// Inputs for `build_grafana_deploy_annotation`.
pub struct DeployAnnotationArgs<'a> {
    /// Service being deployed (always tagged).
    pub service: &'a str,
    /// Optional version string; omitted from tags/text when `None`.
    pub version: Option<&'a str>,
    /// Optional git SHA; omitted from tags/text when `None`.
    pub git_sha: Option<&'a str>,
    /// Annotation time in epoch milliseconds.
    pub time_ms: i64,
}
|
||||
67
control/api/src/fleet.rs
Normal file
67
control/api/src/fleet.rs
Normal file
@@ -0,0 +1,67 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::RequestIds;
|
||||
|
||||
/// A fleet member to probe: a name and the base URL of its HTTP endpoint.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FleetService {
    pub name: String,
    pub base_url: String,
}

/// Probe result for one fleet service — one boolean per endpoint checked.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FleetServiceSnapshot {
    pub name: String,
    pub base_url: String,
    /// GET {base_url}/health returned a 2xx.
    pub health_ok: bool,
    /// GET {base_url}/ready returned a 2xx.
    pub ready_ok: bool,
    /// GET {base_url}/metrics returned a 2xx.
    pub metrics_ok: bool,
}
|
||||
|
||||
pub async fn snapshot(
|
||||
client: &reqwest::Client,
|
||||
services: &[FleetService],
|
||||
) -> Vec<FleetServiceSnapshot> {
|
||||
snapshot_with_context(client, services, None).await
|
||||
}
|
||||
|
||||
pub async fn snapshot_with_context(
|
||||
client: &reqwest::Client,
|
||||
services: &[FleetService],
|
||||
ctx: Option<&RequestIds>,
|
||||
) -> Vec<FleetServiceSnapshot> {
|
||||
let mut out = Vec::with_capacity(services.len());
|
||||
for svc in services {
|
||||
let base = svc.base_url.trim_end_matches('/');
|
||||
let health_ok = get_ok(client, &format!("{base}/health"), ctx).await;
|
||||
let ready_ok = get_ok(client, &format!("{base}/ready"), ctx).await;
|
||||
let metrics_ok = get_ok(client, &format!("{base}/metrics"), ctx).await;
|
||||
out.push(FleetServiceSnapshot {
|
||||
name: svc.name.clone(),
|
||||
base_url: svc.base_url.clone(),
|
||||
health_ok,
|
||||
ready_ok,
|
||||
metrics_ok,
|
||||
});
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
async fn get_ok(client: &reqwest::Client, url: &str, ctx: Option<&RequestIds>) -> bool {
|
||||
let mut req = client.get(url).timeout(Duration::from_secs(2));
|
||||
if let Some(ctx) = ctx {
|
||||
req = req.header("x-request-id", &ctx.request_id);
|
||||
if let Some(cid) = &ctx.correlation_id {
|
||||
req = req.header("x-correlation-id", cid);
|
||||
}
|
||||
if let Some(tp) = &ctx.traceparent {
|
||||
req = req.header("traceparent", tp);
|
||||
}
|
||||
}
|
||||
|
||||
let res = req.send().await;
|
||||
match res {
|
||||
Ok(r) => r.status().is_success(),
|
||||
Err(_) => false,
|
||||
}
|
||||
}
|
||||
348
control/api/src/job_engine.rs
Normal file
348
control/api/src/job_engine.rs
Normal file
@@ -0,0 +1,348 @@
|
||||
use crate::{
|
||||
AppState, Principal,
|
||||
audit::{AuditEvent, AuditStore},
|
||||
fleet,
|
||||
jobs::{Job, JobStatus, JobStep, JobStore},
|
||||
};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, Mutex},
|
||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||
};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Per-tenant mutual exclusion for jobs: maps a locked tenant to the job that
/// holds the lock. Clones share the same underlying map.
#[derive(Clone, Default)]
pub struct TenantLocks {
    inner: Arc<Mutex<HashMap<Uuid, Uuid>>>,
}
|
||||
|
||||
impl TenantLocks {
|
||||
pub fn try_lock(&self, tenant_id: Uuid, job_id: Uuid) -> bool {
|
||||
let mut map = self.inner.lock().expect("tenant locks poisoned");
|
||||
if map.contains_key(&tenant_id) {
|
||||
return false;
|
||||
}
|
||||
map.insert(tenant_id, job_id);
|
||||
true
|
||||
}
|
||||
|
||||
pub fn unlock(&self, tenant_id: Uuid, job_id: Uuid) {
|
||||
let mut map = self.inner.lock().expect("tenant locks poisoned");
|
||||
if map.get(&tenant_id).copied() == Some(job_id) {
|
||||
map.remove(&tenant_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Orchestrates multi-step tenant jobs: creates job records, records audit
/// events, holds per-tenant locks, and drives steps on a spawned task.
#[derive(Clone)]
pub struct JobEngine {
    pub jobs: JobStore,
    pub audit: AuditStore,
    pub tenant_locks: TenantLocks,
    /// Per-step deadline enforced via `tokio::time::timeout`.
    pub step_timeout: Duration,
}
|
||||
|
||||
impl JobEngine {
    /// Creates an engine with a 500 ms per-step timeout.
    pub fn new(jobs: JobStore, audit: AuditStore, tenant_locks: TenantLocks) -> Self {
        Self {
            jobs,
            audit,
            tenant_locks,
            step_timeout: Duration::from_millis(500),
        }
    }

    /// Starts an asynchronous drain job (steps: preflight, drain, verify).
    ///
    /// Idempotent on `idempotency_key`: a replayed key returns the existing
    /// job id without taking the lock or spawning anything. Fails with
    /// `TenantLocked` when another job holds the tenant lock. The lock is
    /// released by `run_job` when the job finishes.
    pub fn start_tenant_drain(
        &self,
        state: AppState,
        principal: &Principal,
        tenant_id: Uuid,
        reason: String,
        idempotency_key: &str,
    ) -> Result<Uuid, StartJobError> {
        if let Some(existing) = self.jobs.get_idempotent(idempotency_key) {
            return Ok(existing);
        }

        let job_id = Uuid::new_v4();
        if !self.tenant_locks.try_lock(tenant_id, job_id) {
            return Err(StartJobError::TenantLocked);
        }

        let now = now_ms();
        let job = Job {
            job_id,
            status: JobStatus::Pending,
            steps: vec![step("preflight"), step("drain"), step("verify")],
            error: None,
            created_at_ms: now,
            started_at_ms: None,
            finished_at_ms: None,
        };

        // NOTE(review): `insert_idempotent` may return a different id than
        // `job_id` if the key raced — `inserted` is treated as authoritative,
        // but the tenant lock was taken under `job_id`; confirm JobStore's
        // race semantics.
        let inserted = self.jobs.insert_idempotent(idempotency_key, job);
        self.audit.record(AuditEvent {
            ts_ms: now,
            principal_sub: principal.sub.clone(),
            action: "tenant.drain".to_string(),
            tenant_id: Some(tenant_id),
            reason,
            job_id: Some(inserted),
        });

        // Drive the job in the background; the handler returns immediately.
        let engine = self.clone();
        tokio::spawn(async move {
            engine
                .run_job(state, inserted, Some(tenant_id), RunSpec::Drain)
                .await;
        });

        Ok(inserted)
    }

    /// Starts an asynchronous migrate job (steps: preflight, drain,
    /// update_placement, reload, verify). Same idempotency and locking
    /// behavior as `start_tenant_drain`.
    pub fn start_tenant_migrate(
        &self,
        state: AppState,
        principal: &Principal,
        tenant_id: Uuid,
        runner_target: String,
        reason: String,
        idempotency_key: &str,
    ) -> Result<Uuid, StartJobError> {
        if let Some(existing) = self.jobs.get_idempotent(idempotency_key) {
            return Ok(existing);
        }

        let job_id = Uuid::new_v4();
        if !self.tenant_locks.try_lock(tenant_id, job_id) {
            return Err(StartJobError::TenantLocked);
        }

        let now = now_ms();
        let job = Job {
            job_id,
            status: JobStatus::Pending,
            steps: vec![
                step("preflight"),
                step("drain"),
                step("update_placement"),
                step("reload"),
                step("verify"),
            ],
            error: None,
            created_at_ms: now,
            started_at_ms: None,
            finished_at_ms: None,
        };

        let inserted = self.jobs.insert_idempotent(idempotency_key, job);
        self.audit.record(AuditEvent {
            ts_ms: now,
            principal_sub: principal.sub.clone(),
            action: "tenant.migrate".to_string(),
            tenant_id: Some(tenant_id),
            reason,
            job_id: Some(inserted),
        });

        let engine = self.clone();
        tokio::spawn(async move {
            engine
                .run_job(
                    state,
                    inserted,
                    Some(tenant_id),
                    RunSpec::Migrate { runner_target },
                )
                .await;
        });

        Ok(inserted)
    }

    /// Drives a job's steps in order until completion, failure, timeout, or
    /// cancellation, then releases the tenant lock.
    ///
    /// Per iteration: honor a pending cancel request, mark the step Running,
    /// execute it under `step_timeout`, then record the outcome. Any failure
    /// or timeout fails the whole job immediately.
    async fn run_job(&self, state: AppState, job_id: Uuid, tenant_id: Option<Uuid>, spec: RunSpec) {
        self.jobs.update(job_id, |j| {
            j.status = JobStatus::Running;
            j.started_at_ms = Some(now_ms());
        });

        let mut ok = true;
        for idx in 0.. {
            // Cooperative cancellation: checked between steps, not mid-step.
            if self.jobs.cancel_requested(job_id) {
                ok = false;
                self.jobs.update(job_id, |j| {
                    j.status = JobStatus::Cancelled;
                    j.finished_at_ms = Some(now_ms());
                    j.error = Some("cancelled".to_string());
                    // Any step not yet finished is marked cancelled too.
                    for step in &mut j.steps {
                        if step.status == JobStatus::Pending || step.status == JobStatus::Running {
                            step.status = JobStatus::Cancelled;
                        }
                    }
                });
                break;
            }

            // Breaking here (job vanished / steps exhausted) leaves `ok` true,
            // so the job is finalized as Succeeded below.
            let step_name = {
                let Some(job) = self.jobs.get(job_id) else {
                    break;
                };
                let Some(step) = job.steps.get(idx) else {
                    break;
                };
                step.name.clone()
            };

            self.jobs.update(job_id, |j| {
                if let Some(step) = j.steps.get_mut(idx) {
                    step.status = JobStatus::Running;
                    step.attempts += 1;
                }
            });

            // Enforce the per-step deadline; Err(_) from `timeout` means the
            // step overran, Ok(Err(e)) means the step itself failed.
            let r = tokio::time::timeout(
                self.step_timeout,
                run_step(&state, &spec, &step_name, tenant_id),
            )
            .await;
            match r {
                Ok(Ok(())) => {
                    self.jobs.update(job_id, |j| {
                        if let Some(step) = j.steps.get_mut(idx) {
                            step.status = JobStatus::Succeeded;
                            step.error = None;
                        }
                    });
                }
                Ok(Err(e)) => {
                    ok = false;
                    self.jobs.update(job_id, |j| {
                        if let Some(step) = j.steps.get_mut(idx) {
                            step.status = JobStatus::Failed;
                            step.error = Some(e.clone());
                        }
                        j.status = JobStatus::Failed;
                        j.error = Some(e);
                        j.finished_at_ms = Some(now_ms());
                    });
                    break;
                }
                Err(_) => {
                    ok = false;
                    self.jobs.update(job_id, |j| {
                        if let Some(step) = j.steps.get_mut(idx) {
                            step.status = JobStatus::Failed;
                            step.error = Some("step timeout".to_string());
                        }
                        j.status = JobStatus::Failed;
                        j.error = Some("step timeout".to_string());
                        j.finished_at_ms = Some(now_ms());
                    });
                    break;
                }
            }

            // Defensive: both failure arms above already break, so this only
            // matters if a future arm sets `ok = false` without breaking.
            if !ok {
                break;
            }

            let done = match self.jobs.get(job_id) {
                Some(job) => idx + 1 >= job.steps.len(),
                None => true,
            };
            if done {
                break;
            }
        }

        if ok {
            self.jobs.update(job_id, |j| {
                j.status = JobStatus::Succeeded;
                j.finished_at_ms = Some(now_ms());
            });
        }

        // Release the tenant lock regardless of outcome.
        if let Some(tid) = tenant_id {
            self.tenant_locks.unlock(tid, job_id);
        }
    }
}
|
||||
|
||||
/// Reasons a job cannot be started. Currently only lock contention;
/// handlers translate `TenantLocked` to HTTP 409.
#[derive(Debug)]
pub enum StartJobError {
    TenantLocked,
}
|
||||
|
||||
/// Which job flavor `run_job` is executing; `Migrate` carries the
/// destination runner used by the update_placement/verify steps.
#[derive(Clone)]
enum RunSpec {
    Drain,
    Migrate { runner_target: String },
}
|
||||
|
||||
fn step(name: &str) -> JobStep {
|
||||
JobStep {
|
||||
name: name.to_string(),
|
||||
status: JobStatus::Pending,
|
||||
attempts: 0,
|
||||
error: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Current wall-clock time as milliseconds since the Unix epoch;
/// 0 if the system clock reads before the epoch.
fn now_ms() -> u64 {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default();
    since_epoch.as_millis() as u64
}
|
||||
|
||||
/// Executes one named job step; unknown step names are treated as no-ops.
///
/// - "preflight": fails if any fleet service reports not-ready.
/// - "drain": placeholder — just sleeps 50 ms (no real drain yet).
/// - "update_placement": for migrations, writes the new runner target into
///   the placement store; no-op for drains.
/// - "reload": reads tenant summaries and discards them — presumably a
///   placeholder for triggering a config reload; TODO confirm.
/// - "verify": for migrations, checks the placement store now lists the
///   target runner for this tenant; no-op for drains.
async fn run_step(
    state: &AppState,
    spec: &RunSpec,
    step: &str,
    tenant_id: Option<Uuid>,
) -> Result<(), String> {
    match step {
        "preflight" => {
            let snapshots = fleet::snapshot(&state.http, &state.fleet_services).await;
            if snapshots.iter().any(|s| !s.ready_ok) {
                return Err("preflight failed: fleet not ready".to_string());
            }
            Ok(())
        }
        "drain" => {
            tokio::time::sleep(Duration::from_millis(50)).await;
            Ok(())
        }
        "update_placement" => match spec {
            RunSpec::Migrate { runner_target } => {
                let tenant_id = tenant_id.ok_or_else(|| "missing tenant_id".to_string())?;
                state
                    .placement
                    .update_runner_target(tenant_id, runner_target.clone())
                    .map(|_| ())
            }
            _ => Ok(()),
        },
        "reload" => {
            let _ = state.placement.tenant_summaries();
            Ok(())
        }
        "verify" => match spec {
            RunSpec::Migrate { runner_target } => {
                let tenant_id = tenant_id.ok_or_else(|| "missing tenant_id".to_string())?;
                let summaries = state.placement.tenant_summaries();
                let found = summaries
                    .iter()
                    .find(|t| t.tenant_id == tenant_id)
                    .map(|t| t.runner_targets.iter().any(|x| x == runner_target))
                    .unwrap_or(false);
                if !found {
                    return Err("verify failed: placement not updated".to_string());
                }
                Ok(())
            }
            _ => Ok(()),
        },
        _ => Ok(()),
    }
}
|
||||
122
control/api/src/jobs.rs
Normal file
122
control/api/src/jobs.rs
Normal file
@@ -0,0 +1,122 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{
|
||||
Arc, Mutex,
|
||||
atomic::{AtomicBool, Ordering},
|
||||
},
|
||||
};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Lifecycle state shared by jobs and their individual steps.
/// Serialized in snake_case (e.g. "succeeded") over the API.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum JobStatus {
    Pending,
    Running,
    // Terminal states:
    Succeeded,
    Failed,
    Cancelled,
}
|
||||
|
||||
/// Serializable snapshot of a background job's state.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Job {
    pub job_id: Uuid,
    pub status: JobStatus,
    /// Ordered execution plan; each step tracks its own status/attempts.
    pub steps: Vec<JobStep>,
    /// Terminal error message when the job failed.
    pub error: Option<String>,
    // Timestamps are Unix-epoch milliseconds.
    pub created_at_ms: u64,
    pub started_at_ms: Option<u64>,
    pub finished_at_ms: Option<u64>,
}
|
||||
|
||||
/// One named step within a job's plan.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct JobStep {
    pub name: String,
    pub status: JobStatus,
    /// How many times this step has been attempted.
    pub attempts: u32,
    /// Failure message from the most recent attempt, if any.
    pub error: Option<String>,
}
|
||||
|
||||
/// Internal per-job record: the mutable job snapshot plus a cancellation
/// flag that can be set without holding the job mutex.
struct JobRecord {
    job: Mutex<Job>,
    cancel: AtomicBool,
}
|
||||
|
||||
/// Thread-safe, in-memory job store; `Clone` shares the same underlying
/// state via `Arc`.
#[derive(Clone, Default)]
pub struct JobStore {
    inner: Arc<Inner>,
}
|
||||
|
||||
/// Shared state behind `JobStore`.
#[derive(Default)]
struct Inner {
    /// All known jobs by id.
    jobs: Mutex<HashMap<Uuid, Arc<JobRecord>>>,
    /// Maps an idempotency key to the job it first created.
    idempotency: Mutex<HashMap<String, Uuid>>,
}
|
||||
|
||||
impl JobStore {
|
||||
pub fn get(&self, job_id: Uuid) -> Option<Job> {
|
||||
let jobs = self.inner.jobs.lock().ok()?;
|
||||
let rec = jobs.get(&job_id)?.clone();
|
||||
rec.job.lock().ok().map(|j| j.clone())
|
||||
}
|
||||
|
||||
pub fn get_idempotent(&self, key: &str) -> Option<Uuid> {
|
||||
let map = self.inner.idempotency.lock().ok()?;
|
||||
map.get(key).copied()
|
||||
}
|
||||
|
||||
pub fn insert_idempotent(&self, key: &str, job: Job) -> Uuid {
|
||||
let mut idempotency = self
|
||||
.inner
|
||||
.idempotency
|
||||
.lock()
|
||||
.expect("idempotency lock poisoned");
|
||||
if let Some(existing) = idempotency.get(key) {
|
||||
return *existing;
|
||||
}
|
||||
|
||||
let job_id = job.job_id;
|
||||
let rec = Arc::new(JobRecord {
|
||||
job: Mutex::new(job),
|
||||
cancel: AtomicBool::new(false),
|
||||
});
|
||||
self.inner
|
||||
.jobs
|
||||
.lock()
|
||||
.expect("jobs lock poisoned")
|
||||
.insert(job_id, rec);
|
||||
|
||||
idempotency.insert(key.to_string(), job_id);
|
||||
job_id
|
||||
}
|
||||
|
||||
pub fn request_cancel(&self, job_id: Uuid) -> bool {
|
||||
let jobs = self.inner.jobs.lock().expect("jobs lock poisoned");
|
||||
let Some(rec) = jobs.get(&job_id) else {
|
||||
return false;
|
||||
};
|
||||
rec.cancel.store(true, Ordering::SeqCst);
|
||||
true
|
||||
}
|
||||
|
||||
pub fn cancel_requested(&self, job_id: Uuid) -> bool {
|
||||
let jobs = self.inner.jobs.lock().expect("jobs lock poisoned");
|
||||
let Some(rec) = jobs.get(&job_id) else {
|
||||
return false;
|
||||
};
|
||||
rec.cancel.load(Ordering::SeqCst)
|
||||
}
|
||||
|
||||
pub fn update<F>(&self, job_id: Uuid, f: F) -> bool
|
||||
where
|
||||
F: FnOnce(&mut Job),
|
||||
{
|
||||
let jobs = self.inner.jobs.lock().expect("jobs lock poisoned");
|
||||
let Some(rec) = jobs.get(&job_id) else {
|
||||
return false;
|
||||
};
|
||||
let mut job = rec.job.lock().expect("job lock poisoned");
|
||||
f(&mut job);
|
||||
true
|
||||
}
|
||||
}
|
||||
692
control/api/src/lib.rs
Normal file
692
control/api/src/lib.rs
Normal file
@@ -0,0 +1,692 @@
|
||||
mod admin;
|
||||
mod audit;
|
||||
mod auth;
|
||||
mod build_info;
|
||||
mod deployments;
|
||||
mod fleet;
|
||||
mod job_engine;
|
||||
mod jobs;
|
||||
mod placement;
|
||||
mod swarm;
|
||||
|
||||
pub use audit::AuditStore;
|
||||
pub use auth::{AuthConfig, Principal};
|
||||
use axum::{
|
||||
Router,
|
||||
extract::State,
|
||||
http::{HeaderName, HeaderValue, Request, StatusCode},
|
||||
middleware::{Next, from_fn, from_fn_with_state},
|
||||
response::{IntoResponse, Response},
|
||||
routing::get,
|
||||
};
|
||||
pub use build_info::{BuildInfo, extract_build_info};
|
||||
pub use deployments::{DeployAnnotationArgs, GrafanaAnnotation, build_grafana_deploy_annotation};
|
||||
pub use fleet::FleetService;
|
||||
pub use job_engine::TenantLocks;
|
||||
pub use jobs::JobStore;
|
||||
use metrics_exporter_prometheus::PrometheusHandle;
|
||||
pub use placement::PlacementStore;
|
||||
pub use placement::ServiceKind;
|
||||
use std::time::Instant;
|
||||
pub use swarm::SwarmStore;
|
||||
use tower_http::trace::TraceLayer;
|
||||
use tracing::{Span, field};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Shared, cheaply-cloneable application state handed to every handler.
#[derive(Clone)]
pub struct AppState {
    /// Renders the Prometheus exposition text for `/metrics`.
    pub prometheus: PrometheusHandle,
    pub auth: AuthConfig,
    /// In-memory job records plus idempotency-key bookkeeping.
    pub jobs: JobStore,
    pub audit: AuditStore,
    /// Per-tenant mutual exclusion for mutating jobs.
    pub tenant_locks: TenantLocks,
    /// Outbound HTTP client (used for fleet readiness probes).
    pub http: reqwest::Client,
    pub placement: PlacementStore,
    /// Services probed during job "preflight".
    pub fleet_services: Vec<FleetService>,
    pub swarm: SwarmStore,
}
|
||||
|
||||
/// Correlation identifiers extracted (or generated) per request and stored
/// in request extensions by `request_id_middleware`.
#[derive(Clone, Debug)]
pub struct RequestIds {
    /// Always present: taken from `x-request-id` or freshly generated.
    pub request_id: String,
    /// Inbound `x-correlation-id`, if supplied.
    pub correlation_id: Option<String>,
    /// Inbound W3C `traceparent` header, if supplied.
    pub traceparent: Option<String>,
}
|
||||
|
||||
// Header names shared by the tracing span builder and the id middleware.
const HEADER_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
const HEADER_CORRELATION_ID: HeaderName = HeaderName::from_static("x-correlation-id");
const HEADER_TRACEPARENT: HeaderName = HeaderName::from_static("traceparent");
|
||||
|
||||
/// Assemble the HTTP application: public probes (`/health`, `/ready`,
/// `/metrics`) plus the JWT-protected `/admin/v1` surface, wrapped in a
/// tracing layer and the request-id middleware.
pub fn build_app(state: AppState) -> Router {
    let trace = TraceLayer::new_for_http()
        .make_span_with(|req: &Request<_>| {
            // request_id_middleware is layered outside this layer, so
            // `x-request-id` is normally already populated when the span is
            // built; absent headers become empty strings.
            let request_id = req
                .headers()
                .get(&HEADER_REQUEST_ID)
                .and_then(|v| v.to_str().ok())
                .unwrap_or("")
                .to_owned();

            let correlation_id = req
                .headers()
                .get(&HEADER_CORRELATION_ID)
                .and_then(|v| v.to_str().ok())
                .unwrap_or("")
                .to_owned();

            tracing::info_span!(
                "http_request",
                request.method = %req.method(),
                request.path = %req.uri().path(),
                request_id = %request_id,
                correlation_id = %correlation_id,
                // NOTE(review): trace_id is a constant empty string —
                // presumably a placeholder until traceparent parsing lands;
                // confirm.
                trace_id = "",
                status = field::Empty,
                duration_ms = field::Empty,
            )
        })
        .on_response(
            |res: &Response, latency: std::time::Duration, span: &Span| {
                // Status and latency are only known once the response exists.
                span.record("status", field::display(res.status()));
                span.record("duration_ms", field::display(latency.as_millis()));
                tracing::info!("response");
            },
        );

    // Admin routes sit behind the bearer-token auth middleware.
    let admin =
        admin::admin_router().layer(from_fn_with_state(state.clone(), auth::auth_middleware));

    Router::new()
        .route("/health", get(health))
        .route("/ready", get(ready))
        .route("/metrics", get(metrics))
        .nest("/admin/v1", admin)
        .with_state(state)
        // Layers wrap bottom-up: request_id_middleware (added last) is
        // outermost, so the trace layer sees normalized headers.
        .layer(trace)
        .layer(from_fn(request_id_middleware))
}
|
||||
|
||||
async fn health() -> impl IntoResponse {
|
||||
(StatusCode::OK, "ok")
|
||||
}
|
||||
|
||||
async fn ready() -> impl IntoResponse {
|
||||
(StatusCode::OK, "ready")
|
||||
}
|
||||
|
||||
async fn metrics(State(state): State<AppState>) -> impl IntoResponse {
|
||||
(StatusCode::OK, state.prometheus.render())
|
||||
}
|
||||
|
||||
/// Normalize correlation headers around every request:
/// - guarantees an `x-request-id` (UUIDv4 generated when absent),
/// - stashes the ids in request extensions as [`RequestIds`],
/// - echoes `x-request-id` (always) and `x-correlation-id` (when supplied)
///   on the response,
/// - records total latency into the `http_request_duration_ms` histogram.
async fn request_id_middleware(mut req: Request<axum::body::Body>, next: Next) -> Response {
    let request_id = req
        .headers()
        .get(&HEADER_REQUEST_ID)
        .and_then(|v| v.to_str().ok())
        .map(|s| s.to_owned())
        .unwrap_or_else(|| Uuid::new_v4().to_string());

    let correlation_id = req
        .headers()
        .get(&HEADER_CORRELATION_ID)
        .and_then(|v| v.to_str().ok())
        .map(|s| s.to_owned());

    let traceparent = req
        .headers()
        .get(&HEADER_TRACEPARENT)
        .and_then(|v| v.to_str().ok())
        .map(|s| s.to_owned());

    // Inject the generated id so inner layers/handlers see the same value
    // that will be echoed on the response.
    if req.headers().get(&HEADER_REQUEST_ID).is_none()
        && let Ok(v) = HeaderValue::from_str(&request_id)
    {
        req.headers_mut().insert(HEADER_REQUEST_ID.clone(), v);
    }

    req.extensions_mut().insert(RequestIds {
        request_id: request_id.clone(),
        correlation_id: correlation_id.clone(),
        traceparent: traceparent.clone(),
    });

    let start = Instant::now();
    let mut res = next.run(req).await;

    if let Ok(v) = HeaderValue::from_str(&request_id) {
        res.headers_mut().insert(HEADER_REQUEST_ID.clone(), v);
    }

    // Correlation id is only echoed when the client sent one.
    if let Some(correlation_id) = correlation_id
        && let Ok(v) = HeaderValue::from_str(&correlation_id)
    {
        res.headers_mut().insert(HEADER_CORRELATION_ID.clone(), v);
    }

    metrics::histogram!("http_request_duration_ms").record(start.elapsed().as_millis() as f64);
    res
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::jobs::JobStatus;
    use axum::{
        body::Body,
        http::{Request, StatusCode, header},
    };
    use jsonwebtoken::{EncodingKey, Header, encode};
    use metrics_exporter_prometheus::PrometheusBuilder;
    use serde::Serialize;
    use std::fs;
    use std::path::PathBuf;
    use std::sync::OnceLock;
    use tower::ServiceExt;
    use uuid::Uuid;

    // A global recorder can only be installed once per process, so all
    // tests share a single PrometheusHandle.
    static HANDLE: OnceLock<PrometheusHandle> = OnceLock::new();

    /// Minimal JWT claim set matching what the auth middleware reads.
    #[derive(Serialize)]
    struct TestClaims {
        sub: String,
        session_id: String,
        permissions: Vec<String>,
        exp: usize,
    }

    /// App with no extra fleet services configured.
    fn test_app() -> Router {
        test_app_with_fleet(vec![])
    }

    /// Build a full application wired with test fixtures: a shared metrics
    /// handle, an HS256 test secret, and a throwaway placement file.
    fn test_app_with_fleet(fleet_services: Vec<FleetService>) -> Router {
        let handle = HANDLE
            .get_or_init(|| {
                PrometheusBuilder::new()
                    .install_recorder()
                    .expect("failed to install prometheus recorder")
            })
            .clone();

        let placement_path = temp_placement_file();

        build_app(AppState {
            prometheus: handle,
            auth: AuthConfig {
                hs256_secret: Some(b"test_secret".to_vec()),
            },
            jobs: JobStore::default(),
            audit: AuditStore::default(),
            tenant_locks: TenantLocks::default(),
            http: reqwest::Client::new(),
            placement: PlacementStore::new(placement_path),
            fleet_services,
            swarm: SwarmStore::new(repo_root().join("swarm/dev.json")),
        })
    }

    /// Repo root, assuming this crate lives two levels below it
    /// (control/api).
    fn repo_root() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .and_then(|p| p.parent())
            .expect("api crate should live under repo root")
            .to_path_buf()
    }

    /// Copy placement/dev.json to a unique temp path so tests that mutate
    /// placement don't interfere with each other or the checked-in file.
    fn temp_placement_file() -> PathBuf {
        let root = repo_root();
        let src = root.join("placement/dev.json");
        let mut dst = std::env::temp_dir();
        dst.push(format!(
            "cloudlysis-control-placement-{}-{}.json",
            std::process::id(),
            Uuid::new_v4()
        ));
        let raw = fs::read_to_string(src).expect("missing placement/dev.json");
        fs::write(&dst, raw).expect("failed to write temp placement file");
        dst
    }

    // Compile-time check helper: instantiating it for T proves Send + Sync.
    fn assert_send_sync<T: Send + Sync>() {}

    #[test]
    fn core_state_types_are_send_sync() {
        assert_send_sync::<AppState>();
        assert_send_sync::<JobStore>();
        assert_send_sync::<AuthConfig>();
    }

    #[tokio::test]
    async fn health_returns_200() {
        let res = test_app()
            .oneshot(
                Request::builder()
                    .uri("/health")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res.status(), StatusCode::OK);
    }

    #[tokio::test]
    async fn ready_returns_200() {
        let res = test_app()
            .oneshot(
                Request::builder()
                    .uri("/ready")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res.status(), StatusCode::OK);
    }

    #[tokio::test]
    async fn metrics_returns_200() {
        let res = test_app()
            .oneshot(
                Request::builder()
                    .uri("/metrics")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res.status(), StatusCode::OK);
    }

    /// Mint a short-lived HS256 token carrying the given permissions,
    /// signed with the same secret the test app is configured with.
    fn make_token(perms: &[&str]) -> String {
        let exp = (std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_secs()
            + 60) as usize;

        encode(
            &Header::default(),
            &TestClaims {
                sub: "user_1".to_string(),
                session_id: "sess_1".to_string(),
                permissions: perms.iter().map(|p| (*p).to_string()).collect(),
                exp,
            },
            &EncodingKey::from_secret(b"test_secret"),
        )
        .unwrap()
    }

    // No Authorization header -> 401 from the auth middleware.
    #[tokio::test]
    async fn unauthorized_admin_calls_return_401() {
        let res = test_app()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/platform/info")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res.status(), StatusCode::UNAUTHORIZED);
    }

    // Valid token but missing the write permission -> 403.
    #[tokio::test]
    async fn forbidden_admin_calls_return_403() {
        let token = make_token(&["control:read"]);
        let res = test_app()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/echo")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "k1")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res.status(), StatusCode::FORBIDDEN);
    }

    // Tenant-scoped routes reject requests lacking the x-tenant-id header.
    #[tokio::test]
    async fn tenant_scoped_endpoints_require_x_tenant_id() {
        let token = make_token(&["control:read"]);
        let res = test_app()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/tenants/echo")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res.status(), StatusCode::BAD_REQUEST);
    }

    // Two POSTs with the same idempotency key must yield the same job id.
    #[tokio::test]
    async fn job_create_is_idempotent() {
        let token = make_token(&["control:write"]);
        let app = test_app();
        let res1 = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/echo")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "same-key")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res1.status(), StatusCode::OK);
        let body1 = axum::body::to_bytes(res1.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let v1: serde_json::Value = serde_json::from_slice(&body1).unwrap();
        let id1 = Uuid::parse_str(v1.get("job_id").unwrap().as_str().unwrap()).unwrap();

        let res2 = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/echo")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "same-key")
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res2.status(), StatusCode::OK);
        let body2 = axum::body::to_bytes(res2.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let v2: serde_json::Value = serde_json::from_slice(&body2).unwrap();
        let id2 = Uuid::parse_str(v2.get("job_id").unwrap().as_str().unwrap()).unwrap();

        assert_eq!(id1, id2);
    }

    /// Poll the job endpoint until the job leaves Pending/Running, or give
    /// up after 500ms (reported as Failed so callers still assert something).
    async fn wait_for_terminal_status(app: Router, job_id: Uuid) -> JobStatus {
        let start = tokio::time::Instant::now();
        loop {
            let res = app
                .clone()
                .oneshot(
                    Request::builder()
                        .uri(format!("/admin/v1/jobs/{job_id}"))
                        .header(
                            header::AUTHORIZATION,
                            format!("Bearer {}", make_token(&["control:read"])),
                        )
                        .body(Body::empty())
                        .unwrap(),
                )
                .await
                .unwrap();

            if res.status() == StatusCode::OK {
                let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
                    .await
                    .unwrap();
                let job: crate::jobs::Job = serde_json::from_slice(&body).unwrap();
                if job.status != JobStatus::Pending && job.status != JobStatus::Running {
                    return job.status;
                }
            }

            if start.elapsed() > std::time::Duration::from_millis(500) {
                return JobStatus::Failed;
            }
            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
        }
    }

    // Same idempotency key on a tenant job -> same job id in both bodies.
    #[tokio::test]
    async fn tenant_job_idempotency_does_not_duplicate_effects() {
        let token = make_token(&["control:write", "control:read"]);
        let app = test_app();
        let tenant_id = Uuid::new_v4();

        let body = serde_json::json!({
            "tenant_id": tenant_id,
            "reason": "test",
        });

        let res1 = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/tenant/drain")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "same-key")
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(body.to_string()))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res1.status(), StatusCode::OK);

        let res2 = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/tenant/drain")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "same-key")
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(body.to_string()))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res2.status(), StatusCode::OK);

        let b1 = axum::body::to_bytes(res1.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let b2 = axum::body::to_bytes(res2.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let v1: serde_json::Value = serde_json::from_slice(&b1).unwrap();
        let v2: serde_json::Value = serde_json::from_slice(&b2).unwrap();
        assert_eq!(v1.get("job_id"), v2.get("job_id"));
    }

    // While a drain job holds the tenant lock, a migrate on the same tenant
    // must be rejected with 409.
    #[tokio::test]
    async fn tenant_lock_prevents_concurrent_mutations() {
        let token = make_token(&["control:write", "control:read"]);
        let app = test_app();
        let tenant_id = Uuid::new_v4();

        let res1 = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/tenant/drain")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "k1")
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(
                        serde_json::json!({ "tenant_id": tenant_id, "reason": "r" }).to_string(),
                    ))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res1.status(), StatusCode::OK);

        let res2 = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/tenant/migrate")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "k2")
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(
                        serde_json::json!({
                            "tenant_id": tenant_id,
                            "runner_target": "node-2",
                            "reason": "r2"
                        })
                        .to_string(),
                    ))
                    .unwrap(),
            )
            .await
            .unwrap();

        assert_eq!(res2.status(), StatusCode::CONFLICT);
    }

    // An unreachable fleet service should fail the "preflight" step, and
    // therefore the whole migrate job.
    #[tokio::test]
    async fn migrate_preflight_fails_when_fleet_not_ready() {
        let token = make_token(&["control:write", "control:read"]);
        let app = test_app_with_fleet(vec![FleetService {
            name: "unreachable".to_string(),
            base_url: "http://127.0.0.1:1".to_string(),
        }]);

        let tenant_id = Uuid::new_v4();
        let res = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/tenant/migrate")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "k3")
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(
                        serde_json::json!({
                            "tenant_id": tenant_id,
                            "runner_target": "node-2",
                            "reason": "r"
                        })
                        .to_string(),
                    ))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res.status(), StatusCode::OK);

        let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
        let job_id = Uuid::parse_str(v.get("job_id").unwrap().as_str().unwrap()).unwrap();

        let status = wait_for_terminal_status(app, job_id).await;
        assert_eq!(status, JobStatus::Failed);
    }

    // Cancelling a running migrate job drives it to the Cancelled state.
    #[tokio::test]
    async fn cancel_marks_job_cancelled() {
        let token = make_token(&["control:write", "control:read"]);
        let app = test_app();
        let tenant_id = Uuid::new_v4();

        let res = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/jobs/tenant/migrate")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header("idempotency-key", "k4")
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(
                        serde_json::json!({
                            "tenant_id": tenant_id,
                            "runner_target": "node-2",
                            "reason": "r"
                        })
                        .to_string(),
                    ))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res.status(), StatusCode::OK);

        let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
        let job_id = Uuid::parse_str(v.get("job_id").unwrap().as_str().unwrap()).unwrap();

        let res = app
            .clone()
            .oneshot(
                Request::builder()
                    .uri(format!("/admin/v1/jobs/{job_id}/cancel"))
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res.status(), StatusCode::OK);

        let status = wait_for_terminal_status(app, job_id).await;
        assert_eq!(status, JobStatus::Cancelled);
    }

    // The dry-run plan endpoint must always return the same ordered steps.
    #[tokio::test]
    async fn migration_plan_is_deterministic() {
        let token = make_token(&["control:write"]);
        let app = test_app();
        let tenant_id = Uuid::new_v4();

        let res = app
            .oneshot(
                Request::builder()
                    .uri("/admin/v1/plan/tenant/migrate")
                    .method("POST")
                    .header(header::AUTHORIZATION, format!("Bearer {token}"))
                    .header(header::CONTENT_TYPE, "application/json")
                    .body(Body::from(
                        serde_json::json!({
                            "tenant_id": tenant_id,
                            "runner_target": "node-2",
                            "reason": "r"
                        })
                        .to_string(),
                    ))
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(res.status(), StatusCode::OK);

        let body = axum::body::to_bytes(res.into_body(), 1024 * 1024)
            .await
            .unwrap();
        let v: serde_json::Value = serde_json::from_slice(&body).unwrap();
        assert_eq!(
            v.get("steps").unwrap(),
            &serde_json::json!(["preflight", "drain", "update_placement", "reload", "verify"])
        );
    }
}
|
||||
109
control/api/src/main.rs
Normal file
109
control/api/src/main.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
use clap::Parser;
|
||||
use metrics_exporter_prometheus::PrometheusBuilder;
|
||||
use std::net::SocketAddr;
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
/// Command-line / environment configuration for the control-api binary.
#[derive(Parser, Debug)]
#[command(name = "control-api")]
struct Args {
    /// Listen address; `--addr` flag or `CONTROL_API_ADDR` env var.
    #[arg(long, env = "CONTROL_API_ADDR", default_value = "127.0.0.1:8080")]
    addr: SocketAddr,
}
|
||||
|
||||
/// Process entry point: sets up tracing and the Prometheus recorder, builds
/// application state from environment variables, then serves HTTP until
/// Ctrl-C triggers graceful shutdown.
#[tokio::main]
async fn main() {
    let args = Args::parse();

    // RUST_LOG-style filtering; fall back to `info` when unset or invalid.
    tracing_subscriber::fmt()
        .with_env_filter(
            EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
        )
        .init();

    // Histogram buckets in milliseconds, matching the latency histogram
    // recorded by the request-id middleware.
    let recorder = PrometheusBuilder::new()
        .set_buckets(&[
            1.0, 2.5, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, 2500.0, 5000.0,
        ])
        .expect("invalid prometheus buckets")
        .install_recorder()
        .expect("failed to install prometheus recorder");

    let http = reqwest::Client::builder()
        .user_agent("cloudlysis-control-api")
        .build()
        .expect("failed to build http client");

    // File-backed stores; paths overridable for non-dev deployments.
    let placement_path = std::env::var("CONTROL_PLACEMENT_PATH")
        .ok()
        .unwrap_or_else(|| "placement/dev.json".to_string())
        .into();

    let swarm_path = std::env::var("CONTROL_SWARM_STATE_PATH")
        .ok()
        .unwrap_or_else(|| "swarm/dev.json".to_string())
        .into();

    // The API always includes itself in the probed fleet; extra services
    // come from CONTROL_FLEET_SERVICES ("name=url,name=url").
    let self_url = std::env::var("CONTROL_SELF_URL")
        .ok()
        .unwrap_or_else(|| "http://127.0.0.1:8080".to_string());

    let mut fleet_services = vec![api::FleetService {
        name: "control-api".to_string(),
        base_url: self_url,
    }];
    if let Ok(spec) = std::env::var("CONTROL_FLEET_SERVICES") {
        fleet_services.extend(parse_fleet_services(&spec));
    }

    let app = api::build_app(api::AppState {
        prometheus: recorder,
        auth: api::AuthConfig {
            // NOTE(review): when the secret env var is absent this is None —
            // presumably admin auth then rejects all tokens; confirm in the
            // auth module.
            hs256_secret: std::env::var("CONTROL_GATEWAY_JWT_HS256_SECRET")
                .ok()
                .map(|s| s.into_bytes()),
        },
        jobs: api::JobStore::default(),
        audit: api::AuditStore::default(),
        tenant_locks: api::TenantLocks::default(),
        http,
        placement: api::PlacementStore::new(placement_path),
        fleet_services,
        swarm: api::SwarmStore::new(swarm_path),
    });

    let listener = tokio::net::TcpListener::bind(args.addr)
        .await
        .expect("failed to bind");

    tracing::info!(addr = %args.addr, "control api listening");

    axum::serve(listener, app)
        .with_graceful_shutdown(shutdown_signal())
        .await
        .expect("server failed");
}
|
||||
|
||||
/// Resolves when Ctrl-C (SIGINT) is received; a signal-registration error
/// is ignored, which ends the wait immediately.
async fn shutdown_signal() {
    let _ = tokio::signal::ctrl_c().await;
}
|
||||
|
||||
/// Parse a comma-separated list of `name=url` pairs into fleet service
/// descriptors. Blank entries, entries without `=`, and entries with an
/// empty name or url are silently skipped.
fn parse_fleet_services(spec: &str) -> Vec<api::FleetService> {
    let mut services = Vec::new();
    for entry in spec.split(',') {
        let entry = entry.trim();
        if entry.is_empty() {
            continue;
        }
        // Split on the first '=' only, so URLs containing '=' survive.
        let Some((name, url)) = entry.split_once('=') else {
            continue;
        };
        let (name, url) = (name.trim(), url.trim());
        if !name.is_empty() && !url.is_empty() {
            services.push(api::FleetService {
                name: name.to_string(),
                base_url: url.to_string(),
            });
        }
    }
    services
}
|
||||
227
control/api/src/placement.rs
Normal file
227
control/api/src/placement.rs
Normal file
@@ -0,0 +1,227 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
collections::BTreeMap,
|
||||
fs,
|
||||
path::{Path, PathBuf},
|
||||
sync::{Arc, RwLock},
|
||||
time::SystemTime,
|
||||
};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Which class of service a placement request is for.
/// Serialized in snake_case over the API.
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ServiceKind {
    Aggregate,
    Projection,
    Runner,
}
|
||||
|
||||
/// On-disk placement document. All sections are optional so partial files
/// still parse; missing sections are treated as empty.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PlacementFile {
    /// Opaque revision label; defaults to "dev" when absent.
    pub revision: Option<String>,
    pub aggregate_placement: Option<PlacementKind>,
    pub projection_placement: Option<PlacementKind>,
    pub runner_placement: Option<PlacementKind>,
}
|
||||
|
||||
/// One section of the placement file: per-tenant placements for a single
/// service kind.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PlacementKind {
    pub placements: Vec<TenantPlacement>,
}
|
||||
|
||||
/// A tenant's assigned targets for one service kind.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TenantPlacement {
    pub tenant_id: Uuid,
    /// Target identifiers (e.g. node names) this tenant is placed on.
    pub targets: Vec<String>,
}
|
||||
|
||||
/// API response: the placement list for one service kind, stamped with the
/// file revision it was derived from.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PlacementResponse {
    pub kind: ServiceKind,
    pub revision: String,
    pub placements: Vec<TenantPlacement>,
}
|
||||
|
||||
impl PlacementFile {
|
||||
pub fn load(path: &Path) -> Option<Self> {
|
||||
let raw = fs::read_to_string(path).ok()?;
|
||||
serde_json::from_str(&raw).ok()
|
||||
}
|
||||
|
||||
pub fn for_kind(&self, kind: ServiceKind) -> PlacementResponse {
|
||||
let revision = self.revision.clone().unwrap_or_else(|| "dev".to_string());
|
||||
let placements = match kind {
|
||||
ServiceKind::Aggregate => self
|
||||
.aggregate_placement
|
||||
.as_ref()
|
||||
.map(|p| p.placements.clone())
|
||||
.unwrap_or_default(),
|
||||
ServiceKind::Projection => self
|
||||
.projection_placement
|
||||
.as_ref()
|
||||
.map(|p| p.placements.clone())
|
||||
.unwrap_or_default(),
|
||||
ServiceKind::Runner => self
|
||||
.runner_placement
|
||||
.as_ref()
|
||||
.map(|p| p.placements.clone())
|
||||
.unwrap_or_default(),
|
||||
};
|
||||
|
||||
PlacementResponse {
|
||||
kind,
|
||||
revision,
|
||||
placements,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Thread-safe handle to the file-backed placement state; `Clone` shares
/// the same underlying state.
#[derive(Clone)]
pub struct PlacementStore {
    inner: Arc<RwLock<Inner>>,
}
|
||||
|
||||
/// State behind `PlacementStore`: the backing file path plus an mtime-based
/// cache of its parsed contents.
struct Inner {
    path: PathBuf,
    /// Modification time observed at the last (re)load; used to detect
    /// on-disk changes.
    last_modified: Option<SystemTime>,
    /// Parsed file; `None` until first successful load.
    cached: Option<PlacementFile>,
}
|
||||
|
||||
impl PlacementStore {
    /// Create a store backed by the placement file at `path`.
    ///
    /// Nothing is read here; the cache starts empty and is populated lazily
    /// on the first accessor call.
    pub fn new(path: PathBuf) -> Self {
        Self {
            inner: Arc::new(RwLock::new(Inner {
                path,
                last_modified: None,
                cached: None,
            })),
        }
    }

    /// Return the current placements for `kind`, reloading from disk first if
    /// the file changed.
    ///
    /// Takes the write lock even though this is conceptually a read, because
    /// a reload may mutate the cache. Falls back to an empty response with
    /// revision "dev" when no placement file could be loaded.
    pub fn get_for_kind(&self, kind: ServiceKind) -> PlacementResponse {
        let mut inner = self.inner.write().expect("placement lock poisoned");
        inner.reload_if_changed();
        match inner.cached.as_ref() {
            Some(p) => p.for_kind(kind),
            None => PlacementResponse {
                kind,
                revision: "dev".to_string(),
                placements: vec![],
            },
        }
    }

    /// Roll up the placement file into one summary per tenant, covering all
    /// three service kinds. Returns an empty list when no file is loaded.
    pub fn tenant_summaries(&self) -> Vec<TenantSummary> {
        let mut inner = self.inner.write().expect("placement lock poisoned");
        inner.reload_if_changed();

        let Some(p) = inner.cached.as_ref() else {
            return vec![];
        };

        // BTreeMap keyed by tenant id gives a deterministic output order.
        let mut map: BTreeMap<Uuid, TenantSummary> = BTreeMap::new();

        for (kind, placements) in [
            (
                ServiceKind::Aggregate,
                p.for_kind(ServiceKind::Aggregate).placements,
            ),
            (
                ServiceKind::Projection,
                p.for_kind(ServiceKind::Projection).placements,
            ),
            (
                ServiceKind::Runner,
                p.for_kind(ServiceKind::Runner).placements,
            ),
        ] {
            for tp in placements {
                let entry = map.entry(tp.tenant_id).or_insert_with(|| TenantSummary {
                    tenant_id: tp.tenant_id,
                    aggregate_targets: vec![],
                    projection_targets: vec![],
                    runner_targets: vec![],
                });
                // Last-write-wins per kind: each kind appears once in the
                // loop above, so this simply fills the matching field.
                match kind {
                    ServiceKind::Aggregate => entry.aggregate_targets = tp.targets,
                    ServiceKind::Projection => entry.projection_targets = tp.targets,
                    ServiceKind::Runner => entry.runner_targets = tp.targets,
                }
            }
        }

        map.into_values().collect()
    }

    /// Replace the runner placement for `tenant_id` with the single
    /// `runner_target`, persist the file atomically, and return the new
    /// revision string.
    ///
    /// Errors are stringified serialization or I/O failures.
    pub fn update_runner_target(
        &self,
        tenant_id: Uuid,
        runner_target: String,
    ) -> Result<String, String> {
        let mut inner = self.inner.write().expect("placement lock poisoned");
        inner.reload_if_changed();

        // Start from the cached file, or an empty skeleton when none exists yet.
        let mut file = inner.cached.clone().unwrap_or(PlacementFile {
            revision: Some("dev".to_string()),
            aggregate_placement: Some(PlacementKind { placements: vec![] }),
            projection_placement: Some(PlacementKind { placements: vec![] }),
            runner_placement: Some(PlacementKind { placements: vec![] }),
        });

        let mut runner = file
            .runner_placement
            .take()
            .unwrap_or(PlacementKind { placements: vec![] });

        // Upsert: overwrite the tenant's targets when present, append otherwise.
        if let Some(existing) = runner
            .placements
            .iter_mut()
            .find(|p| p.tenant_id == tenant_id)
        {
            existing.targets = vec![runner_target];
        } else {
            runner.placements.push(TenantPlacement {
                tenant_id,
                targets: vec![runner_target],
            });
        }

        // Keep the file deterministic on disk.
        runner.placements.sort_by_key(|p| p.tenant_id);
        file.runner_placement = Some(runner);

        let revision = format!("rev-{}", Uuid::new_v4());
        file.revision = Some(revision.clone());

        // Atomic swap: write to a sibling temp file, then rename over the
        // original so readers never observe a partially written file.
        let raw = serde_json::to_string_pretty(&file).map_err(|e| e.to_string())?;
        let tmp = inner.path.with_extension("json.tmp");
        fs::write(&tmp, raw).map_err(|e| e.to_string())?;
        fs::rename(&tmp, &inner.path).map_err(|e| e.to_string())?;

        // Drop the recorded mtime so the next read re-stats the renamed file,
        // and publish the new contents to the in-memory cache immediately.
        inner.last_modified = None;
        inner.cached = Some(file);

        Ok(revision)
    }
}
|
||||
|
||||
impl Inner {
|
||||
fn reload_if_changed(&mut self) {
|
||||
let meta = fs::metadata(&self.path).ok();
|
||||
let modified = meta.and_then(|m| m.modified().ok());
|
||||
|
||||
if self.cached.is_some() && modified.is_some() && modified == self.last_modified {
|
||||
return;
|
||||
}
|
||||
|
||||
self.last_modified = modified;
|
||||
self.cached = PlacementFile::load(&self.path);
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-tenant roll-up of placement targets across all service kinds,
/// as returned by `PlacementStore::tenant_summaries`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TenantSummary {
    pub tenant_id: Uuid,
    // Target addresses for the tenant's aggregate service instances.
    pub aggregate_targets: Vec<String>,
    // Target addresses for the tenant's projection service instances.
    pub projection_targets: Vec<String>,
    // Target addresses for the tenant's runner service instances.
    pub runner_targets: Vec<String>,
}
|
||||
62
control/api/src/swarm.rs
Normal file
62
control/api/src/swarm.rs
Normal file
@@ -0,0 +1,62 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{fs, path::Path};
|
||||
|
||||
/// One Docker Swarm service as recorded in the swarm state file.
///
/// All descriptive fields are optional because the state file mirrors
/// whatever the producer was able to capture.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmService {
    pub name: String,
    pub image: Option<String>,
    // Service mode, e.g. "replicated" — presumably from `docker service ls`; verify against producer.
    pub mode: Option<String>,
    // Replica summary such as "1/1"; kept as a string, not parsed.
    pub replicas: Option<String>,
    pub updated_at: Option<String>,
}
|
||||
|
||||
/// One Docker Swarm task (a service replica instance) from the state file.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmTask {
    pub id: String,
    // Name of the owning service; used to filter tasks per service.
    pub service: String,
    pub node: Option<String>,
    pub desired_state: Option<String>,
    pub current_state: Option<String>,
    pub error: Option<String>,
}
|
||||
|
||||
/// On-disk snapshot of swarm state: the full service and task lists.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SwarmStateFile {
    pub services: Vec<SwarmService>,
    pub tasks: Vec<SwarmTask>,
}
|
||||
|
||||
/// Read-only accessor over a swarm state file; re-reads the file on every
/// call rather than caching.
#[derive(Clone)]
pub struct SwarmStore {
    path: std::path::PathBuf,
}
|
||||
|
||||
impl SwarmStore {
|
||||
pub fn new(path: std::path::PathBuf) -> Self {
|
||||
Self { path }
|
||||
}
|
||||
|
||||
pub fn list_services(&self) -> Vec<SwarmService> {
|
||||
self.load().map(|s| s.services).unwrap_or_default()
|
||||
}
|
||||
|
||||
pub fn list_tasks(&self, service_name: &str) -> Vec<SwarmTask> {
|
||||
self.load()
|
||||
.map(|s| {
|
||||
s.tasks
|
||||
.into_iter()
|
||||
.filter(|t| t.service == service_name)
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
fn load(&self) -> Option<SwarmStateFile> {
|
||||
load_state(&self.path)
|
||||
}
|
||||
}
|
||||
|
||||
fn load_state(path: &Path) -> Option<SwarmStateFile> {
|
||||
let raw = fs::read_to_string(path).ok()?;
|
||||
serde_json::from_str(&raw).ok()
|
||||
}
|
||||
16
control/api/tests/annotations.rs
Normal file
16
control/api/tests/annotations.rs
Normal file
@@ -0,0 +1,16 @@
|
||||
// The deploy annotation builder should emit the Grafana payload with the
// timestamp, the full tag set, and a human-readable text.
#[test]
fn annotation_writer_produces_expected_grafana_payload() {
    let annotation = api::build_grafana_deploy_annotation(api::DeployAnnotationArgs {
        service: "gateway",
        version: Some("1.2.3"),
        git_sha: Some("abc123"),
        time_ms: 1234567890,
    });

    assert_eq!(annotation.time, 1234567890);
    for expected in ["deploy", "service:gateway", "version:1.2.3", "git_sha:abc123"] {
        assert!(annotation.tags.iter().any(|t| t == expected));
    }
    assert!(annotation.text.contains("deploy gateway"));
}
|
||||
39
control/api/tests/build_info.rs
Normal file
39
control/api/tests/build_info.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
// The build-info parser should pick up every `*_build_info` series and
// ignore unrelated metrics and comment lines.
#[test]
fn build_info_parser_extracts_expected_labels() {
    let metrics = r#"
# HELP gateway_build_info build info
# TYPE gateway_build_info gauge
gateway_build_info{service="gateway",version="1.2.3",git_sha="abc"} 1
runner_build_info{service="runner",version="2.0.0",git_sha="def"} 1
unrelated_metric 5
"#;

    let info = api::extract_build_info(metrics);
    assert_eq!(info.len(), 2);

    let has = |service: &str, version: &str, git_sha: &str| {
        info.iter()
            .any(|i| i.service == service && i.version == version && i.git_sha == git_sha)
    };
    assert!(has("gateway", "1.2.3", "abc"));
    assert!(has("runner", "2.0.0", "def"));
}
|
||||
|
||||
// Every core fleet service must be represented in a build-info snapshot.
#[test]
fn build_info_snapshot_has_required_services() {
    let metrics = r#"
gateway_build_info{service="gateway",version="1.2.3",git_sha="abc"} 1
aggregate_build_info{service="aggregate",version="1.0.0",git_sha="aaa"} 1
projection_build_info{service="projection",version="1.0.0",git_sha="bbb"} 1
runner_build_info{service="runner",version="2.0.0",git_sha="ccc"} 1
"#;

    let info = api::extract_build_info(metrics);
    let has_service = |name: &str| info.iter().any(|i| i.service == name);
    for required in ["gateway", "aggregate", "projection", "runner"] {
        assert!(
            has_service(required),
            "missing build_info for service={required}"
        );
    }
}
|
||||
55
control/api/tests/docker_config_validation.rs
Normal file
55
control/api/tests/docker_config_validation.rs
Normal file
@@ -0,0 +1,55 @@
|
||||
use std::{fs, path::PathBuf, time::Duration};
|
||||
|
||||
fn repo_root() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.and_then(|p| p.parent())
|
||||
.expect("api crate should live under repo root")
|
||||
.to_path_buf()
|
||||
}
|
||||
|
||||
// The observability compose file must be valid YAML and define every
// service the stack depends on.
#[test]
fn docker_compose_files_parse_and_include_required_services() {
    let compose_path = repo_root().join("observability/docker-compose.yml");
    let compose = fs::read_to_string(compose_path).unwrap();
    let v: serde_yaml::Value = serde_yaml::from_str(&compose).unwrap();

    let services = v
        .get("services")
        .and_then(|x| x.as_mapping())
        .expect("missing services");

    let required_services = ["grafana", "victoria-metrics", "vmagent", "loki", "tempo"];
    for required in required_services {
        assert!(
            services.contains_key(serde_yaml::Value::String(required.to_string())),
            "missing service {required}"
        );
    }
}
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore]
|
||||
async fn docker_compose_config_validation_is_gated_and_fast() {
|
||||
let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
|
||||
assert_eq!(enabled.as_deref(), Some("1"));
|
||||
|
||||
let root = repo_root();
|
||||
let compose = root.join("observability/docker-compose.yml");
|
||||
|
||||
let cmd = tokio::process::Command::new("docker")
|
||||
.args(["compose", "-f"])
|
||||
.arg(compose)
|
||||
.args(["config"])
|
||||
.output();
|
||||
|
||||
let out = tokio::time::timeout(Duration::from_secs(10), cmd)
|
||||
.await
|
||||
.expect("docker compose config timed out")
|
||||
.expect("failed to run docker compose config");
|
||||
|
||||
assert!(
|
||||
out.status.success(),
|
||||
"docker compose config failed: {}",
|
||||
String::from_utf8_lossy(&out.stderr)
|
||||
);
|
||||
}
|
||||
6
control/api/tests/docker_gated.rs
Normal file
6
control/api/tests/docker_gated.rs
Normal file
@@ -0,0 +1,6 @@
|
||||
// Sanity check that docker-backed tests only run when explicitly opted in
// via CONTROL_TEST_DOCKER=1 (the test itself is also `#[ignore]`d).
#[test]
#[ignore]
fn docker_integration_tests_are_gated() {
    let enabled = std::env::var("CONTROL_TEST_DOCKER");
    assert_eq!(enabled.ok().as_deref(), Some("1"));
}
|
||||
183
control/api/tests/e2e_control_plane_fleet_docker.rs
Normal file
183
control/api/tests/e2e_control_plane_fleet_docker.rs
Normal file
@@ -0,0 +1,183 @@
|
||||
use jsonwebtoken::{EncodingKey, Header, encode};
|
||||
use serde::Serialize;
|
||||
use std::{fs, net::TcpListener, time::Duration};
|
||||
|
||||
/// JWT claims payload signed into test tokens for the control API.
#[derive(Serialize)]
struct Claims {
    sub: String,
    session_id: String,
    permissions: Vec<String>,
    // Expiry as seconds since the Unix epoch.
    exp: usize,
}
|
||||
|
||||
/// Ask the OS for an ephemeral TCP port on loopback and return it.
/// The listener is dropped immediately, so the port is merely *likely* free.
fn free_port() -> u16 {
    let listener = TcpListener::bind("127.0.0.1:0").unwrap();
    listener.local_addr().unwrap().port()
}
|
||||
|
||||
fn token(secret: &[u8], perms: &[&str]) -> String {
|
||||
let exp = (std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs()
|
||||
+ 60) as usize;
|
||||
|
||||
encode(
|
||||
&Header::default(),
|
||||
&Claims {
|
||||
sub: "op_1".to_string(),
|
||||
session_id: "sess_1".to_string(),
|
||||
permissions: perms.iter().map(|p| (*p).to_string()).collect(),
|
||||
exp,
|
||||
},
|
||||
&EncodingKey::from_secret(secret),
|
||||
)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn wait_ready(url: &str) {
|
||||
let client = reqwest::Client::new();
|
||||
let start = tokio::time::Instant::now();
|
||||
loop {
|
||||
let ok = client
|
||||
.get(format!("{url}/ready"))
|
||||
.send()
|
||||
.await
|
||||
.map(|r| r.status().is_success())
|
||||
.unwrap_or(false);
|
||||
if ok {
|
||||
return;
|
||||
}
|
||||
if start.elapsed() > Duration::from_secs(10) {
|
||||
panic!("control-api did not become ready");
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(100)).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// End-to-end: boots four nginx stub containers (one per fleet service),
/// starts the control-api binary pointed at them, then exercises the admin
/// fleet-snapshot and tenants endpoints with a real JWT.
/// Gated behind CONTROL_TEST_DOCKER=1 and `--ignored`.
#[tokio::test]
#[ignore]
async fn control_plane_can_see_the_fleet_via_docker_stubs() {
    let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
    assert_eq!(enabled.as_deref(), Some("1"));

    // Minimal nginx vhost answering the health/ready/metrics probes the
    // control plane performs against each fleet service.
    let nginx_conf = r#"
server {
    listen 80;
    server_name _;

    location = /health { return 200 "ok\n"; }
    location = /ready { return 200 "ready\n"; }
    location = /metrics { return 200 "stub_build_info{service=\"stub\",version=\"dev\",git_sha=\"000\"} 1\n"; }
}
"#;

    let mut conf_path = std::env::temp_dir();
    conf_path.push(format!(
        "cloudlysis-control-nginx-{}.conf",
        uuid::Uuid::new_v4()
    ));
    fs::write(&conf_path, nginx_conf).unwrap();

    let gateway_port = free_port();
    let runner_port = free_port();
    let aggregate_port = free_port();
    let projection_port = free_port();

    // Start one detached nginx container serving the stub config on `port`;
    // returns the container id for later cleanup.
    async fn run_stub(name: &str, port: u16, conf: &std::path::Path) -> String {
        let out = tokio::process::Command::new("docker")
            .args(["run", "-d", "--rm"])
            .args(["-p", &format!("{port}:80")])
            .args([
                "-v",
                &format!("{}:/etc/nginx/conf.d/default.conf:ro", conf.display()),
            ])
            .arg("nginx:1.29-alpine")
            .output()
            .await
            .expect("failed to run docker");
        assert!(
            out.status.success(),
            "{name} stub failed: {}",
            String::from_utf8_lossy(&out.stderr)
        );
        String::from_utf8_lossy(&out.stdout).trim().to_string()
    }

    let gateway_id = run_stub("gateway", gateway_port, &conf_path).await;
    let runner_id = run_stub("runner", runner_port, &conf_path).await;
    let aggregate_id = run_stub("aggregate", aggregate_port, &conf_path).await;
    let projection_id = run_stub("projection", projection_port, &conf_path).await;

    let secret = b"e2e_secret";
    let api_port = free_port();
    let api_url = format!("http://127.0.0.1:{api_port}");

    // Seed an empty placement file so the API starts with a known revision.
    let mut placement_path = std::env::temp_dir();
    placement_path.push(format!(
        "cloudlysis-control-placement-{}.json",
        uuid::Uuid::new_v4()
    ));
    fs::write(
        &placement_path,
        r#"{"revision":"e2e","aggregate_placement":{"placements":[]},"projection_placement":{"placements":[]},"runner_placement":{"placements":[]}}"#,
    )
    .unwrap();

    // Spawn the control-api binary with the fleet pointed at the stubs.
    let mut child = tokio::process::Command::new(env!("CARGO_BIN_EXE_api"))
        .env("CONTROL_API_ADDR", format!("127.0.0.1:{api_port}"))
        .env("CONTROL_GATEWAY_JWT_HS256_SECRET", "e2e_secret")
        .env("CONTROL_PLACEMENT_PATH", placement_path.to_string_lossy().to_string())
        .env(
            "CONTROL_FLEET_SERVICES",
            format!(
                "gateway=http://127.0.0.1:{gateway_port},aggregate=http://127.0.0.1:{aggregate_port},projection=http://127.0.0.1:{projection_port},runner=http://127.0.0.1:{runner_port}"
            ),
        )
        .spawn()
        .expect("failed to spawn control-api");

    wait_ready(&api_url).await;

    let client = reqwest::Client::new();
    let t = token(secret, &["control:read"]);

    // Fleet snapshot should include the four stubs plus control-api itself.
    let res = client
        .get(format!("{api_url}/admin/v1/fleet/snapshot"))
        .header(reqwest::header::AUTHORIZATION, format!("Bearer {t}"))
        .send()
        .await
        .unwrap();
    assert!(res.status().is_success());

    let v: serde_json::Value = res.json().await.unwrap();
    let services = v.get("services").and_then(|x| x.as_array()).unwrap();
    assert!(
        services.len() >= 5,
        "expected at least 5 services (including control-api), got {}",
        services.len()
    );

    let res = client
        .get(format!("{api_url}/admin/v1/tenants"))
        .header(reqwest::header::AUTHORIZATION, format!("Bearer {t}"))
        .send()
        .await
        .unwrap();
    assert!(res.status().is_success());

    let _ = child.kill().await;

    // Best-effort teardown: stop the stubs (auto-removed via --rm) and
    // delete the temp files; failures here don't fail the test.
    for id in [gateway_id, runner_id, aggregate_id, projection_id] {
        let _ = tokio::process::Command::new("docker")
            .args(["stop", &id])
            .output()
            .await;
    }

    let _ = fs::remove_file(&conf_path);
    let _ = fs::remove_file(&placement_path);
}
|
||||
30
control/api/tests/fleet_services_env.rs
Normal file
30
control/api/tests/fleet_services_env.rs
Normal file
@@ -0,0 +1,30 @@
|
||||
// The CONTROL_FLEET_SERVICES parser should tolerate stray whitespace, empty
// segments, missing `=`, and blank names/urls, keeping only valid pairs.
#[test]
fn fleet_services_env_parser_is_lenient() {
    fn parse(spec: &str) -> Vec<api::FleetService> {
        let mut services = Vec::new();
        for segment in spec.split(',') {
            let segment = segment.trim();
            if segment.is_empty() {
                continue;
            }
            let Some((name, url)) = segment.split_once('=') else {
                continue;
            };
            let (name, url) = (name.trim(), url.trim());
            if name.is_empty() || url.is_empty() {
                continue;
            }
            services.push(api::FleetService {
                name: name.to_string(),
                base_url: url.to_string(),
            });
        }
        services
    }

    let services = parse(" gateway=http://x , ,runner=http://y,broken, =http://z ");

    assert_eq!(services.len(), 2);
    assert_eq!(services[0].name, "gateway");
    assert_eq!(services[1].name, "runner");
}
|
||||
23
control/api/tests/nats_gated.rs
Normal file
23
control/api/tests/nats_gated.rs
Normal file
@@ -0,0 +1,23 @@
|
||||
use std::time::Duration;
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore]
|
||||
async fn nats_integration_tests_are_gated_and_fast_fail() {
|
||||
let url = std::env::var("CONTROL_TEST_NATS_URL").expect("CONTROL_TEST_NATS_URL is required");
|
||||
|
||||
let without_scheme = url.strip_prefix("nats://").unwrap_or(url.as_str());
|
||||
let hostport = without_scheme.split('/').next().unwrap_or(without_scheme);
|
||||
let mut parts = hostport.split(':');
|
||||
let host = parts.next().unwrap_or("127.0.0.1");
|
||||
let port: u16 = parts
|
||||
.next()
|
||||
.unwrap_or("4222")
|
||||
.parse()
|
||||
.expect("invalid port in CONTROL_TEST_NATS_URL");
|
||||
|
||||
let connect = tokio::net::TcpStream::connect((host, port));
|
||||
tokio::time::timeout(Duration::from_secs(2), connect)
|
||||
.await
|
||||
.expect("tcp connect to NATS timed out")
|
||||
.expect("failed to connect to NATS");
|
||||
}
|
||||
75
control/api/tests/observability_configs.rs
Normal file
75
control/api/tests/observability_configs.rs
Normal file
@@ -0,0 +1,75 @@
|
||||
use std::{collections::BTreeSet, fs, path::PathBuf};
|
||||
|
||||
fn repo_root() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.and_then(|p| p.parent())
|
||||
.expect("api crate should live under repo root")
|
||||
.to_path_buf()
|
||||
}
|
||||
|
||||
// Both Grafana provisioning files must exist and parse as YAML.
#[test]
fn grafana_provisioning_files_are_syntactically_valid() {
    let root = repo_root();

    let files = [
        (
            "observability/grafana/provisioning/datasources/datasources.yml",
            "missing grafana datasources provisioning file",
            "invalid grafana datasources yaml",
        ),
        (
            "observability/grafana/provisioning/dashboards/dashboards.yml",
            "missing grafana dashboards provisioning file",
            "invalid grafana dashboards yaml",
        ),
    ];

    for (rel, missing_msg, invalid_msg) in files {
        let raw = fs::read_to_string(root.join(rel)).expect(missing_msg);
        let _: serde_yaml::Value = serde_yaml::from_str(&raw).expect(invalid_msg);
    }
}
|
||||
|
||||
// Every .json file under the dashboards dir must be valid JSON, and there
// must be at least one dashboard.
#[test]
fn grafana_dashboards_are_syntactically_valid_json() {
    let dashboards_dir = repo_root().join("observability/grafana/dashboards");

    let mut found = 0usize;
    for entry in fs::read_dir(&dashboards_dir).expect("missing dashboards dir") {
        let path = entry.expect("failed to read dashboards dir entry").path();
        let is_json = path.extension().and_then(|e| e.to_str()) == Some("json");
        if !is_json {
            continue;
        }
        found += 1;
        let raw = fs::read_to_string(&path).expect("failed to read dashboard json");
        if let Err(e) = serde_json::from_str::<serde_json::Value>(&raw) {
            panic!("{path:?}: {e}");
        }
    }

    assert!(found > 0, "expected at least one dashboard json file");
}
|
||||
|
||||
// The vmagent scrape config must parse as YAML and declare the core jobs.
#[test]
fn vmagent_config_parses_and_includes_required_jobs() {
    let scrape = fs::read_to_string(repo_root().join("observability/vmagent/scrape.yml"))
        .expect("missing vmagent scrape config");
    let value: serde_yaml::Value =
        serde_yaml::from_str(&scrape).expect("invalid vmagent scrape yaml");

    // Collect every job_name under scrape_configs (empty when absent).
    let job_names: BTreeSet<String> = value
        .get("scrape_configs")
        .and_then(|v| v.as_sequence())
        .map(|configs| {
            configs
                .iter()
                .filter_map(|cfg| cfg.get("job_name").and_then(|v| v.as_str()))
                .map(str::to_string)
                .collect()
        })
        .unwrap_or_default();

    for required in ["victoria-metrics", "vmagent", "control-api"] {
        assert!(
            job_names.contains(required),
            "vmagent scrape config missing required job_name={required}"
        );
    }
}
|
||||
61
control/api/tests/observability_smoke_docker.rs
Normal file
61
control/api/tests/observability_smoke_docker.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
use std::{
|
||||
net::TcpStream,
|
||||
path::PathBuf,
|
||||
process::Command,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
fn repo_root() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.and_then(|p| p.parent())
|
||||
.expect("api crate should live under repo root")
|
||||
.to_path_buf()
|
||||
}
|
||||
|
||||
/// Poll `addr` (e.g. "127.0.0.1:3000") until a TCP connection succeeds or
/// `timeout` elapses, retrying every 250ms with a 1s per-attempt budget.
///
/// Returns `true` on the first successful connection, `false` on timeout.
///
/// # Panics
/// Panics if `addr` is not a valid socket address.
fn wait_for_tcp(addr: &str, timeout: Duration) -> bool {
    // Parse once up front instead of re-parsing (and re-running the panic
    // check) on every retry iteration.
    let addr = addr.parse().expect("invalid socket addr");
    let start = Instant::now();
    while start.elapsed() < timeout {
        if TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() {
            return true;
        }
        std::thread::sleep(Duration::from_millis(250));
    }
    false
}
|
||||
|
||||
/// Brings up the observability compose stack and waits for Grafana (3000),
/// VictoriaMetrics (8428), Loki (3100), and Tempo (3200) to accept TCP.
/// Always tears the stack down before asserting, so a failure doesn't leak
/// containers. Gated behind CONTROL_TEST_DOCKER=1 and `--ignored`.
#[test]
#[ignore]
fn observability_stack_reaches_healthy_state_fast() {
    let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
    assert_eq!(enabled.as_deref(), Some("1"));

    let root = repo_root();
    let compose = root.join("observability/docker-compose.yml");

    let up = Command::new("docker")
        .args(["compose", "-f"])
        .arg(&compose)
        .args(["up", "-d"])
        .status()
        .expect("failed to run docker compose up");
    assert!(up.success(), "docker compose up failed");

    // Each port gets its own 30s budget; overall worst case is the sum.
    let ok = wait_for_tcp("127.0.0.1:3000", Duration::from_secs(30))
        && wait_for_tcp("127.0.0.1:8428", Duration::from_secs(30))
        && wait_for_tcp("127.0.0.1:3100", Duration::from_secs(30))
        && wait_for_tcp("127.0.0.1:3200", Duration::from_secs(30));

    // Tear down unconditionally before asserting readiness.
    let _ = Command::new("docker")
        .args(["compose", "-f"])
        .arg(&compose)
        .args(["down", "-v"])
        .status();

    assert!(ok, "observability stack did not become reachable in time");
}
|
||||
43
control/api/tests/placement_hot_reload.rs
Normal file
43
control/api/tests/placement_hot_reload.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
use std::{fs, path::PathBuf, thread, time::Duration};
|
||||
|
||||
use api::PlacementStore;
|
||||
|
||||
/// Build a unique temp-file path (pid + nanosecond timestamp) under the OS
/// temp dir, named `cloudlysis-control-{name}-...json`.
fn tmp_file(name: &str) -> PathBuf {
    let nanos = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_nanos();
    let file_name = format!(
        "cloudlysis-control-{name}-{}-{}.json",
        std::process::id(),
        nanos
    );
    let mut path = std::env::temp_dir();
    path.push(file_name);
    path
}
|
||||
|
||||
/// Writing a new placement file on disk should be picked up by the store on
/// the next read, without recreating the store.
#[test]
fn placement_store_hot_reload_swaps_atomically() {
    let path = tmp_file("placement");
    fs::write(
        &path,
        r#"{"revision":"r1","aggregate_placement":{"placements":[]},"projection_placement":{"placements":[]},"runner_placement":{"placements":[]}}"#,
    )
    .unwrap();

    let store = PlacementStore::new(path.clone());
    let a1 = store.get_for_kind(api::ServiceKind::Aggregate);
    assert_eq!(a1.revision, "r1");

    // NOTE(review): the reload is mtime-driven, so this relies on the second
    // write landing with a newer mtime; on filesystems with coarse (e.g. 1s)
    // mtime granularity a 5ms gap may not be enough — confirm on CI targets.
    thread::sleep(Duration::from_millis(5));

    fs::write(
        &path,
        r#"{"revision":"r2","aggregate_placement":{"placements":[]},"projection_placement":{"placements":[]},"runner_placement":{"placements":[]}}"#,
    )
    .unwrap();

    let a2 = store.get_for_kind(api::ServiceKind::Aggregate);
    assert_eq!(a2.revision, "r2");

    let _ = fs::remove_file(&path);
}
|
||||
31
control/api/tests/swarm_client.rs
Normal file
31
control/api/tests/swarm_client.rs
Normal file
@@ -0,0 +1,31 @@
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
// The SwarmStore should faithfully reflect the state file: one service, one
// matching task, and nothing for unknown service names.
#[test]
fn swarm_store_is_deterministic_from_file() {
    let mut path = std::env::temp_dir();
    path.push(format!(
        "cloudlysis-control-swarm-{}-{}.json",
        std::process::id(),
        uuid::Uuid::new_v4()
    ));

    let state = r#"{"services":[{"name":"gateway","image":"x","mode":"replicated","replicas":"1/1","updated_at":null}],"tasks":[{"id":"t1","service":"gateway","node":"n1","desired_state":"running","current_state":"running","error":null}]}"#;
    fs::write(&path, state).unwrap();

    let store = api::SwarmStore::new(PathBuf::from(&path));

    let services = store.list_services();
    assert_eq!(services.len(), 1);
    assert_eq!(services[0].name, "gateway");

    let gateway_tasks = store.list_tasks("gateway");
    assert_eq!(gateway_tasks.len(), 1);
    assert_eq!(gateway_tasks[0].id, "t1");

    assert_eq!(store.list_tasks("missing").len(), 0);

    let _ = fs::remove_file(&path);
}
|
||||
42
control/api/tests/swarm_smoke_docker.rs
Normal file
42
control/api/tests/swarm_smoke_docker.rs
Normal file
@@ -0,0 +1,42 @@
|
||||
use std::time::Duration;
|
||||
|
||||
/// Deploys the control-plane stack to a local swarm, lists services, and
/// removes the stack again — every docker invocation is wrapped in a timeout
/// so a wedged daemon cannot hang the suite.
/// Gated behind CONTROL_TEST_DOCKER=1 and `--ignored`.
#[tokio::test]
#[ignore]
async fn docker_swarm_smoke_test_is_gated_and_times_out() {
    let enabled = std::env::var("CONTROL_TEST_DOCKER").ok();
    assert_eq!(enabled.as_deref(), Some("1"));

    let stack = "cloudlysis_control_test";
    // Stack file lives at <repo root>/swarm/stacks/control-plane.yml.
    let compose = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .and_then(|p| p.parent())
        .unwrap()
        .join("swarm/stacks/control-plane.yml");

    let deploy = tokio::process::Command::new("docker")
        .args(["stack", "deploy", "-c"])
        .arg(&compose)
        .arg(stack)
        .output();

    let out = tokio::time::timeout(Duration::from_secs(30), deploy)
        .await
        .expect("docker stack deploy timed out")
        .expect("failed to run docker stack deploy");
    assert!(
        out.status.success(),
        "docker stack deploy failed: {}",
        String::from_utf8_lossy(&out.stderr)
    );

    // Listing is informational only; the result is deliberately ignored.
    let ls = tokio::process::Command::new("docker")
        .args(["service", "ls"])
        .output();
    let _ = tokio::time::timeout(Duration::from_secs(10), ls).await;

    // Best-effort cleanup of the deployed stack.
    let rm = tokio::process::Command::new("docker")
        .args(["stack", "rm"])
        .arg(stack)
        .output();
    let _ = tokio::time::timeout(Duration::from_secs(10), rm).await;
}
|
||||
40
control/api/tests/swarm_stack_yaml.rs
Normal file
40
control/api/tests/swarm_stack_yaml.rs
Normal file
@@ -0,0 +1,40 @@
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
fn repo_root() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.and_then(|p| p.parent())
|
||||
.expect("api crate should live under repo root")
|
||||
.to_path_buf()
|
||||
}
|
||||
|
||||
// Both swarm stack files must parse as YAML.
#[test]
fn stack_files_parse_as_yaml() {
    let root = repo_root();
    let stacks = [
        "swarm/stacks/control-plane.yml",
        "swarm/stacks/observability.yml",
    ];
    for rel in stacks {
        let raw = fs::read_to_string(root.join(rel)).unwrap();
        let _: serde_yaml::Value = serde_yaml::from_str(&raw).unwrap();
    }
}
|
||||
|
||||
// The control-plane stack must define the api and ui services.
#[test]
fn control_plane_stack_has_required_services() {
    let raw = fs::read_to_string(repo_root().join("swarm/stacks/control-plane.yml")).unwrap();
    let v: serde_yaml::Value = serde_yaml::from_str(&raw).unwrap();

    let services = v
        .get("services")
        .and_then(|x| x.as_mapping())
        .expect("missing services");

    let required_services = ["control-api", "control-ui"];
    for required in required_services {
        assert!(
            services.contains_key(serde_yaml::Value::String(required.to_string())),
            "missing service {required}"
        );
    }
}
|
||||
Reference in New Issue
Block a user