transport: complete M0–M7
Some checks failed
ci / rust (push) Failing after 2m21s
ci / ui (push) Failing after 28s
images / build-and-push (push) Failing after 18s

shared: add stream+consumer policy helpers; NATS context header builder

aggregate/runner/projection: centralize stream validation and header usage; set bounded consumer params

projection: add QueryService gRPC and wire into main; settings include PROJECTION_GRPC_ADDR

gateway: gRPC routing to Projection/Runner with deadlines; bounded read-only retries; pooled gRPC channels (bounded LRU+TTL); admin proxy forwards to gRPC; probes use concurrency limiter + TTL cache

runner: add RunnerAdmin gRPC server (drain, status, reload) and wire into main; settings include RUNNER_GRPC_ADDR

tests: add gateway authz for runner admin, projection tenant isolation, runner admin drain semantics

docs: update TRANSPORT_DEVELOPMENT_PLAN to reflect completed milestones and details
This commit is contained in:
2026-03-30 14:24:14 +03:00
parent 1ab112438b
commit 90c307016d
41 changed files with 2391 additions and 505 deletions

View File

@@ -1,12 +1,25 @@
/// Build script: generates tonic gRPC server/client code for the three
/// upstream services the gateway talks to (aggregate, projection, runner).
///
/// As rendered, this block interleaved the pre-refactor single-proto
/// `compile_protos` call and rerun-if-changed line with the post-refactor
/// multi-proto versions, which is not valid Rust; this is the resolved
/// post-image.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Use the vendored protoc so builds don't depend on a system install.
    let protoc = protoc_bin_vendored::protoc_bin_path()?;
    std::env::set_var("PROTOC", protoc);
    tonic_build::configure()
        .build_server(true)
        .build_client(true)
        .compile_protos(
            &[
                "../aggregate/proto/aggregate.proto",
                "../projection/proto/query.proto",
                "../runner/proto/admin.proto",
            ],
            &[
                "../aggregate/proto",
                "../projection/proto",
                "../runner/proto",
            ],
        )?;
    // Re-run codegen whenever any of the proto definitions change.
    println!("cargo:rerun-if-changed=../aggregate/proto/aggregate.proto");
    println!("cargo:rerun-if-changed=../projection/proto/query.proto");
    println!("cargo:rerun-if-changed=../runner/proto/admin.proto");
    Ok(())
}

View File

@@ -1422,14 +1422,38 @@ mod tests {
#[tokio::test]
async fn tenant_admin_can_create_service_account_and_service_can_query() {
let projection_app = axum::Router::new().route(
"/v1/query/TestView",
axum::routing::post(|| async { (StatusCode::OK, r#"{"ok":true}"#) }),
);
use crate::grpc::projection_proto::query_service_server::QueryService;
#[derive(Default)]
struct Upstream;
#[tonic::async_trait]
impl QueryService for Upstream {
async fn execute_query(
&self,
_request: tonic::Request<crate::grpc::projection_proto::QueryRequest>,
) -> Result<tonic::Response<crate::grpc::projection_proto::QueryResponse>, tonic::Status>
{
Ok(tonic::Response::new(
crate::grpc::projection_proto::QueryResponse {
json: r#"{"ok":true}"#.to_string(),
},
))
}
}
let projection_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
let projection_addr = projection_listener.local_addr().unwrap();
drop(projection_listener);
let projection_url = format!("http://{}", projection_addr);
tokio::spawn(async move {
axum::serve(projection_listener, projection_app)
tonic::transport::Server::builder()
.add_service(
crate::grpc::projection_proto::query_service_server::QueryServiceServer::new(
Upstream,
),
)
.serve(projection_addr)
.await
.unwrap();
});
@@ -1446,7 +1470,7 @@ mod tests {
aggregate_shards: std::collections::HashMap::new(),
projection_shards: std::collections::HashMap::from([(
"p".to_string(),
vec![format!("http://{}", projection_addr)],
vec![projection_url],
)]),
runner_shards: std::collections::HashMap::new(),
};

View File

@@ -144,6 +144,7 @@ async fn status(
async fn gates(
State(state): State<AppState>,
ctx: crate::RequestContext,
principal: Principal,
Query(q): Query<TenantQuery>,
) -> Result<Json<GatesResponse>, AuthzRejection> {
@@ -165,24 +166,33 @@ async fn gates(
.await
.ok();
let projection_ready = if let Some(ep) = projection_endpoint {
projection_gate_ready(&ep, &q.tenant_id)
.await
.unwrap_or(false)
} else {
false
let projection_fut = async {
if let Some(ep) = projection_endpoint {
projection_gate_ready(&ep, &q.tenant_id, &ctx)
.await
.unwrap_or(false)
} else {
false
}
};
let runner_ready = if let Some(ep) = runner_endpoint {
http_ready(&ep).await.unwrap_or(false)
} else {
false
let runner_fut = async {
if let Some(ep) = runner_endpoint {
http_ready(&ep, &ctx).await.unwrap_or(false)
} else {
false
}
};
let aggregate_ready = if let Some(ep) = aggregate_endpoint {
aggregate_ready(&ep).await.unwrap_or(false)
} else {
false
let aggregate_fut = async {
if let Some(ep) = aggregate_endpoint {
aggregate_ready(&ep, &ctx).await.unwrap_or(false)
} else {
false
}
};
let (projection_ready, runner_ready, aggregate_ready) =
tokio::join!(projection_fut, runner_fut, aggregate_fut);
Ok(Json(GatesResponse {
tenant_id: q.tenant_id,
aggregate_ready,
@@ -191,35 +201,49 @@ async fn gates(
}))
}
async fn http_ready(endpoint: &str) -> Result<bool, AuthzRejection> {
async fn http_ready(endpoint: &str, ctx: &crate::RequestContext) -> Result<bool, AuthzRejection> {
let url = format!("{}/ready", endpoint.trim_end_matches('/'));
let client = crate::upstream::http_client();
let resp = tokio::time::timeout(Duration::from_secs(2), client.get(url).send())
.await
.map_err(|_| AuthzRejection::Internal)?
.map_err(|_| AuthzRejection::Internal)?;
Ok(resp.status().is_success())
crate::upstream::probe_status_ok(
&url,
&[
(shared::HEADER_X_CORRELATION_ID, ctx.correlation_id.as_str()),
(shared::HEADER_TRACEPARENT, ctx.traceparent.as_str()),
],
Duration::from_secs(2),
Duration::from_millis(500),
)
.await
.map_err(|_| AuthzRejection::Internal)
}
async fn aggregate_ready(endpoint: &str) -> Result<bool, AuthzRejection> {
async fn aggregate_ready(
endpoint: &str,
ctx: &crate::RequestContext,
) -> Result<bool, AuthzRejection> {
if endpoint.contains(":50051") {
let http_ep = endpoint.replace(":50051", ":8080");
return http_ready(&http_ep).await;
return http_ready(&http_ep, ctx).await;
}
http_ready(endpoint).await
http_ready(endpoint, ctx).await
}
async fn projection_gate_ready(endpoint: &str, tenant_id: &str) -> Result<bool, AuthzRejection> {
async fn projection_gate_ready(
endpoint: &str,
tenant_id: &str,
ctx: &crate::RequestContext,
) -> Result<bool, AuthzRejection> {
let url = format!("{}/metrics", endpoint.trim_end_matches('/'));
let client = crate::upstream::http_client();
let resp = tokio::time::timeout(Duration::from_secs(2), client.get(url).send())
.await
.map_err(|_| AuthzRejection::Internal)?
.map_err(|_| AuthzRejection::Internal)?;
if !resp.status().is_success() {
return Ok(false);
}
let text = resp.text().await.map_err(|_| AuthzRejection::Internal)?;
let text = crate::upstream::probe_text(
&url,
&[
(shared::HEADER_X_CORRELATION_ID, ctx.correlation_id.as_str()),
(shared::HEADER_TRACEPARENT, ctx.traceparent.as_str()),
],
Duration::from_secs(2),
Duration::from_millis(250),
)
.await
.map_err(|_| AuthzRejection::Internal)?;
let ready = parse_prom_gauge(&text, "projection_ready").unwrap_or(0.0) >= 1.0;
if !ready {

View File

@@ -81,7 +81,7 @@ where
async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
let raw = parts
.headers
.get("x-tenant-id")
.get(shared::HEADER_X_TENANT_ID)
.and_then(|v| v.to_str().ok())
.ok_or(AuthzRejection::MissingTenant)?;
@@ -239,33 +239,45 @@ async fn query_stub(
)
.await?;
let upstream = state
.routing
.resolve(&tenant_id, crate::routing::ServiceKind::Projection)
let uqf = payload
.get("uqf")
.and_then(|v| v.as_str())
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
.unwrap_or_else(|| payload.to_string());
if uqf.trim().is_empty() {
return Err(AuthzRejection::BadRequest);
}
let request = crate::grpc::projection_proto::QueryRequest {
tenant_id: tenant_id.clone(),
view_type,
uqf,
};
let resp = crate::grpc::execute_query_via_routing(&state.routing, request, &ctx)
.await
.map_err(|_| AuthzRejection::Internal)?;
tracing::Span::current().record("upstream", upstream.as_str());
.map_err(map_query_error)?;
let url = format!("{}/v1/query/{}", upstream.trim_end_matches('/'), view_type);
let client = crate::upstream::http_client();
let resp = client
.post(url)
.header("x-tenant-id", tenant_id)
.header("x-correlation-id", ctx.correlation_id)
.header("traceparent", ctx.traceparent)
.json(&payload)
.send()
.await
.map_err(|_| AuthzRejection::Internal)?;
let status = StatusCode::from_u16(resp.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
let bytes = resp.bytes().await.map_err(|_| AuthzRejection::Internal)?;
let mut out = Response::new(axum::body::Body::from(bytes));
*out.status_mut() = status;
let mut out = Response::new(axum::body::Body::from(resp.json));
out.headers_mut().insert(
header::CONTENT_TYPE,
axum::http::HeaderValue::from_static("application/json"),
);
Ok(out)
}
fn map_query_error(status: tonic::Status) -> AuthzRejection {
match status.code() {
tonic::Code::InvalidArgument => AuthzRejection::BadRequest,
tonic::Code::NotFound => AuthzRejection::NotFound,
tonic::Code::PermissionDenied => AuthzRejection::Forbidden,
tonic::Code::Unauthenticated => AuthzRejection::Unauthorized,
tonic::Code::Unavailable => AuthzRejection::Internal,
_ => AuthzRejection::Internal,
}
}
pub async fn runner_admin_proxy(
State(state): State<AppState>,
ctx: crate::RequestContext,
@@ -282,51 +294,73 @@ pub async fn runner_admin_proxy(
)
.await?;
let upstream = state
.routing
.resolve(&tenant_id, crate::routing::ServiceKind::Runner)
.await
.map_err(|_| AuthzRejection::Internal)?;
tracing::Span::current().record("upstream", upstream.as_str());
let path = path.trim_start_matches('/').to_string();
match (request.method().as_str(), path.as_str()) {
("POST", "drain") => {
let wait_ms = request.uri().query().and_then(|q| {
q.split('&').find_map(|pair| {
let (k, v) = pair.split_once('=')?;
if k == "wait_ms" {
v.parse::<u64>().ok()
} else {
None
}
})
});
let mut url = format!(
"{}/admin/{}",
upstream.trim_end_matches('/'),
path.trim_start_matches('/')
);
if let Some(q) = request.uri().query() {
url.push('?');
url.push_str(q);
}
let method = request.method().clone();
let headers = request.headers().clone();
let body = axum::body::to_bytes(request.into_body(), usize::MAX)
.await
.map_err(|_| AuthzRejection::Internal)?;
let client = crate::upstream::http_client();
let mut req = client
.request(method, url)
.header("x-tenant-id", tenant_id)
.header("x-correlation-id", ctx.correlation_id)
.header("traceparent", ctx.traceparent)
.body(body);
for (k, v) in headers.iter() {
if k == header::HOST {
continue;
let resp = crate::grpc::runner_admin_drain_via_routing(
&state.routing,
&tenant_id,
wait_ms,
&ctx,
)
.await
.map_err(map_query_error)?;
let status =
StatusCode::from_u16(resp.http_status as u16).unwrap_or(StatusCode::BAD_GATEWAY);
let mut out = Response::new(axum::body::Body::from(resp.json));
*out.status_mut() = status;
out.headers_mut().insert(
header::CONTENT_TYPE,
axum::http::HeaderValue::from_static("application/json"),
);
Ok(out)
}
req = req.header(k, v);
("GET", "drain/status") => {
let resp = crate::grpc::runner_admin_drain_status_via_routing(
&state.routing,
&tenant_id,
&ctx,
)
.await
.map_err(map_query_error)?;
let status =
StatusCode::from_u16(resp.http_status as u16).unwrap_or(StatusCode::BAD_GATEWAY);
let mut out = Response::new(axum::body::Body::from(resp.json));
*out.status_mut() = status;
out.headers_mut().insert(
header::CONTENT_TYPE,
axum::http::HeaderValue::from_static("application/json"),
);
Ok(out)
}
("POST", "reload") => {
let resp =
crate::grpc::runner_admin_reload_via_routing(&state.routing, &tenant_id, &ctx)
.await
.map_err(map_query_error)?;
let status =
StatusCode::from_u16(resp.http_status as u16).unwrap_or(StatusCode::BAD_GATEWAY);
let mut out = Response::new(axum::body::Body::from(resp.json));
*out.status_mut() = status;
out.headers_mut().insert(
header::CONTENT_TYPE,
axum::http::HeaderValue::from_static("application/json"),
);
Ok(out)
}
_ => Err(AuthzRejection::NotFound),
}
let resp = req.send().await.map_err(|_| AuthzRejection::Internal)?;
let status = StatusCode::from_u16(resp.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
let bytes = resp.bytes().await.map_err(|_| AuthzRejection::Internal)?;
let mut out = Response::new(axum::body::Body::from(bytes));
*out.status_mut() = status;
Ok(out)
}
pub async fn ensure_allowed(
@@ -739,34 +773,60 @@ mod tests {
use crate::routing::RoutingConfig;
use std::collections::HashMap;
let projection_app = axum::Router::new().route(
"/v1/query/TestView",
post(|headers: axum::http::HeaderMap| async move {
let correlation = headers
.get("x-correlation-id")
use crate::grpc::projection_proto::query_service_server::QueryService;
#[derive(Default)]
struct Upstream;
#[async_trait::async_trait]
impl QueryService for Upstream {
async fn execute_query(
&self,
request: tonic::Request<crate::grpc::projection_proto::QueryRequest>,
) -> Result<tonic::Response<crate::grpc::projection_proto::QueryResponse>, tonic::Status>
{
let correlation = request
.metadata()
.get(shared::HEADER_X_CORRELATION_ID)
.and_then(|v| v.to_str().ok())
.unwrap_or("");
let traceparent = headers
.get("traceparent")
let traceparent = request
.metadata()
.get(shared::HEADER_TRACEPARENT)
.and_then(|v| v.to_str().ok())
.unwrap_or("");
if correlation.trim().is_empty()
|| crate::trace_id_from_traceparent(traceparent).is_none()
|| shared::trace_id_from_traceparent(traceparent).is_none()
{
return (StatusCode::BAD_REQUEST, "missing correlation");
return Err(tonic::Status::failed_precondition(
"missing correlation metadata",
));
}
(StatusCode::OK, r#"{"mode":"count"}"#)
}),
);
Ok(tonic::Response::new(
crate::grpc::projection_proto::QueryResponse {
json: r#"{"mode":"count"}"#.to_string(),
},
))
}
}
let projection_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
let projection_addr = projection_listener.local_addr().unwrap();
drop(projection_listener);
let projection_url = format!("http://{}", projection_addr);
tokio::spawn(async move {
axum::serve(projection_listener, projection_app)
tonic::transport::Server::builder()
.add_service(
crate::grpc::projection_proto::query_service_server::QueryServiceServer::new(
Upstream,
),
)
.serve(projection_addr)
.await
.unwrap();
});
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
let projection_url = format!("http://{}", projection_addr);
let cfg = RoutingConfig {
revision: 1,
@@ -788,7 +848,7 @@ mod tests {
.method("POST")
.uri("/v1/query/TestView")
.header("authorization", format!("Bearer {token}"))
.header("x-tenant-id", "tenant-a")
.header(shared::HEADER_X_TENANT_ID, "tenant-a")
.header("content-type", "application/json")
.body(axum::body::Body::from(r#"{"uqf":"{}"}"#))
.unwrap(),
@@ -814,7 +874,7 @@ mod tests {
.method("POST")
.uri("/v1/query/TestView")
.header("authorization", format!("Bearer {token}"))
.header("x-tenant-id", "tenant-a")
.header(shared::HEADER_X_TENANT_ID, "tenant-a")
.header("content-type", "application/json")
.body(axum::body::Body::from(r#"{"uqf":"{}"}"#))
.unwrap(),
@@ -824,16 +884,175 @@ mod tests {
assert_eq!(ok.status(), StatusCode::OK);
assert!(!ok
.headers()
.get("x-correlation-id")
.get(shared::HEADER_X_CORRELATION_ID)
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.is_empty());
assert!(crate::trace_id_from_traceparent(
assert!(shared::trace_id_from_traceparent(
ok.headers()
.get("traceparent")
.get(shared::HEADER_TRACEPARENT)
.and_then(|v| v.to_str().ok())
.unwrap_or("")
)
.is_some());
}
#[tokio::test]
async fn runner_admin_proxy_denies_unauthorized_and_forwards_when_authorized() {
    use crate::grpc::runner_admin_proto::runner_admin_server::RunnerAdmin;
    use std::collections::HashMap;
    // Stub RunnerAdmin upstream returning canned JSON + HTTP status codes,
    // so the test exercises only the gateway's authz + gRPC forwarding.
    #[derive(Default)]
    struct Upstream;
    #[tonic::async_trait]
    impl RunnerAdmin for Upstream {
        async fn drain(
            &self,
            _request: tonic::Request<crate::grpc::runner_admin_proto::DrainRequest>,
        ) -> Result<
            tonic::Response<crate::grpc::runner_admin_proto::AdminResponse>,
            tonic::Status,
        > {
            Ok(tonic::Response::new(
                crate::grpc::runner_admin_proto::AdminResponse {
                    http_status: 200,
                    json: r#"{"ok":true}"#.to_string(),
                },
            ))
        }
        async fn drain_status(
            &self,
            _request: tonic::Request<crate::grpc::runner_admin_proto::DrainStatusRequest>,
        ) -> Result<
            tonic::Response<crate::grpc::runner_admin_proto::AdminResponse>,
            tonic::Status,
        > {
            // 202: drain initiated but not yet complete.
            Ok(tonic::Response::new(
                crate::grpc::runner_admin_proto::AdminResponse {
                    http_status: 202,
                    json: r#"{"ok":true,"drained":false}"#.to_string(),
                },
            ))
        }
        async fn reload(
            &self,
            _request: tonic::Request<crate::grpc::runner_admin_proto::ReloadRequest>,
        ) -> Result<
            tonic::Response<crate::grpc::runner_admin_proto::AdminResponse>,
            tonic::Status,
        > {
            Ok(tonic::Response::new(
                crate::grpc::runner_admin_proto::AdminResponse {
                    http_status: 200,
                    json: r#"{"ok":true}"#.to_string(),
                },
            ))
        }
    }
    // Bind to an ephemeral port to learn a free address, then drop the
    // listener so tonic can rebind it below.
    // NOTE(review): dropping the listener before the server rebinds leaves a
    // small window where another process could grab the port — consider
    // serving from the already-bound listener instead.
    let runner_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
    let runner_addr = runner_listener.local_addr().unwrap();
    drop(runner_listener);
    let runner_url = format!("http://{}", runner_addr);
    tokio::spawn(async move {
        tonic::transport::Server::builder()
            .add_service(
                crate::grpc::runner_admin_proto::runner_admin_server::RunnerAdminServer::new(
                    Upstream,
                ),
            )
            .serve(runner_addr)
            .await
            .unwrap();
    });
    // Give the spawned server a moment to start accepting connections.
    tokio::time::sleep(std::time::Duration::from_millis(50)).await;
    // Route tenant-a's runner traffic to shard "r" backed by the stub above.
    let cfg = crate::routing::RoutingConfig {
        revision: 1,
        aggregate_placement: HashMap::new(),
        projection_placement: HashMap::new(),
        runner_placement: HashMap::from([("tenant-a".to_string(), "r".to_string())]),
        aggregate_shards: HashMap::new(),
        projection_shards: HashMap::new(),
        runner_shards: HashMap::from([("r".to_string(), vec![runner_url])]),
    };
    let (app, state) = test_app_with_routing(cfg).await;
    let (token, claims) = signup_and_get_claims(&app, &state.authn).await;
    // Without the runner.admin permission the proxy must reject the call.
    let forbidden = app
        .clone()
        .oneshot(
            axum::http::Request::builder()
                .method("POST")
                .uri("/admin/runner/drain")
                .header("authorization", format!("Bearer {token}"))
                .header("x-tenant-id", "tenant-a")
                .body(axum::body::Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(forbidden.status(), StatusCode::FORBIDDEN);
    // Grant runner.admin to the caller's subject for tenant-a.
    put_role(
        &state.storage,
        "role-runner-admin",
        vec!["runner.admin".to_string()],
    )
    .await
    .unwrap();
    assign_role(&state.storage, "tenant-a", &claims.sub, "role-runner-admin")
        .await
        .unwrap();
    // Authorized drain is forwarded to the stub and its 200 is passed through.
    let ok = app
        .oneshot(
            axum::http::Request::builder()
                .method("POST")
                .uri("/admin/runner/drain?wait_ms=0")
                .header("authorization", format!("Bearer {token}"))
                .header("x-tenant-id", "tenant-a")
                .body(axum::body::Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(ok.status(), StatusCode::OK);
}
#[tokio::test]
async fn runner_admin_proxy_rejects_tenant_spoofing() {
    // Empty routing config: no tenant is placed anywhere, so any success
    // here would have to come from the authz layer alone.
    let cfg = crate::routing::RoutingConfig::empty();
    let (app, state) = test_app_with_routing(cfg).await;
    let (token, claims) = signup_and_get_claims(&app, &state.authn).await;
    // Grant runner.admin, but only within tenant-a.
    put_role(
        &state.storage,
        "role-runner-admin",
        vec!["runner.admin".to_string()],
    )
    .await
    .unwrap();
    assign_role(&state.storage, "tenant-a", &claims.sub, "role-runner-admin")
        .await
        .unwrap();
    // Presenting tenant-b in the header while only authorized for tenant-a
    // must be rejected before any upstream call is attempted.
    let forbidden = app
        .oneshot(
            axum::http::Request::builder()
                .method("POST")
                .uri("/admin/runner/reload")
                .header("authorization", format!("Bearer {token}"))
                .header("x-tenant-id", "tenant-b")
                .body(axum::body::Body::empty())
                .unwrap(),
        )
        .await
        .unwrap();
    assert_eq!(forbidden.status(), StatusCode::FORBIDDEN);
}
}

View File

@@ -1,11 +1,47 @@
use crate::routing::RouterState;
use crate::routing::RoutingError;
use crate::routing::ServiceKind;
use std::future::Future;
// Generated gRPC bindings for the aggregate command service (codegen in build.rs).
pub mod proto {
    tonic::include_proto!("aggregate.gateway.v1");
}
// Generated gRPC bindings for the projection query service.
pub mod projection_proto {
    tonic::include_proto!("projection.gateway.v1");
}
// Generated gRPC bindings for the runner admin service.
pub mod runner_admin_proto {
    tonic::include_proto!("runner.admin.v1");
}
/// Runs a read-only gRPC call up to three times, retrying only transient
/// failures (`Unavailable`, `DeadlineExceeded`) with exponential backoff
/// (25ms, then 50ms). Non-transient errors and the final transient error
/// are returned as-is. Only safe for idempotent, read-only operations.
async fn retry_read_only<T, F, Fut>(mut f: F) -> Result<T, tonic::Status>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, tonic::Status>>,
{
    const MAX_ATTEMPTS: u32 = 3;
    let mut attempts_made: u32 = 0;
    loop {
        let status = match f().await {
            Ok(value) => return Ok(value),
            Err(status) => status,
        };
        let transient = matches!(
            status.code(),
            tonic::Code::Unavailable | tonic::Code::DeadlineExceeded
        );
        attempts_made += 1;
        if !transient || attempts_made >= MAX_ATTEMPTS {
            return Err(status);
        }
        // Backoff doubles per attempt: 25ms after the first failure, 50ms
        // after the second.
        let backoff_ms = 25_u64.saturating_mul(2_u64.pow(attempts_made - 1));
        tokio::time::sleep(std::time::Duration::from_millis(backoff_ms)).await;
    }
}
#[derive(Clone)]
pub struct GatewayCommandService {
routing: RouterState,
@@ -23,33 +59,20 @@ impl proto::command_service_server::CommandService for GatewayCommandService {
&self,
request: tonic::Request<proto::SubmitCommandRequest>,
) -> Result<tonic::Response<proto::SubmitCommandResponse>, tonic::Status> {
let correlation_id = request
.metadata()
.get("x-correlation-id")
.and_then(|v| v.to_str().ok())
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.map(|s| s.to_string())
.unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
let correlation_id = shared::normalize_correlation_id(
request
.metadata()
.get(shared::HEADER_X_CORRELATION_ID)
.and_then(|v| v.to_str().ok()),
)
.to_string();
let traceparent = request
.metadata()
.get("traceparent")
.and_then(|v| v.to_str().ok())
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.and_then(|s| {
if crate::trace_id_from_traceparent(s).is_some() {
Some(s.to_string())
} else {
None
}
})
.unwrap_or_else(|| {
let trace_id = uuid::Uuid::new_v4().simple().to_string();
let span_id = uuid::Uuid::new_v4().simple().to_string()[..16].to_string();
format!("00-{trace_id}-{span_id}-01")
});
let traceparent = shared::normalize_traceparent(
request
.metadata()
.get(shared::HEADER_TRACEPARENT)
.and_then(|v| v.to_str().ok()),
);
let mut req = request.into_inner();
@@ -66,30 +89,35 @@ impl proto::command_service_server::CommandService for GatewayCommandService {
.map_err(map_routing_error)?;
tracing::Span::current().record("upstream", upstream.as_str());
let channel = crate::upstream::grpc_endpoint(&upstream)
.map_err(|e| tonic::Status::unavailable(e.to_string()))?
.connect()
.await
let channel = crate::upstream::grpc_channel(&upstream)
.map_err(|e| tonic::Status::unavailable(e.to_string()))?;
let mut client = proto::command_service_client::CommandServiceClient::new(channel);
let mut upstream_req = tonic::Request::new(req);
upstream_req.set_timeout(std::time::Duration::from_secs(5));
if let Ok(v) = tonic::metadata::MetadataValue::try_from(tenant_id.as_str()) {
upstream_req.metadata_mut().insert("x-tenant-id", v);
upstream_req
.metadata_mut()
.insert(shared::HEADER_X_TENANT_ID, v);
}
if let Ok(v) = tonic::metadata::MetadataValue::try_from(correlation_id.as_str()) {
upstream_req.metadata_mut().insert("x-correlation-id", v);
upstream_req
.metadata_mut()
.insert(shared::HEADER_X_CORRELATION_ID, v);
}
if let Ok(v) = tonic::metadata::MetadataValue::try_from(traceparent.as_str()) {
upstream_req.metadata_mut().insert("traceparent", v);
upstream_req
.metadata_mut()
.insert(shared::HEADER_TRACEPARENT, v);
}
let mut resp = client.submit_command(upstream_req).await?;
if let Ok(v) = tonic::metadata::MetadataValue::try_from(correlation_id.as_str()) {
resp.metadata_mut().insert("x-correlation-id", v);
resp.metadata_mut()
.insert(shared::HEADER_X_CORRELATION_ID, v);
}
if let Ok(v) = tonic::metadata::MetadataValue::try_from(traceparent.as_str()) {
resp.metadata_mut().insert("traceparent", v);
resp.metadata_mut().insert(shared::HEADER_TRACEPARENT, v);
}
Ok(resp)
}
@@ -111,28 +139,176 @@ pub async fn submit_command_via_routing(
.map_err(map_routing_error)?;
tracing::Span::current().record("upstream", upstream.as_str());
let channel = crate::upstream::grpc_endpoint(&upstream)
.map_err(|e| tonic::Status::unavailable(e.to_string()))?
.connect()
.await
let channel = crate::upstream::grpc_channel(&upstream)
.map_err(|e| tonic::Status::unavailable(e.to_string()))?;
let mut client = proto::command_service_client::CommandServiceClient::new(channel);
let mut upstream_req = tonic::Request::new(request);
upstream_req.set_timeout(std::time::Duration::from_secs(5));
if let Ok(v) = tonic::metadata::MetadataValue::try_from(tenant_id.as_str()) {
upstream_req.metadata_mut().insert("x-tenant-id", v);
upstream_req
.metadata_mut()
.insert(shared::HEADER_X_TENANT_ID, v);
}
if let Ok(v) = tonic::metadata::MetadataValue::try_from(ctx.correlation_id.as_str()) {
upstream_req.metadata_mut().insert("x-correlation-id", v);
upstream_req
.metadata_mut()
.insert(shared::HEADER_X_CORRELATION_ID, v);
}
if let Ok(v) = tonic::metadata::MetadataValue::try_from(ctx.traceparent.as_str()) {
upstream_req.metadata_mut().insert("traceparent", v);
upstream_req
.metadata_mut()
.insert(shared::HEADER_TRACEPARENT, v);
}
let resp = client.submit_command(upstream_req).await?;
Ok(resp.into_inner())
}
/// Routes a projection query to the tenant's projection shard over gRPC.
///
/// Resolves the upstream endpoint from `routing`, reuses a pooled channel,
/// and propagates tenant / correlation / trace metadata from `ctx`. The call
/// is read-only, so transient failures are retried via `retry_read_only`,
/// building a fresh request per attempt.
pub async fn execute_query_via_routing(
    routing: &RouterState,
    request: projection_proto::QueryRequest,
    ctx: &crate::RequestContext,
) -> Result<projection_proto::QueryResponse, tonic::Status> {
    // Tenant id comes from the request payload; reject blanks before routing.
    let tenant_id = request.tenant_id.trim().to_string();
    if tenant_id.is_empty() {
        return Err(tonic::Status::invalid_argument("tenant_id is required"));
    }
    let upstream = routing
        .resolve(&tenant_id, ServiceKind::Projection)
        .await
        .map_err(map_routing_error)?;
    // Record the resolved endpoint on the current span for tracing.
    tracing::Span::current().record("upstream", upstream.as_str());
    // Pooled channel instead of a fresh connect per call.
    let channel = crate::upstream::grpc_channel(&upstream)
        .map_err(|e| tonic::Status::unavailable(e.to_string()))?;
    retry_read_only(|| {
        // tonic requests are single-use, so each attempt gets its own
        // client clone, request clone, deadline, and metadata.
        let mut client =
            projection_proto::query_service_client::QueryServiceClient::new(channel.clone());
        let mut upstream_req = tonic::Request::new(request.clone());
        // Per-attempt deadline; keeps total latency bounded across retries.
        upstream_req.set_timeout(std::time::Duration::from_secs(2));
        // Invalid metadata values are silently skipped rather than failing
        // the call.
        if let Ok(v) = tonic::metadata::MetadataValue::try_from(tenant_id.as_str()) {
            upstream_req
                .metadata_mut()
                .insert(shared::HEADER_X_TENANT_ID, v);
        }
        if let Ok(v) = tonic::metadata::MetadataValue::try_from(ctx.correlation_id.as_str()) {
            upstream_req
                .metadata_mut()
                .insert(shared::HEADER_X_CORRELATION_ID, v);
        }
        if let Ok(v) = tonic::metadata::MetadataValue::try_from(ctx.traceparent.as_str()) {
            upstream_req
                .metadata_mut()
                .insert(shared::HEADER_TRACEPARENT, v);
        }
        async move { Ok(client.execute_query(upstream_req).await?.into_inner()) }
    })
    .await
}
/// Sends a drain request to the tenant's runner shard over gRPC.
///
/// `wait_ms` bounds how long the runner waits for in-flight work before
/// responding; `None` defaults to 0 (initiate and return immediately).
/// Unlike the read-only status call, drain is issued exactly once — no
/// retry wrapper.
pub async fn runner_admin_drain_via_routing(
    routing: &RouterState,
    tenant_id: &str,
    wait_ms: Option<u64>,
    ctx: &crate::RequestContext,
) -> Result<runner_admin_proto::AdminResponse, tonic::Status> {
    let upstream = routing
        .resolve(tenant_id, ServiceKind::Runner)
        .await
        .map_err(map_routing_error)?;
    // Record the resolved endpoint on the current span for tracing.
    tracing::Span::current().record("upstream", upstream.as_str());
    // Pooled channel instead of a fresh connect per call.
    let channel = crate::upstream::grpc_channel(&upstream)
        .map_err(|e| tonic::Status::unavailable(e.to_string()))?;
    let mut client = runner_admin_proto::runner_admin_client::RunnerAdminClient::new(channel);
    let mut req = tonic::Request::new(runner_admin_proto::DrainRequest {
        tenant_id: tenant_id.to_string(),
        wait_ms: wait_ms.unwrap_or(0),
    });
    // Longer deadline than the read paths, since drain may wait for work.
    req.set_timeout(std::time::Duration::from_secs(5));
    // Propagate tenant/correlation/trace metadata; invalid values are
    // silently skipped rather than failing the call.
    if let Ok(v) = tonic::metadata::MetadataValue::try_from(tenant_id) {
        req.metadata_mut().insert(shared::HEADER_X_TENANT_ID, v);
    }
    if let Ok(v) = tonic::metadata::MetadataValue::try_from(ctx.correlation_id.as_str()) {
        req.metadata_mut()
            .insert(shared::HEADER_X_CORRELATION_ID, v);
    }
    if let Ok(v) = tonic::metadata::MetadataValue::try_from(ctx.traceparent.as_str()) {
        req.metadata_mut().insert(shared::HEADER_TRACEPARENT, v);
    }
    Ok(client.drain(req).await?.into_inner())
}
/// Fetches drain status from the tenant's runner shard over gRPC.
///
/// Read-only, so transient failures are retried via `retry_read_only`;
/// a fresh request (with a 2s per-attempt deadline and propagated
/// tenant/correlation/trace metadata) is built for each attempt.
pub async fn runner_admin_drain_status_via_routing(
    routing: &RouterState,
    tenant_id: &str,
    ctx: &crate::RequestContext,
) -> Result<runner_admin_proto::AdminResponse, tonic::Status> {
    let upstream = routing
        .resolve(tenant_id, ServiceKind::Runner)
        .await
        .map_err(map_routing_error)?;
    // Record the resolved endpoint on the current span for tracing.
    tracing::Span::current().record("upstream", upstream.as_str());
    // Pooled channel instead of a fresh connect per call.
    let channel = crate::upstream::grpc_channel(&upstream)
        .map_err(|e| tonic::Status::unavailable(e.to_string()))?;
    retry_read_only(|| {
        // tonic requests are single-use: rebuild client, request, deadline,
        // and metadata on every attempt.
        let mut client =
            runner_admin_proto::runner_admin_client::RunnerAdminClient::new(channel.clone());
        let mut req = tonic::Request::new(runner_admin_proto::DrainStatusRequest {
            tenant_id: tenant_id.to_string(),
        });
        // Short per-attempt deadline keeps total latency bounded across retries.
        req.set_timeout(std::time::Duration::from_secs(2));
        if let Ok(v) = tonic::metadata::MetadataValue::try_from(tenant_id) {
            req.metadata_mut().insert(shared::HEADER_X_TENANT_ID, v);
        }
        if let Ok(v) = tonic::metadata::MetadataValue::try_from(ctx.correlation_id.as_str()) {
            req.metadata_mut()
                .insert(shared::HEADER_X_CORRELATION_ID, v);
        }
        if let Ok(v) = tonic::metadata::MetadataValue::try_from(ctx.traceparent.as_str()) {
            req.metadata_mut().insert(shared::HEADER_TRACEPARENT, v);
        }
        async move { Ok(client.drain_status(req).await?.into_inner()) }
    })
    .await
}
/// Triggers a reload on the tenant's runner shard over gRPC.
///
/// Single attempt with a 2s deadline — not wrapped in `retry_read_only`,
/// unlike the read-only status call. The `ReloadRequest` message carries no
/// payload; tenant scoping travels in metadata and via `resolve()`.
pub async fn runner_admin_reload_via_routing(
    routing: &RouterState,
    tenant_id: &str,
    ctx: &crate::RequestContext,
) -> Result<runner_admin_proto::AdminResponse, tonic::Status> {
    let upstream = routing
        .resolve(tenant_id, ServiceKind::Runner)
        .await
        .map_err(map_routing_error)?;
    // Record the resolved endpoint on the current span for tracing.
    tracing::Span::current().record("upstream", upstream.as_str());
    // Pooled channel instead of a fresh connect per call.
    let channel = crate::upstream::grpc_channel(&upstream)
        .map_err(|e| tonic::Status::unavailable(e.to_string()))?;
    let mut client = runner_admin_proto::runner_admin_client::RunnerAdminClient::new(channel);
    let mut req = tonic::Request::new(runner_admin_proto::ReloadRequest {});
    req.set_timeout(std::time::Duration::from_secs(2));
    // Propagate tenant/correlation/trace metadata; invalid values are
    // silently skipped rather than failing the call.
    if let Ok(v) = tonic::metadata::MetadataValue::try_from(tenant_id) {
        req.metadata_mut().insert(shared::HEADER_X_TENANT_ID, v);
    }
    if let Ok(v) = tonic::metadata::MetadataValue::try_from(ctx.correlation_id.as_str()) {
        req.metadata_mut()
            .insert(shared::HEADER_X_CORRELATION_ID, v);
    }
    if let Ok(v) = tonic::metadata::MetadataValue::try_from(ctx.traceparent.as_str()) {
        req.metadata_mut().insert(shared::HEADER_TRACEPARENT, v);
    }
    Ok(client.reload(req).await?.into_inner())
}
fn map_routing_error(err: RoutingError) -> tonic::Status {
match err {
RoutingError::UnknownTenant => tonic::Status::not_found("unknown tenant"),
@@ -187,7 +363,7 @@ mod tests {
.get("traceparent")
.and_then(|v| v.to_str().ok())
.unwrap_or("");
if crate::trace_id_from_traceparent(traceparent).is_none() {
if shared::trace_id_from_traceparent(traceparent).is_none() {
return Err(tonic::Status::failed_precondition("missing traceparent"));
}
@@ -258,9 +434,9 @@ mod tests {
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.is_empty());
assert!(crate::trace_id_from_traceparent(
assert!(shared::trace_id_from_traceparent(
resp.metadata()
.get("traceparent")
.get(shared::HEADER_TRACEPARENT)
.and_then(|v| v.to_str().ok())
.unwrap_or("")
)

View File

@@ -49,23 +49,23 @@ where
async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
let request_id = parts
.headers
.get("x-request-id")
.get(shared::HEADER_X_REQUEST_ID)
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.to_string();
let correlation_id = parts
.headers
.get("x-correlation-id")
.get(shared::HEADER_X_CORRELATION_ID)
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.to_string();
let traceparent = parts
.headers
.get("traceparent")
.get(shared::HEADER_TRACEPARENT)
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.to_string();
let trace_id = trace_id_from_traceparent(&traceparent)
let trace_id = shared::trace_id_from_traceparent(&traceparent)
.map(|s| s.to_string())
.unwrap_or_default();
@@ -92,7 +92,7 @@ struct StatusResponse {
}
pub fn app(state: AppState) -> Router {
let request_id_header = HeaderName::from_static("x-request-id");
let request_id_header = HeaderName::from_static(shared::HEADER_X_REQUEST_ID);
Router::new()
.route("/health", get(health))
@@ -133,20 +133,20 @@ pub fn app(state: AppState) -> Router {
|request: &axum::http::Request<_>| {
let request_id = request
.headers()
.get("x-request-id")
.get(shared::HEADER_X_REQUEST_ID)
.and_then(|v| v.to_str().ok())
.unwrap_or("");
let correlation_id = request
.headers()
.get("x-correlation-id")
.get(shared::HEADER_X_CORRELATION_ID)
.and_then(|v| v.to_str().ok())
.unwrap_or("");
let traceparent = request
.headers()
.get("traceparent")
.get(shared::HEADER_TRACEPARENT)
.and_then(|v| v.to_str().ok())
.unwrap_or("");
let trace_id = trace_id_from_traceparent(traceparent).unwrap_or("");
let trace_id = shared::trace_id_from_traceparent(traceparent).unwrap_or("");
let path = request_path_for_logging(request);
tracing::span!(
@@ -205,48 +205,42 @@ where
}
fn call(&mut self, mut req: axum::http::Request<ReqBody>) -> Self::Future {
let correlation_id = req
.headers()
.get("x-correlation-id")
.and_then(|v| v.to_str().ok())
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.map(|s| s.to_string())
.unwrap_or_else(generate_correlation_id);
let correlation_id = shared::normalize_correlation_id(
req.headers()
.get(shared::HEADER_X_CORRELATION_ID)
.and_then(|v| v.to_str().ok()),
)
.to_string();
let traceparent = req
.headers()
.get("traceparent")
.and_then(|v| v.to_str().ok())
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.and_then(|s| {
if trace_id_from_traceparent(s).is_some() {
Some(s.to_string())
} else {
None
}
})
.unwrap_or_else(generate_traceparent);
let traceparent = shared::normalize_traceparent(
req.headers()
.get(shared::HEADER_TRACEPARENT)
.and_then(|v| v.to_str().ok()),
);
if let Ok(v) = HeaderValue::from_str(&correlation_id) {
req.headers_mut().insert("x-correlation-id", v);
req.headers_mut().insert(shared::HEADER_X_CORRELATION_ID, v);
}
if let Ok(v) = HeaderValue::from_str(&traceparent) {
req.headers_mut().insert("traceparent", v);
req.headers_mut().insert(shared::HEADER_TRACEPARENT, v);
}
let mut inner = self.inner.clone();
Box::pin(async move {
let mut resp = inner.call(req).await?;
if resp.headers().get("x-correlation-id").is_none() {
if resp
.headers()
.get(shared::HEADER_X_CORRELATION_ID)
.is_none()
{
if let Ok(v) = HeaderValue::from_str(&correlation_id) {
resp.headers_mut().insert("x-correlation-id", v);
resp.headers_mut()
.insert(shared::HEADER_X_CORRELATION_ID, v);
}
}
if resp.headers().get("traceparent").is_none() {
if resp.headers().get(shared::HEADER_TRACEPARENT).is_none() {
if let Ok(v) = HeaderValue::from_str(&traceparent) {
resp.headers_mut().insert("traceparent", v);
resp.headers_mut().insert(shared::HEADER_TRACEPARENT, v);
}
}
Ok(resp)
@@ -254,20 +248,6 @@ where
}
}
/// Mint a fresh correlation id: a canonical (hyphenated) UUID v4 string.
fn generate_correlation_id() -> String {
    format!("{}", uuid::Uuid::new_v4())
}
/// Build a W3C `traceparent` value (`00-<trace-id>-<span-id>-01`) from two
/// freshly generated UUIDs: a 32-hex-char trace id and the first 16 hex chars
/// of another UUID as the span id. The trailing `01` flags the trace sampled.
fn generate_traceparent() -> String {
    let trace = uuid::Uuid::new_v4().simple().to_string();
    let span_source = uuid::Uuid::new_v4().simple().to_string();
    let span = &span_source[..16];
    format!("00-{trace}-{span}-01")
}
/// Crate-local shim kept so existing call sites inside this crate keep
/// compiling after the implementation moved to `shared`; simply delegates.
pub(crate) fn trace_id_from_traceparent(traceparent: &str) -> Option<&str> {
    shared::trace_id_from_traceparent(traceparent)
}
async fn track_http_metrics(
req: axum::http::Request<axum::body::Body>,
next: Next,

View File

@@ -1,5 +1,6 @@
use std::sync::OnceLock;
use std::time::Duration;
use std::collections::HashMap;
use std::sync::{Mutex, OnceLock};
use std::time::{Duration, Instant};
pub fn http_client() -> &'static reqwest::Client {
static CLIENT: OnceLock<reqwest::Client> = OnceLock::new();
@@ -47,6 +48,175 @@ pub fn grpc_endpoint(url: &str) -> Result<tonic::transport::Endpoint, tonic::tra
Ok(endpoint)
}
pub fn grpc_channel(url: &str) -> Result<tonic::transport::Channel, tonic::transport::Error> {
const MAX_CHANNELS: usize = 64;
const TTL: Duration = Duration::from_secs(300);
static CACHE: OnceLock<Mutex<HashMap<String, (tonic::transport::Channel, Instant)>>> =
OnceLock::new();
let cache = CACHE.get_or_init(|| Mutex::new(HashMap::new()));
if let Ok(mut guard) = cache.lock() {
if let Some((channel, last_used)) = guard.get_mut(url) {
if last_used.elapsed() < TTL {
*last_used = Instant::now();
return Ok(channel.clone());
}
}
let endpoint = grpc_endpoint(url)?;
let channel = endpoint.connect_lazy();
if guard.len() >= MAX_CHANNELS {
let mut oldest_key = None;
let mut oldest_at = Instant::now();
for (k, (_, last_used)) in guard.iter() {
if oldest_key.is_none() || *last_used < oldest_at {
oldest_key = Some(k.clone());
oldest_at = *last_used;
}
}
if let Some(key) = oldest_key {
guard.remove(&key);
}
}
guard.insert(url.to_string(), (channel.clone(), Instant::now()));
Ok(channel)
} else {
let endpoint = grpc_endpoint(url)?;
Ok(endpoint.connect_lazy())
}
}
/// Probe `url` with a GET and report whether the response status is 2xx.
///
/// Concurrency is bounded by a process-wide semaphore (32 permits), and
/// results are cached per-URL for `cache_ttl` (disabled when zero) in a
/// bounded map. The cache is consulted both before and after acquiring a
/// permit so a probe completed by another task while we queued is reused.
///
/// # Errors
/// Returns the underlying `reqwest` error if the request itself fails;
/// failed sends are not cached.
pub async fn probe_status_ok(
    url: &str,
    headers: &[(&str, &str)],
    timeout: Duration,
    cache_ttl: Duration,
) -> Result<bool, reqwest::Error> {
    const MAX_ENTRIES: usize = 256;
    static SEM: OnceLock<tokio::sync::Semaphore> = OnceLock::new();
    static CACHE: OnceLock<Mutex<HashMap<String, (bool, Instant)>>> = OnceLock::new();

    // Fresh cached value for `url`, refreshing its last-used timestamp.
    fn cached(
        cache: &Mutex<HashMap<String, (bool, Instant)>>,
        url: &str,
        ttl: Duration,
    ) -> Option<bool> {
        if ttl == Duration::ZERO {
            return None;
        }
        let mut guard = cache.lock().ok()?;
        let (value, last_used) = guard.get_mut(url)?;
        if last_used.elapsed() < ttl {
            *last_used = Instant::now();
            Some(*value)
        } else {
            None
        }
    }

    let sem = SEM.get_or_init(|| tokio::sync::Semaphore::new(32));
    let cache = CACHE.get_or_init(|| Mutex::new(HashMap::new()));
    // Check before taking a permit so cache hits never queue behind probes.
    if let Some(value) = cached(cache, url, cache_ttl) {
        return Ok(value);
    }
    let _permit = sem.acquire().await.expect("probe semaphore closed");
    // Re-check after acquiring: another task may have probed while we waited.
    if let Some(value) = cached(cache, url, cache_ttl) {
        return Ok(value);
    }
    let client = http_client();
    let mut req = client.get(url).timeout(timeout);
    for (k, v) in headers {
        req = req.header(*k, *v);
    }
    let ok = req.send().await.map(|r| r.status().is_success())?;
    if cache_ttl > Duration::ZERO {
        if let Ok(mut guard) = cache.lock() {
            // Replacing an existing key does not grow the map; only evict
            // when a brand-new insert would exceed the bound.
            if !guard.contains_key(url) && guard.len() >= MAX_ENTRIES {
                evict_oldest(&mut guard);
            }
            guard.insert(url.to_string(), (ok, Instant::now()));
        }
    }
    Ok(ok)
}
/// Probe `url` with a GET and return the response body as text.
///
/// Concurrency is bounded by a process-wide semaphore (16 permits), and
/// bodies are cached per-URL for `cache_ttl` (disabled when zero) in a
/// bounded map. The cache is consulted both before and after acquiring a
/// permit so a probe completed by another task while we queued is reused.
///
/// # Errors
/// Returns the underlying `reqwest` error if the request or body read
/// fails; failures are not cached.
pub async fn probe_text(
    url: &str,
    headers: &[(&str, &str)],
    timeout: Duration,
    cache_ttl: Duration,
) -> Result<String, reqwest::Error> {
    const MAX_ENTRIES: usize = 128;
    static SEM: OnceLock<tokio::sync::Semaphore> = OnceLock::new();
    static CACHE: OnceLock<Mutex<HashMap<String, (String, Instant)>>> = OnceLock::new();

    // Fresh cached body for `url`, refreshing its last-used timestamp.
    fn cached(
        cache: &Mutex<HashMap<String, (String, Instant)>>,
        url: &str,
        ttl: Duration,
    ) -> Option<String> {
        if ttl == Duration::ZERO {
            return None;
        }
        let mut guard = cache.lock().ok()?;
        let (value, last_used) = guard.get_mut(url)?;
        if last_used.elapsed() < ttl {
            *last_used = Instant::now();
            Some(value.clone())
        } else {
            None
        }
    }

    let sem = SEM.get_or_init(|| tokio::sync::Semaphore::new(16));
    let cache = CACHE.get_or_init(|| Mutex::new(HashMap::new()));
    // Check before taking a permit so cache hits never queue behind probes.
    if let Some(value) = cached(cache, url, cache_ttl) {
        return Ok(value);
    }
    let _permit = sem.acquire().await.expect("probe semaphore closed");
    // Re-check after acquiring: another task may have probed while we waited.
    if let Some(value) = cached(cache, url, cache_ttl) {
        return Ok(value);
    }
    let client = http_client();
    let mut req = client.get(url).timeout(timeout);
    for (k, v) in headers {
        req = req.header(*k, *v);
    }
    let text = req.send().await?.text().await?;
    if cache_ttl > Duration::ZERO {
        if let Ok(mut guard) = cache.lock() {
            // Replacing an existing key does not grow the map; only evict
            // when a brand-new insert would exceed the bound.
            if !guard.contains_key(url) && guard.len() >= MAX_ENTRIES {
                evict_oldest(&mut guard);
            }
            guard.insert(url.to_string(), (text.clone(), Instant::now()));
        }
    }
    Ok(text)
}
/// Remove the entry with the smallest (least recently used) `Instant` from
/// the map. A no-op on an empty map.
fn evict_oldest<T: Clone>(map: &mut HashMap<String, (T, Instant)>) {
    let victim = map
        .iter()
        .min_by_key(|(_, (_, last_used))| *last_used)
        .map(|(key, _)| key.clone());
    if let Some(key) = victim {
        map.remove(&key);
    }
}
fn grpc_tls_config() -> Option<tonic::transport::ClientTlsConfig> {
let mut tls = tonic::transport::ClientTlsConfig::new();
let mut configured = false;