Monorepo consolidation: workspace, shared types, transport plans, docker/swam assets
This commit is contained in:
4
runner/src/config/mod.rs
Normal file
4
runner/src/config/mod.rs
Normal file
@@ -0,0 +1,4 @@
|
||||
mod settings;
|
||||
|
||||
pub use settings::RunnerMode;
|
||||
pub use settings::{Settings, SettingsLoadError};
|
||||
496
runner/src/config/settings.rs
Normal file
496
runner/src/config/settings.rs
Normal file
@@ -0,0 +1,496 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::Path;
|
||||
|
||||
/// Runtime configuration for the runner process.
///
/// Every field has a default (see the `Default` impl), and `#[serde(default)]`
/// lets a YAML/TOML/JSON config file omit any subset of fields.
/// `apply_env_overrides` can then override individual fields from the
/// corresponding `RUNNER_*` environment variables.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct Settings {
    // Core connectivity and storage.
    pub nats_url: String,
    pub storage_path: String,
    pub mode: RunnerMode,

    // Multi-tenancy.
    pub multi_tenant_enabled: bool,
    pub default_tenant_id: Option<String>,
    pub tenant_allowlist: Vec<String>,
    pub tenant_placement_bucket: Option<String>,
    pub shard_id: Option<String>,

    // Optional aggregate gateway endpoint.
    pub aggregate_gateway_url: Option<String>,

    // Stream names.
    pub aggregate_events_stream: String,
    pub workflow_commands_stream: String,
    pub workflow_events_stream: String,

    // Subject filters consumed by the saga/effect workers; validate()
    // requires at least one filter for whichever mode is active.
    pub saga_trigger_subject_filters: Vec<String>,
    pub effect_command_subject_filters: Vec<String>,

    // Consumer tuning.
    pub consumer_durable_prefix: String,
    pub deliver_group: Option<String>,
    pub max_in_flight: usize,
    pub ack_timeout_ms: u64,
    pub max_deliver: i64,

    // Manifest file locations; validate() loads and validates both.
    pub saga_manifest_path: String,
    pub effects_manifest_path: String,

    // Outbox dispatch loop.
    pub outbox_scan_interval_ms: u64,
    pub outbox_batch_size: usize,
    pub outbox_max_in_flight: usize,
    pub outbox_max_in_flight_per_tenant: usize,

    // Schedule scan loop.
    pub schedule_scan_interval_ms: u64,
    pub schedule_batch_size: usize,

    // Effect execution limits.
    pub effect_timeout_ms: u64,
    pub effect_retry_max_attempts: usize,
    pub effect_retry_backoff_ms: u64,

    // HTTP bind address — presumably for health/metrics endpoints; not used
    // within this file, verify against the server setup.
    pub http_addr: String,

    // Crash-injection switches for integration tests; all default to false.
    pub test_saga_crash_after_commit: bool,
    pub test_effect_crash_after_dedupe_before_ack: bool,
    pub test_outbox_crash_after_dispatch: bool,
}
|
||||
|
||||
/// Selects which worker loops this runner process executes.
///
/// Serialized in snake_case (`saga` / `effect` / `combined`); defaults to
/// `Combined`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum RunnerMode {
    /// Saga processing only.
    Saga,
    /// Effect processing only.
    Effect,
    /// Both saga and effect processing in one process.
    #[default]
    Combined,
}
|
||||
|
||||
impl Default for Settings {
    /// Local-development defaults; deployments are expected to override via
    /// config file and/or `RUNNER_*` environment variables.
    fn default() -> Self {
        Self {
            nats_url: "nats://localhost:4222".to_string(),
            storage_path: "./data".to_string(),
            mode: RunnerMode::Combined,

            // Multi-tenant is on by default; an empty allowlist means no
            // tenant restriction is configured here.
            multi_tenant_enabled: true,
            default_tenant_id: None,
            tenant_allowlist: Vec::new(),
            tenant_placement_bucket: None,
            shard_id: None,

            aggregate_gateway_url: None,

            aggregate_events_stream: "AGGREGATE_EVENTS".to_string(),
            workflow_commands_stream: "WORKFLOW_COMMANDS".to_string(),
            workflow_events_stream: "WORKFLOW_EVENTS".to_string(),

            // Wildcard filters: match every aggregate/effect subject for
            // every tenant.
            saga_trigger_subject_filters: vec!["tenant.*.aggregate.*.*".to_string()],
            effect_command_subject_filters: vec!["tenant.*.effect.*.*".to_string()],

            consumer_durable_prefix: "runner".to_string(),
            deliver_group: None,
            max_in_flight: 128,
            ack_timeout_ms: 30_000,
            max_deliver: 10,

            saga_manifest_path: "./sagas.yaml".to_string(),
            effects_manifest_path: "./effects.yaml".to_string(),

            outbox_scan_interval_ms: 200,
            outbox_batch_size: 256,
            outbox_max_in_flight: 512,
            outbox_max_in_flight_per_tenant: 32,

            schedule_scan_interval_ms: 250,
            schedule_batch_size: 256,

            effect_timeout_ms: 30_000,
            // A single attempt by default: no automatic retries.
            effect_retry_max_attempts: 1,
            effect_retry_backoff_ms: 250,

            http_addr: "0.0.0.0:8080".to_string(),

            // Test-only crash injection, always off by default.
            test_saga_crash_after_commit: false,
            test_effect_crash_after_dedupe_before_ack: false,
            test_outbox_crash_after_dispatch: false,
        }
    }
}
|
||||
|
||||
impl Settings {
|
||||
pub fn from_env() -> Result<Self, std::env::VarError> {
|
||||
let mut settings = Self::default();
|
||||
settings.apply_env_overrides();
|
||||
Ok(settings)
|
||||
}
|
||||
|
||||
pub fn from_yaml(yaml: &str) -> Result<Self, serde_yaml::Error> {
|
||||
serde_yaml::from_str(yaml)
|
||||
}
|
||||
|
||||
pub fn from_toml(toml_str: &str) -> Result<Self, toml::de::Error> {
|
||||
toml::from_str(toml_str)
|
||||
}
|
||||
|
||||
pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
|
||||
serde_json::from_str(json)
|
||||
}
|
||||
|
||||
pub fn from_file(path: impl AsRef<Path>) -> Result<Self, SettingsLoadError> {
|
||||
let path = path.as_ref();
|
||||
let raw = std::fs::read_to_string(path)?;
|
||||
let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
|
||||
|
||||
match ext {
|
||||
"yaml" | "yml" => Ok(Self::from_yaml(&raw)?),
|
||||
"toml" => Ok(Self::from_toml(&raw)?),
|
||||
"json" => Ok(Self::from_json(&raw)?),
|
||||
_ => Err(SettingsLoadError::UnsupportedFormat {
|
||||
path: path.display().to_string(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn load_from_file_with_env_overrides(
|
||||
path: impl AsRef<Path>,
|
||||
) -> Result<Self, SettingsLoadError> {
|
||||
let mut settings = Self::from_file(path)?;
|
||||
settings.apply_env_overrides();
|
||||
Ok(settings)
|
||||
}
|
||||
|
||||
fn apply_env_overrides(&mut self) {
|
||||
if let Ok(url) = std::env::var("RUNNER_NATS_URL") {
|
||||
self.nats_url = url;
|
||||
}
|
||||
|
||||
if let Ok(path) = std::env::var("RUNNER_STORAGE_PATH") {
|
||||
self.storage_path = path;
|
||||
}
|
||||
|
||||
if let Ok(mode) = std::env::var("RUNNER_MODE") {
|
||||
self.mode = match mode.trim().to_ascii_lowercase().as_str() {
|
||||
"saga" => RunnerMode::Saga,
|
||||
"effect" => RunnerMode::Effect,
|
||||
"combined" => RunnerMode::Combined,
|
||||
_ => self.mode,
|
||||
};
|
||||
}
|
||||
|
||||
if let Ok(enabled) = std::env::var("RUNNER_MULTI_TENANT") {
|
||||
if let Ok(value) = enabled.parse() {
|
||||
self.multi_tenant_enabled = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(default_tenant_id) = std::env::var("RUNNER_DEFAULT_TENANT_ID") {
|
||||
if default_tenant_id.is_empty() {
|
||||
self.default_tenant_id = None;
|
||||
} else {
|
||||
self.default_tenant_id = Some(default_tenant_id);
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(list) = std::env::var("RUNNER_TENANT_ALLOWLIST") {
|
||||
let values = list
|
||||
.split(',')
|
||||
.map(|s| s.trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect::<Vec<_>>();
|
||||
self.tenant_allowlist = values;
|
||||
}
|
||||
|
||||
if let Ok(bucket) = std::env::var("RUNNER_TENANT_PLACEMENT_BUCKET") {
|
||||
if bucket.trim().is_empty() {
|
||||
self.tenant_placement_bucket = None;
|
||||
} else {
|
||||
self.tenant_placement_bucket = Some(bucket);
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(shard) = std::env::var("RUNNER_SHARD_ID") {
|
||||
if shard.trim().is_empty() {
|
||||
self.shard_id = None;
|
||||
} else {
|
||||
self.shard_id = Some(shard);
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(url) = std::env::var("RUNNER_AGGREGATE_GATEWAY_URL") {
|
||||
if url.trim().is_empty() {
|
||||
self.aggregate_gateway_url = None;
|
||||
} else {
|
||||
self.aggregate_gateway_url = Some(url);
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(name) = std::env::var("RUNNER_AGGREGATE_EVENTS_STREAM") {
|
||||
self.aggregate_events_stream = name;
|
||||
}
|
||||
|
||||
if let Ok(name) = std::env::var("RUNNER_WORKFLOW_COMMANDS_STREAM") {
|
||||
self.workflow_commands_stream = name;
|
||||
}
|
||||
|
||||
if let Ok(name) = std::env::var("RUNNER_WORKFLOW_EVENTS_STREAM") {
|
||||
self.workflow_events_stream = name;
|
||||
}
|
||||
|
||||
if let Ok(filters) = std::env::var("RUNNER_SAGA_TRIGGER_SUBJECT_FILTERS") {
|
||||
let values = filters
|
||||
.split(',')
|
||||
.map(|s| s.trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect::<Vec<_>>();
|
||||
if !values.is_empty() {
|
||||
self.saga_trigger_subject_filters = values;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(filters) = std::env::var("RUNNER_EFFECT_COMMAND_SUBJECT_FILTERS") {
|
||||
let values = filters
|
||||
.split(',')
|
||||
.map(|s| s.trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect::<Vec<_>>();
|
||||
if !values.is_empty() {
|
||||
self.effect_command_subject_filters = values;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(prefix) = std::env::var("RUNNER_CONSUMER_DURABLE_PREFIX") {
|
||||
self.consumer_durable_prefix = prefix;
|
||||
}
|
||||
|
||||
if let Ok(group) = std::env::var("RUNNER_DELIVER_GROUP") {
|
||||
if group.trim().is_empty() {
|
||||
self.deliver_group = None;
|
||||
} else {
|
||||
self.deliver_group = Some(group);
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(max_in_flight) = std::env::var("RUNNER_MAX_IN_FLIGHT") {
|
||||
if let Ok(value) = max_in_flight.parse() {
|
||||
self.max_in_flight = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(ms) = std::env::var("RUNNER_ACK_TIMEOUT_MS") {
|
||||
if let Ok(value) = ms.parse() {
|
||||
self.ack_timeout_ms = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(max_deliver) = std::env::var("RUNNER_MAX_DELIVER") {
|
||||
if let Ok(value) = max_deliver.parse() {
|
||||
self.max_deliver = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(path) = std::env::var("RUNNER_SAGA_MANIFEST_PATH") {
|
||||
self.saga_manifest_path = path;
|
||||
}
|
||||
|
||||
if let Ok(path) = std::env::var("RUNNER_EFFECTS_MANIFEST_PATH") {
|
||||
self.effects_manifest_path = path;
|
||||
}
|
||||
|
||||
if let Ok(ms) = std::env::var("RUNNER_OUTBOX_SCAN_INTERVAL_MS") {
|
||||
if let Ok(value) = ms.parse() {
|
||||
self.outbox_scan_interval_ms = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(size) = std::env::var("RUNNER_OUTBOX_BATCH_SIZE") {
|
||||
if let Ok(value) = size.parse() {
|
||||
self.outbox_batch_size = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(size) = std::env::var("RUNNER_OUTBOX_MAX_IN_FLIGHT") {
|
||||
if let Ok(value) = size.parse() {
|
||||
self.outbox_max_in_flight = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(size) = std::env::var("RUNNER_OUTBOX_MAX_IN_FLIGHT_PER_TENANT") {
|
||||
if let Ok(value) = size.parse() {
|
||||
self.outbox_max_in_flight_per_tenant = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(ms) = std::env::var("RUNNER_SCHEDULE_SCAN_INTERVAL_MS") {
|
||||
if let Ok(value) = ms.parse() {
|
||||
self.schedule_scan_interval_ms = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(size) = std::env::var("RUNNER_SCHEDULE_BATCH_SIZE") {
|
||||
if let Ok(value) = size.parse() {
|
||||
self.schedule_batch_size = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(ms) = std::env::var("RUNNER_EFFECT_TIMEOUT_MS") {
|
||||
if let Ok(value) = ms.parse() {
|
||||
self.effect_timeout_ms = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(attempts) = std::env::var("RUNNER_EFFECT_RETRY_MAX_ATTEMPTS") {
|
||||
if let Ok(value) = attempts.parse() {
|
||||
self.effect_retry_max_attempts = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(ms) = std::env::var("RUNNER_EFFECT_RETRY_BACKOFF_MS") {
|
||||
if let Ok(value) = ms.parse() {
|
||||
self.effect_retry_backoff_ms = value;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(addr) = std::env::var("RUNNER_HTTP_ADDR") {
|
||||
if !addr.trim().is_empty() {
|
||||
self.http_addr = addr;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(v) = std::env::var("RUNNER_TEST_SAGA_CRASH_AFTER_COMMIT") {
|
||||
self.test_saga_crash_after_commit =
|
||||
matches!(v.trim().to_ascii_lowercase().as_str(), "1" | "true" | "yes");
|
||||
}
|
||||
if let Ok(v) = std::env::var("RUNNER_TEST_EFFECT_CRASH_AFTER_DEDUPE_BEFORE_ACK") {
|
||||
self.test_effect_crash_after_dedupe_before_ack =
|
||||
matches!(v.trim().to_ascii_lowercase().as_str(), "1" | "true" | "yes");
|
||||
}
|
||||
if let Ok(v) = std::env::var("RUNNER_TEST_OUTBOX_CRASH_AFTER_DISPATCH") {
|
||||
self.test_outbox_crash_after_dispatch =
|
||||
matches!(v.trim().to_ascii_lowercase().as_str(), "1" | "true" | "yes");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn validate(&self) -> Result<(), String> {
|
||||
if self.nats_url.is_empty() {
|
||||
return Err("NATS URL is required".to_string());
|
||||
}
|
||||
if self.storage_path.is_empty() {
|
||||
return Err("Storage path is required".to_string());
|
||||
}
|
||||
if self.aggregate_events_stream.is_empty() {
|
||||
return Err("Aggregate events stream name is required".to_string());
|
||||
}
|
||||
if matches!(self.mode, RunnerMode::Saga | RunnerMode::Combined)
|
||||
&& self.saga_trigger_subject_filters.is_empty()
|
||||
{
|
||||
return Err("At least one saga trigger subject filter is required".to_string());
|
||||
}
|
||||
if matches!(self.mode, RunnerMode::Effect | RunnerMode::Combined)
|
||||
&& self.effect_command_subject_filters.is_empty()
|
||||
{
|
||||
return Err("At least one effect command subject filter is required".to_string());
|
||||
}
|
||||
if self.consumer_durable_prefix.trim().is_empty() {
|
||||
return Err("Consumer durable prefix is required".to_string());
|
||||
}
|
||||
if self.max_in_flight == 0 {
|
||||
return Err("Max in-flight must be > 0".to_string());
|
||||
}
|
||||
if self.ack_timeout_ms == 0 {
|
||||
return Err("Ack timeout must be > 0".to_string());
|
||||
}
|
||||
if self.outbox_batch_size == 0 || self.schedule_batch_size == 0 {
|
||||
return Err("Batch sizes must be > 0".to_string());
|
||||
}
|
||||
if self.outbox_max_in_flight == 0 || self.outbox_max_in_flight_per_tenant == 0 {
|
||||
return Err("Outbox max in-flight must be > 0".to_string());
|
||||
}
|
||||
if self.effect_timeout_ms == 0 {
|
||||
return Err("Effect timeout must be > 0".to_string());
|
||||
}
|
||||
if self.effect_retry_max_attempts == 0 {
|
||||
return Err("Effect retry max attempts must be > 0".to_string());
|
||||
}
|
||||
|
||||
if matches!(self.mode, RunnerMode::Saga | RunnerMode::Combined) {
|
||||
if self.saga_manifest_path.trim().is_empty() {
|
||||
return Err("Saga manifest path is required".to_string());
|
||||
}
|
||||
let manifest = crate::saga::SagaManifest::from_file(&self.saga_manifest_path)
|
||||
.map_err(|e| format!("Failed to load saga manifest: {}", e))?;
|
||||
manifest
|
||||
.validate()
|
||||
.map_err(|e| format!("Invalid saga manifest: {}", e))?;
|
||||
}
|
||||
|
||||
if matches!(self.mode, RunnerMode::Effect | RunnerMode::Combined) {
|
||||
if self.effects_manifest_path.trim().is_empty() {
|
||||
return Err("Effects manifest path is required".to_string());
|
||||
}
|
||||
let manifest = crate::effects::EffectsManifest::from_file(&self.effects_manifest_path)
|
||||
.map_err(|e| format!("Failed to load effects manifest: {}", e))?;
|
||||
manifest
|
||||
.validate()
|
||||
.map_err(|e| format!("Invalid effects manifest: {}", e))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors produced while loading `Settings` from a config file.
#[derive(Debug, thiserror::Error)]
pub enum SettingsLoadError {
    /// The file could not be read from disk.
    #[error("Failed to read config file: {0}")]
    Io(#[from] std::io::Error),
    #[error("Failed to parse YAML config: {0}")]
    Yaml(#[from] serde_yaml::Error),
    #[error("Failed to parse TOML config: {0}")]
    Toml(#[from] toml::de::Error),
    #[error("Failed to parse JSON config: {0}")]
    Json(#[from] serde_json::Error),
    /// The file extension was not one of yaml/yml/toml/json.
    #[error("Unsupported config format: {path}")]
    UnsupportedFormat { path: String },
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Serialises access to process-global environment variables: #[test]
    // functions run on multiple threads, and set_var/remove_var in one test
    // would otherwise race with reads in another.
    fn env_lock() -> std::sync::MutexGuard<'static, ()> {
        static LOCK: std::sync::OnceLock<std::sync::Mutex<()>> = std::sync::OnceLock::new();
        LOCK.get_or_init(|| std::sync::Mutex::new(()))
            .lock()
            .unwrap()
    }

    #[test]
    fn settings_from_env() {
        let _guard = env_lock();
        std::env::set_var("RUNNER_NATS_URL", "nats://localhost:4222");
        let settings = Settings::from_env().unwrap();
        assert_eq!(settings.nats_url, "nats://localhost:4222");
        // Clean up so later tests see a pristine environment.
        std::env::remove_var("RUNNER_NATS_URL");
    }

    #[test]
    fn tenant_allowlist_overrides_subject_filters() {
        let _guard = env_lock();
        std::env::set_var("RUNNER_TENANT_ALLOWLIST", "t1,t2");
        let settings = Settings::from_env().unwrap();
        assert_eq!(
            settings.tenant_allowlist,
            vec!["t1".to_string(), "t2".to_string()]
        );
        std::env::remove_var("RUNNER_TENANT_ALLOWLIST");
    }

    #[test]
    fn settings_validation_catches_missing_required() {
        // An empty NATS URL must be rejected by validate().
        let settings = Settings {
            nats_url: "".to_string(),
            ..Default::default()
        };
        assert!(settings.validate().is_err());
    }

    #[test]
    fn settings_is_clone_debug() {
        // Compile-time check that Settings keeps its Clone + Debug derives.
        fn assert_clone_debug<T: Clone + std::fmt::Debug>() {}
        assert_clone_debug::<Settings>();
    }
}
|
||||
79
runner/src/effects/manifest.rs
Normal file
79
runner/src/effects/manifest.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use std::path::Path;
|
||||
|
||||
/// Declarative list of effect definitions, loaded from a YAML/TOML/JSON file.
///
/// `#[serde(default)]` makes an empty manifest (no `effects` key) valid.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(default)]
pub struct EffectsManifest {
    pub effects: Vec<EffectDefinition>,
}
|
||||
|
||||
impl EffectsManifest {
|
||||
pub fn validate(&self) -> Result<(), String> {
|
||||
for effect in &self.effects {
|
||||
effect.validate()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn from_yaml(yaml: &str) -> Result<Self, serde_yaml::Error> {
|
||||
serde_yaml::from_str(yaml)
|
||||
}
|
||||
|
||||
pub fn from_toml(toml_str: &str) -> Result<Self, toml::de::Error> {
|
||||
toml::from_str(toml_str)
|
||||
}
|
||||
|
||||
pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
|
||||
serde_json::from_str(json)
|
||||
}
|
||||
|
||||
pub fn from_file(path: impl AsRef<Path>) -> Result<Self, EffectsManifestLoadError> {
|
||||
let path = path.as_ref();
|
||||
let raw = std::fs::read_to_string(path)?;
|
||||
let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
|
||||
|
||||
match ext {
|
||||
"yaml" | "yml" => Ok(Self::from_yaml(&raw)?),
|
||||
"toml" => Ok(Self::from_toml(&raw)?),
|
||||
"json" => Ok(Self::from_json(&raw)?),
|
||||
_ => Err(EffectsManifestLoadError::UnsupportedFormat {
|
||||
path: path.display().to_string(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A single named effect and the provider that executes it.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(default)]
pub struct EffectDefinition {
    /// Unique effect name; must be non-blank (see `validate`).
    pub name: String,
    /// Provider identifier; must be non-blank (see `validate`).
    pub provider: String,
    /// Free-form, provider-specific configuration.
    pub config: Value,
}
|
||||
|
||||
impl EffectDefinition {
|
||||
pub fn validate(&self) -> Result<(), String> {
|
||||
if self.name.trim().is_empty() {
|
||||
return Err("Effect name is required".to_string());
|
||||
}
|
||||
if self.provider.trim().is_empty() {
|
||||
return Err(format!("Effect '{}' must specify provider", self.name));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors produced while loading an `EffectsManifest` from a file.
#[derive(Debug, thiserror::Error)]
pub enum EffectsManifestLoadError {
    /// The file could not be read from disk.
    #[error("Failed to read manifest file: {0}")]
    Io(#[from] std::io::Error),
    #[error("Failed to parse YAML manifest: {0}")]
    Yaml(#[from] serde_yaml::Error),
    #[error("Failed to parse TOML manifest: {0}")]
    Toml(#[from] toml::de::Error),
    #[error("Failed to parse JSON manifest: {0}")]
    Json(#[from] serde_json::Error),
    /// The file extension was not one of yaml/yml/toml/json.
    #[error("Unsupported manifest format: {path}")]
    UnsupportedFormat { path: String },
}
|
||||
9
runner/src/effects/mod.rs
Normal file
9
runner/src/effects/mod.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
mod manifest;
|
||||
mod providers;
|
||||
mod runtime;
|
||||
mod worker;
|
||||
|
||||
pub use manifest::{EffectDefinition, EffectsManifest};
|
||||
pub use providers::{EffectProvider, ProviderRegistry};
|
||||
pub use runtime::EffectRuntime;
|
||||
pub use worker::run_effect_worker;
|
||||
527
runner/src/effects/providers/email.rs
Normal file
527
runner/src/effects/providers/email.rs
Normal file
@@ -0,0 +1,527 @@
|
||||
use crate::types::{EffectCommandEnvelope, EffectResultEnvelope, EffectResultType, RunnerError};
|
||||
use aws_config::Region;
|
||||
use aws_sdk_sesv2::types::{Body, Content, Destination, EmailContent, Message};
|
||||
use chrono::Utc;
|
||||
use futures::future::BoxFuture;
|
||||
use lettre::message::{
|
||||
header::ContentType, Mailbox, Message as LettreMessage, MultiPart, SinglePart,
|
||||
};
|
||||
use lettre::{AsyncSmtpTransport, AsyncTransport, Tokio1Executor};
|
||||
use serde::Deserialize;
|
||||
use serde_json::Value;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Backend selection for the email effect provider.
///
/// Deserialized from the effect's `config` value; the `backend` tag selects
/// the variant (`resend`, `postmark`, `smtp`, `ses`). Secrets are referenced
/// by environment-variable *name* (the `*_env` fields), never stored inline.
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "backend", rename_all = "snake_case")]
pub enum EmailProviderConfig {
    Resend {
        /// Name of the env var holding the Resend API key.
        api_key_env: String,
        /// Default sender; a `from` in the command payload takes precedence.
        #[serde(default)]
        from: Option<String>,
    },
    Postmark {
        /// Name of the env var holding the Postmark server token.
        server_token_env: String,
        /// Default sender; a `from` in the command payload takes precedence.
        #[serde(default)]
        from: Option<String>,
    },
    Smtp {
        /// Name of the env var holding the SMTP connection URL.
        url_env: String,
    },
    Ses {
        /// AWS region name for the SES client.
        region: String,
        /// Default sender; a `from` in the command payload takes precedence.
        #[serde(default)]
        from: Option<String>,
        /// Optional SES configuration set — TODO confirm how send_ses uses it
        /// (its body is outside this view).
        #[serde(default)]
        configuration_set: Option<String>,
    },
}
|
||||
|
||||
/// Effect provider that sends email through one of the configured backends.
#[derive(Debug, Clone)]
pub struct EmailProvider {
    // Which backend to use and its settings.
    config: EmailProviderConfig,
    // Shared HTTP client for the REST backends (Resend, Postmark);
    // Arc makes cloning the provider cheap.
    client: Arc<reqwest::Client>,
}
|
||||
|
||||
impl EmailProvider {
|
||||
pub fn from_config_value(config: Value) -> Result<Self, RunnerError> {
|
||||
let cfg: EmailProviderConfig =
|
||||
serde_json::from_value(config).map_err(|e| RunnerError::DecodeError(e.to_string()))?;
|
||||
Ok(Self {
|
||||
config: cfg,
|
||||
client: Arc::new(reqwest::Client::new()),
|
||||
})
|
||||
}
|
||||
|
||||
fn env_required(name: &str) -> Result<String, RunnerError> {
|
||||
std::env::var(name)
|
||||
.map_err(|_| RunnerError::RuntimeError(format!("Missing required env var: {}", name)))
|
||||
}
|
||||
}
|
||||
|
||||
impl super::EffectProvider for EmailProvider {
|
||||
fn execute(
|
||||
&self,
|
||||
cmd: EffectCommandEnvelope,
|
||||
) -> BoxFuture<'static, Result<EffectResultEnvelope, RunnerError>> {
|
||||
let config = self.config.clone();
|
||||
let client = self.client.clone();
|
||||
Box::pin(async move {
|
||||
let payload: SendEmailPayload = serde_json::from_value(cmd.payload.clone())
|
||||
.map_err(|e| RunnerError::DecodeError(e.to_string()))?;
|
||||
|
||||
match config {
|
||||
EmailProviderConfig::Resend { api_key_env, from } => {
|
||||
let api_key = Self::env_required(&api_key_env)?;
|
||||
let from = payload.from.clone().or(from).ok_or_else(|| {
|
||||
RunnerError::RuntimeError(
|
||||
"Missing 'from' (payload.from or config.from)".to_string(),
|
||||
)
|
||||
})?;
|
||||
let res = send_resend(&client, api_key, from, payload).await?;
|
||||
Ok(EffectResultEnvelope {
|
||||
tenant_id: cmd.tenant_id,
|
||||
command_id: cmd.command_id,
|
||||
effect_name: cmd.effect_name,
|
||||
result_type: EffectResultType::Succeeded,
|
||||
payload: res,
|
||||
timestamp: Utc::now(),
|
||||
metadata: cmd.metadata,
|
||||
})
|
||||
}
|
||||
EmailProviderConfig::Postmark {
|
||||
server_token_env,
|
||||
from,
|
||||
} => {
|
||||
let token = Self::env_required(&server_token_env)?;
|
||||
let from = payload.from.clone().or(from).ok_or_else(|| {
|
||||
RunnerError::RuntimeError(
|
||||
"Missing 'from' (payload.from or config.from)".to_string(),
|
||||
)
|
||||
})?;
|
||||
let res = send_postmark(&client, token, from, payload).await?;
|
||||
Ok(EffectResultEnvelope {
|
||||
tenant_id: cmd.tenant_id,
|
||||
command_id: cmd.command_id,
|
||||
effect_name: cmd.effect_name,
|
||||
result_type: EffectResultType::Succeeded,
|
||||
payload: res,
|
||||
timestamp: Utc::now(),
|
||||
metadata: cmd.metadata,
|
||||
})
|
||||
}
|
||||
EmailProviderConfig::Smtp { url_env } => {
|
||||
let url = Self::env_required(&url_env)?;
|
||||
let res = send_smtp(url, payload).await?;
|
||||
Ok(EffectResultEnvelope {
|
||||
tenant_id: cmd.tenant_id,
|
||||
command_id: cmd.command_id,
|
||||
effect_name: cmd.effect_name,
|
||||
result_type: EffectResultType::Succeeded,
|
||||
payload: res,
|
||||
timestamp: Utc::now(),
|
||||
metadata: cmd.metadata,
|
||||
})
|
||||
}
|
||||
EmailProviderConfig::Ses {
|
||||
region,
|
||||
from,
|
||||
configuration_set,
|
||||
} => {
|
||||
let from = payload.from.clone().or(from).ok_or_else(|| {
|
||||
RunnerError::RuntimeError(
|
||||
"Missing 'from' (payload.from or config.from)".to_string(),
|
||||
)
|
||||
})?;
|
||||
let res = send_ses(region, from, configuration_set, payload).await?;
|
||||
Ok(EffectResultEnvelope {
|
||||
tenant_id: cmd.tenant_id,
|
||||
command_id: cmd.command_id,
|
||||
effect_name: cmd.effect_name,
|
||||
result_type: EffectResultType::Succeeded,
|
||||
payload: res,
|
||||
timestamp: Utc::now(),
|
||||
metadata: cmd.metadata,
|
||||
})
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Accepts either a single value or a list in the incoming JSON
/// (e.g. `"a@b.c"` or `["a@b.c", "d@e.f"]`) via serde's untagged matching.
#[derive(Debug, Clone, Deserialize)]
#[serde(untagged)]
enum OneOrMany<T> {
    One(T),
    Many(Vec<T>),
}
|
||||
|
||||
/// Payload of a send-email effect command.
///
/// Only `to` and `subject` are mandatory in the JSON; every other field
/// defaults to absent. At least one of `text`/`html` is required by the
/// senders at execution time.
#[derive(Debug, Clone, Deserialize)]
pub struct SendEmailPayload {
    /// Sender; optional here because the provider config may supply a default
    /// (SMTP requires it in the payload).
    #[serde(default)]
    from: Option<String>,
    /// Primary recipient(s): a single address or a list.
    to: OneOrMany<String>,
    #[serde(default)]
    cc: Option<OneOrMany<String>>,
    #[serde(default)]
    bcc: Option<OneOrMany<String>>,
    #[serde(default)]
    reply_to: Option<String>,
    subject: String,
    /// Plain-text body.
    #[serde(default)]
    text: Option<String>,
    /// HTML body.
    #[serde(default)]
    html: Option<String>,
    /// Key/value tags; Postmark sends these as `Metadata`.
    #[serde(default)]
    tags: Option<HashMap<String, String>>,
    /// Extra message headers; Postmark sends these as `Headers`.
    #[serde(default)]
    headers: Option<HashMap<String, String>>,
}
|
||||
|
||||
impl SendEmailPayload {
|
||||
fn to_vec(one_or_many: OneOrMany<String>) -> Vec<String> {
|
||||
match one_or_many {
|
||||
OneOrMany::One(v) => vec![v],
|
||||
OneOrMany::Many(v) => v,
|
||||
}
|
||||
}
|
||||
|
||||
fn to_list(&self) -> Vec<String> {
|
||||
Self::to_vec(self.to.clone())
|
||||
}
|
||||
|
||||
fn cc_list(&self) -> Vec<String> {
|
||||
self.cc.clone().map(Self::to_vec).unwrap_or_default()
|
||||
}
|
||||
|
||||
fn bcc_list(&self) -> Vec<String> {
|
||||
self.bcc.clone().map(Self::to_vec).unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
// Send an email through the Resend HTTP API (POST /emails).
// Optional fields (text/html/reply_to/cc/bcc) are only included in the
// request body when present. On success, returns Resend's JSON response.
async fn send_resend(
    client: &reqwest::Client,
    api_key: String,
    from: String,
    payload: SendEmailPayload,
) -> Result<Value, RunnerError> {
    let url = "https://api.resend.com/emails";
    let to = payload.to_list();
    let cc = payload.cc_list();
    let bcc = payload.bcc_list();
    let subject = payload.subject;
    let text = payload.text;
    let html = payload.html;
    let reply_to = payload.reply_to;
    // Required fields first; everything else is appended conditionally.
    let mut body = serde_json::json!({
        "from": from,
        "to": to,
        "subject": subject,
    });
    if let Some(text) = text {
        body["text"] = Value::String(text);
    }
    if let Some(html) = html {
        body["html"] = Value::String(html);
    }
    if let Some(reply_to) = reply_to {
        body["reply_to"] = Value::String(reply_to);
    }
    if !cc.is_empty() {
        body["cc"] = serde_json::to_value(cc).unwrap_or(Value::Null);
    }
    if !bcc.is_empty() {
        body["bcc"] = serde_json::to_value(bcc).unwrap_or(Value::Null);
    }

    let resp = client
        .post(url)
        .bearer_auth(api_key)
        .json(&body)
        .send()
        .await
        .map_err(|e| RunnerError::RuntimeError(e.to_string()))?;

    // Read the body before checking status so error responses can be
    // included in the error message.
    let status = resp.status();
    let text = resp
        .text()
        .await
        .map_err(|e| RunnerError::RuntimeError(e.to_string()))?;
    if !status.is_success() {
        return Err(RunnerError::RuntimeError(format!(
            "Resend API error (status={}): {}",
            status.as_u16(),
            text
        )));
    }
    serde_json::from_str(&text).map_err(|e| RunnerError::DecodeError(e.to_string()))
}
|
||||
|
||||
// Send an email through the Postmark HTTP API (POST /email).
// Postmark expects comma-joined To/Cc/Bcc strings, tags under "Metadata",
// and custom headers as an array of {Name, Value} objects.
async fn send_postmark(
    client: &reqwest::Client,
    token: String,
    from: String,
    payload: SendEmailPayload,
) -> Result<Value, RunnerError> {
    let url = "https://api.postmarkapp.com/email";
    let to = payload.to_list();
    let cc = payload.cc_list();
    let bcc = payload.bcc_list();
    let subject = payload.subject;
    let text = payload.text;
    let html = payload.html;
    let reply_to = payload.reply_to;
    let tags = payload.tags;
    let headers = payload.headers;
    // Required fields first; everything else is appended conditionally.
    let mut body = serde_json::json!({
        "From": from,
        "To": to.join(","),
        "Subject": subject,
    });
    if let Some(text) = text {
        body["TextBody"] = Value::String(text);
    }
    if let Some(html) = html {
        body["HtmlBody"] = Value::String(html);
    }
    if let Some(reply_to) = reply_to {
        body["ReplyTo"] = Value::String(reply_to);
    }
    if !cc.is_empty() {
        body["Cc"] = Value::String(cc.join(","));
    }
    if !bcc.is_empty() {
        body["Bcc"] = Value::String(bcc.join(","));
    }
    if let Some(tags) = tags {
        body["Metadata"] = serde_json::to_value(tags).unwrap_or(Value::Null);
    }
    if let Some(headers) = headers {
        // Postmark wants headers as [{"Name": ..., "Value": ...}, ...].
        let headers_arr = headers
            .into_iter()
            .map(|(k, v)| serde_json::json!({ "Name": k, "Value": v }))
            .collect::<Vec<_>>();
        body["Headers"] = serde_json::to_value(headers_arr).unwrap_or(Value::Null);
    }

    let resp = client
        .post(url)
        .header("X-Postmark-Server-Token", token)
        .json(&body)
        .send()
        .await
        .map_err(|e| RunnerError::RuntimeError(e.to_string()))?;

    // Read the body before checking status so error responses can be
    // included in the error message.
    let status = resp.status();
    let text = resp
        .text()
        .await
        .map_err(|e| RunnerError::RuntimeError(e.to_string()))?;
    if !status.is_success() {
        return Err(RunnerError::RuntimeError(format!(
            "Postmark API error (status={}): {}",
            status.as_u16(),
            text
        )));
    }
    serde_json::from_str(&text).map_err(|e| RunnerError::DecodeError(e.to_string()))
}
|
||||
|
||||
// Send an email over SMTP via lettre, connecting with the given URL.
// Unlike the REST backends, 'from' must come from the payload itself.
// Builds text-only, html-only, or multipart/alternative depending on which
// bodies are present; at least one is required.
async fn send_smtp(url: String, payload: SendEmailPayload) -> Result<Value, RunnerError> {
    let from = payload.from.clone().ok_or_else(|| {
        RunnerError::RuntimeError("Missing 'from' in payload for SMTP backend".to_string())
    })?;
    let to = payload.to_list();
    let cc = payload.cc_list();
    let bcc = payload.bcc_list();
    let subject = payload.subject;
    let reply_to = payload.reply_to;
    let text = payload.text;
    let html = payload.html;
    if to.is_empty() {
        return Err(RunnerError::RuntimeError("Missing 'to'".to_string()));
    }
    // Every address is parsed into a Mailbox; a malformed address fails the
    // whole send with a DecodeError.
    let mut builder = LettreMessage::builder()
        .from(
            from.parse::<Mailbox>()
                .map_err(|e| RunnerError::DecodeError(e.to_string()))?,
        )
        .subject(subject);
    for addr in to {
        builder = builder.to(addr
            .parse::<Mailbox>()
            .map_err(|e| RunnerError::DecodeError(e.to_string()))?);
    }
    for addr in cc {
        builder = builder.cc(addr
            .parse::<Mailbox>()
            .map_err(|e| RunnerError::DecodeError(e.to_string()))?);
    }
    for addr in bcc {
        builder = builder.bcc(
            addr.parse::<Mailbox>()
                .map_err(|e| RunnerError::DecodeError(e.to_string()))?,
        );
    }
    if let Some(reply_to) = reply_to {
        builder = builder.reply_to(
            reply_to
                .parse::<Mailbox>()
                .map_err(|e| RunnerError::DecodeError(e.to_string()))?,
        );
    }

    // Choose the MIME structure from which bodies are present.
    let message = match (text, html) {
        (Some(text), Some(html)) => builder
            .multipart(
                MultiPart::alternative()
                    .singlepart(
                        SinglePart::builder()
                            .header(ContentType::TEXT_PLAIN)
                            .body(text),
                    )
                    .singlepart(
                        SinglePart::builder()
                            .header(ContentType::TEXT_HTML)
                            .body(html),
                    ),
            )
            .map_err(|e| RunnerError::RuntimeError(e.to_string()))?,
        (Some(text), None) => builder
            .singlepart(
                SinglePart::builder()
                    .header(ContentType::TEXT_PLAIN)
                    .body(text),
            )
            .map_err(|e| RunnerError::RuntimeError(e.to_string()))?,
        (None, Some(html)) => builder
            .singlepart(
                SinglePart::builder()
                    .header(ContentType::TEXT_HTML)
                    .body(html),
            )
            .map_err(|e| RunnerError::RuntimeError(e.to_string()))?,
        (None, None) => {
            return Err(RunnerError::RuntimeError(
                "Missing 'text' or 'html'".to_string(),
            ))
        }
    };

    // NOTE(review): a fresh transport is built per send — presumably fine at
    // current volume; confirm before high-throughput use.
    let transport = AsyncSmtpTransport::<Tokio1Executor>::from_url(&url)
        .map_err(|e| RunnerError::RuntimeError(e.to_string()))?
        .build();

    transport
        .send(message)
        .await
        .map_err(|e| RunnerError::RuntimeError(e.to_string()))?;

    Ok(serde_json::json!({ "ok": true }))
}
|
||||
|
||||
async fn send_ses(
|
||||
region: String,
|
||||
from: String,
|
||||
configuration_set: Option<String>,
|
||||
payload: SendEmailPayload,
|
||||
) -> Result<Value, RunnerError> {
|
||||
let to_addresses = payload.to_list();
|
||||
let cc_addresses = payload.cc_list();
|
||||
let bcc_addresses = payload.bcc_list();
|
||||
let subject_value = payload.subject;
|
||||
let text_value = payload.text;
|
||||
let html_value = payload.html;
|
||||
if to_addresses.is_empty() {
|
||||
return Err(RunnerError::RuntimeError("Missing 'to'".to_string()));
|
||||
}
|
||||
|
||||
let cfg = aws_config::from_env()
|
||||
.region(Region::new(region))
|
||||
.load()
|
||||
.await;
|
||||
let client = aws_sdk_sesv2::Client::new(&cfg);
|
||||
|
||||
let subject = Content::builder()
|
||||
.data(subject_value)
|
||||
.build()
|
||||
.map_err(|e| RunnerError::RuntimeError(e.to_string()))?;
|
||||
|
||||
let mut body_builder = Body::builder();
|
||||
if let Some(text) = text_value {
|
||||
let content = Content::builder()
|
||||
.data(text)
|
||||
.build()
|
||||
.map_err(|e| RunnerError::RuntimeError(e.to_string()))?;
|
||||
body_builder = body_builder.text(content);
|
||||
}
|
||||
if let Some(html) = html_value {
|
||||
let content = Content::builder()
|
||||
.data(html)
|
||||
.build()
|
||||
.map_err(|e| RunnerError::RuntimeError(e.to_string()))?;
|
||||
body_builder = body_builder.html(content);
|
||||
}
|
||||
|
||||
let body = body_builder.build();
|
||||
|
||||
let message = Message::builder().subject(subject).body(body).build();
|
||||
|
||||
let email_content = EmailContent::builder().simple(message).build();
|
||||
|
||||
let dest = Destination::builder()
|
||||
.set_to_addresses(Some(to_addresses))
|
||||
.set_cc_addresses(if cc_addresses.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(cc_addresses)
|
||||
})
|
||||
.set_bcc_addresses(if bcc_addresses.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(bcc_addresses)
|
||||
})
|
||||
.build();
|
||||
|
||||
let mut req = client
|
||||
.send_email()
|
||||
.from_email_address(from)
|
||||
.destination(dest)
|
||||
.content(email_content);
|
||||
if let Some(cs) = configuration_set {
|
||||
req = req.configuration_set_name(cs);
|
||||
}
|
||||
|
||||
let out = req
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| RunnerError::RuntimeError(e.to_string()))?;
|
||||
|
||||
Ok(serde_json::json!({ "message_id": out.message_id() }))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A resend-backend config document deserializes into the `Resend`
    /// variant with its fields intact.
    #[test]
    fn config_parses_resend() {
        let raw =
            serde_json::json!({"backend":"resend","api_key_env":"RESEND_API_KEY","from":"a@b.com"});
        let provider = EmailProvider::from_config_value(raw).unwrap();
        match provider.config {
            EmailProviderConfig::Resend { api_key_env, from } => {
                assert_eq!(api_key_env, "RESEND_API_KEY");
                assert_eq!(from.as_deref(), Some("a@b.com"));
            }
            _ => panic!("unexpected backend"),
        }
    }

    /// A payload with a single `to` string normalizes into a
    /// one-element recipient list.
    #[test]
    fn payload_requires_subject_and_to() {
        let doc = serde_json::json!({"subject":"hi","to":"x@y.com","text":"ok"});
        let parsed: SendEmailPayload = serde_json::from_value(doc).unwrap();
        assert_eq!(parsed.to_list(), vec!["x@y.com".to_string()]);
    }
}
|
||||
32
runner/src/effects/providers/mod.rs
Normal file
32
runner/src/effects/providers/mod.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
use crate::types::{EffectCommandEnvelope, EffectResultEnvelope, RunnerError};
|
||||
use futures::future::BoxFuture;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub(crate) mod email;
|
||||
|
||||
/// A pluggable backend capable of executing a single effect command.
///
/// Implementations take the full command envelope and return the result
/// envelope (or an error) as a `'static` boxed future, so callers can
/// hold the work without borrowing the provider across an await.
pub trait EffectProvider: Send + Sync {
    fn execute(
        &self,
        cmd: EffectCommandEnvelope,
    ) -> BoxFuture<'static, Result<EffectResultEnvelope, RunnerError>>;
}
|
||||
|
||||
/// Name-keyed collection of shared [`EffectProvider`] instances.
#[derive(Default, Clone)]
pub struct ProviderRegistry {
    // Arc-wrapped so a provider can be handed to spawned tasks without
    // cloning the underlying implementation.
    providers: HashMap<String, Arc<dyn EffectProvider>>,
}
|
||||
|
||||
impl ProviderRegistry {
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
pub fn register(&mut self, name: impl Into<String>, provider: Arc<dyn EffectProvider>) {
|
||||
self.providers.insert(name.into(), provider);
|
||||
}
|
||||
|
||||
pub fn get(&self, name: &str) -> Option<Arc<dyn EffectProvider>> {
|
||||
self.providers.get(name).cloned()
|
||||
}
|
||||
}
|
||||
39
runner/src/effects/runtime.rs
Normal file
39
runner/src/effects/runtime.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
use crate::effects::ProviderRegistry;
|
||||
use crate::types::{EffectCommandEnvelope, EffectResultEnvelope, RunnerError};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Routes effect commands to the provider registered for them.
#[derive(Clone)]
pub struct EffectRuntime {
    // effect name -> registry key of the provider that handles it
    effect_to_provider: HashMap<String, String>,
    registry: ProviderRegistry,
}
|
||||
|
||||
// Providers are trait objects without Debug, so only the type name is
// rendered; `finish_non_exhaustive` prints `EffectRuntime { .. }`.
impl std::fmt::Debug for EffectRuntime {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("EffectRuntime").finish_non_exhaustive()
    }
}
|
||||
|
||||
impl EffectRuntime {
|
||||
pub fn new(effect_to_provider: HashMap<String, String>, registry: ProviderRegistry) -> Self {
|
||||
Self {
|
||||
effect_to_provider,
|
||||
registry,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn execute(
|
||||
&self,
|
||||
cmd: EffectCommandEnvelope,
|
||||
) -> Result<EffectResultEnvelope, RunnerError> {
|
||||
let provider_name = self
|
||||
.effect_to_provider
|
||||
.get(cmd.effect_name.as_str())
|
||||
.ok_or_else(|| RunnerError::RuntimeError("Unknown effect".to_string()))?;
|
||||
let provider = self
|
||||
.registry
|
||||
.get(provider_name)
|
||||
.ok_or_else(|| RunnerError::RuntimeError("Unknown effect provider".to_string()))?;
|
||||
provider.execute(cmd).await
|
||||
}
|
||||
}
|
||||
794
runner/src/effects/worker.rs
Normal file
794
runner/src/effects/worker.rs
Normal file
@@ -0,0 +1,794 @@
|
||||
use crate::config::Settings;
|
||||
use crate::effects::{EffectProvider, EffectsManifest, ProviderRegistry};
|
||||
use crate::observability::Metrics;
|
||||
use crate::storage::KvClient;
|
||||
use crate::stream::{ConsumerOptions, JetStreamClient};
|
||||
use crate::tenant_placement::TenantGate;
|
||||
use crate::types::{
|
||||
DedupeEffectKey, EffectCommandEnvelope, EffectResultEnvelope, EffectResultType, RunnerError,
|
||||
};
|
||||
use async_nats::jetstream::consumer::DeliverPolicy;
|
||||
use async_nats::jetstream::AckKind;
|
||||
use chrono::Utc;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::StreamExt;
|
||||
use serde_json::Value;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::watch;
|
||||
|
||||
/// Top-level effect worker entry point.
///
/// With no tenant filter and an empty allowlist this delegates to the
/// single-consumer path. Otherwise it supervises one consumer task per
/// tenant: tenants appearing in the watched set get a task spawned,
/// tenants leaving the set get their stop signal fired, and finished
/// tasks are reaped. A background task rebuilds the effect runtime
/// whenever `reload` fires. Runs until `shutdown`.
#[allow(clippy::too_many_arguments)]
pub async fn run_effect_worker(
    settings: Settings,
    storage: KvClient,
    metrics: Arc<Metrics>,
    tenant_gate: Arc<TenantGate>,
    tenant_filter: Option<watch::Receiver<HashSet<String>>>,
    reload: Arc<tokio::sync::Notify>,
    shutdown: Arc<tokio::sync::Notify>,
    draining: Arc<AtomicBool>,
) -> Result<(), RunnerError> {
    // No per-tenant routing configured: use the cheaper single-consumer
    // loop over all tenants.
    if tenant_filter.is_none() && settings.tenant_allowlist.is_empty() {
        return run_effect_worker_single(
            settings,
            storage,
            metrics,
            tenant_gate,
            reload,
            shutdown,
            draining,
        )
        .await;
    }

    let settings = Arc::new(settings);
    let manifest = load_manifest(&settings, &storage)?;

    let (effect_to_provider, registry) = build_registry(&manifest)?;
    let runtime = Arc::new(crate::effects::EffectRuntime::new(
        effect_to_provider,
        registry,
    ));
    // watch channel lets the reload task swap in a new runtime without
    // interrupting per-tenant consumers.
    let (runtime_tx, runtime_rx) = watch::channel(runtime);

    let jetstream = JetStreamClient::connect(&settings)
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string()))?;
    let publisher = JetStreamPublisher::new(jetstream.clone());
    // Without an external filter, treat the static allowlist as the
    // (never-changing) tenant set via a locally-owned watch channel.
    let mut tenant_rx = match tenant_filter {
        Some(rx) => rx,
        None => {
            let initial = settings
                .tenant_allowlist
                .iter()
                .cloned()
                .collect::<HashSet<_>>();
            let (_tx, rx) = watch::channel(initial);
            rx
        }
    };

    // tenant -> running consumer task / its stop signal
    let mut tasks: HashMap<String, tokio::task::JoinHandle<()>> = HashMap::new();
    let mut stops: HashMap<String, Arc<tokio::sync::Notify>> = HashMap::new();

    {
        // Manifest hot-reload: on each signal, re-load + re-validate and
        // publish a fresh runtime; errors keep the previous runtime.
        let settings = settings.clone();
        let storage = storage.clone();
        let reload = reload.clone();
        tokio::spawn(async move {
            loop {
                reload.notified().await;
                let manifest = match load_manifest(&settings, &storage) {
                    Ok(m) => m,
                    Err(e) => {
                        tracing::error!(error = %e, "Failed to load effects manifest on reload");
                        continue;
                    }
                };
                if let Err(e) = manifest.validate() {
                    tracing::error!(error = %e, "Invalid effects manifest on reload");
                    continue;
                }
                let runtime = match build_registry(&manifest)
                    .map(|(m, r)| crate::effects::EffectRuntime::new(m, r))
                {
                    Ok(r) => Arc::new(r),
                    Err(e) => {
                        tracing::error!(error = %e, "Failed to rebuild effect runtime on reload");
                        continue;
                    }
                };
                let _ = runtime_tx.send(runtime);
            }
        });
    }

    loop {
        // Wake on shutdown, a tenant-set change, or a 250ms tick (the
        // tick also reaps finished tasks below).
        tokio::select! {
            _ = shutdown.notified() => break,
            _ = tokio::time::sleep(Duration::from_millis(250)) => {},
            changed = tenant_rx.changed() => {
                if changed.is_err() {
                    break;
                }
            }
        }

        let tenants = tenant_rx.borrow().clone();

        // Signal stop for tenants no longer in the set; the task is
        // reaped once it finishes.
        for tenant in tasks.keys().cloned().collect::<Vec<_>>() {
            if !tenants.contains(&tenant) {
                if let Some(n) = stops.remove(&tenant) {
                    n.notify_waiters();
                }
            }
        }

        // Reap finished tasks so crashed consumers get respawned on the
        // next pass.
        for tenant in tasks
            .iter()
            .filter_map(|(t, h)| {
                if h.is_finished() {
                    Some(t.clone())
                } else {
                    None
                }
            })
            .collect::<Vec<_>>()
        {
            if let Some(h) = tasks.remove(&tenant) {
                let _ = h.await;
            }
            stops.remove(&tenant);
        }

        // Spawn a consumer for each tenant that doesn't have one yet.
        for tenant in tenants {
            if tasks.contains_key(&tenant) {
                continue;
            }

            let stop = Arc::new(tokio::sync::Notify::new());
            stops.insert(tenant.clone(), stop.clone());
            let tenant_key = tenant.clone();

            let settings = settings.clone();
            let jetstream = jetstream.clone();
            let storage = storage.clone();
            let runtime_rx = runtime_rx.clone();
            let publisher = publisher.clone();
            let metrics = metrics.clone();
            let tenant_gate = tenant_gate.clone();
            let shutdown = shutdown.clone();
            let draining = draining.clone();

            let handle = tokio::spawn(async move {
                // Per-tenant errors are logged inside; the supervisor
                // only cares that the task ended.
                let _ = run_effect_worker_for_tenant(
                    settings,
                    jetstream,
                    storage,
                    runtime_rx,
                    publisher,
                    metrics,
                    tenant_gate,
                    tenant,
                    shutdown,
                    stop,
                    draining,
                )
                .await;
            });
            tasks.insert(tenant_key, handle);
        }
    }

    // Shutdown: stop every consumer, then wait for all of them.
    for (_, n) in stops {
        n.notify_waiters();
    }
    for (_, h) in tasks {
        let _ = h.await;
    }

    Ok(())
}
|
||||
|
||||
/// Single-consumer worker path used when no tenant filtering is in
/// effect: one durable consumer over all tenants' effect subjects.
///
/// Spawns a background task that rebuilds the effect runtime whenever
/// `reload` fires, then processes messages until `shutdown`. While
/// `draining` is set, message intake pauses (polled every 50ms).
async fn run_effect_worker_single(
    settings: Settings,
    storage: KvClient,
    metrics: Arc<Metrics>,
    tenant_gate: Arc<TenantGate>,
    reload: Arc<tokio::sync::Notify>,
    shutdown: Arc<tokio::sync::Notify>,
    draining: Arc<AtomicBool>,
) -> Result<(), RunnerError> {
    let manifest = load_manifest(&settings, &storage)?;

    let (effect_to_provider, registry) = build_registry(&manifest)?;
    let runtime = Arc::new(crate::effects::EffectRuntime::new(
        effect_to_provider,
        registry,
    ));
    // watch channel lets the reload task swap in a new runtime without
    // interrupting the consume loop.
    let (runtime_tx, runtime_rx) = watch::channel(runtime);

    let jetstream = JetStreamClient::connect(&settings)
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string()))?;
    let publisher = JetStreamPublisher::new(jetstream.clone());

    let durable_name = format!("{}_effects", settings.consumer_durable_prefix);
    // Only the first configured filter is used here; the fallback
    // matches every tenant/effect subject.
    let filter_subject = settings
        .effect_command_subject_filters
        .first()
        .cloned()
        .unwrap_or_else(|| "tenant.*.effect.*.*".to_string());

    let consumer = jetstream
        .effect_command_consumer(
            &settings,
            ConsumerOptions {
                durable_name,
                filter_subject,
                deliver_policy: DeliverPolicy::All,
            },
        )
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string()))?;

    let mut messages = consumer
        .messages()
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string))?;

    {
        // Manifest hot-reload task: on each `reload` signal, re-load and
        // re-validate the manifest and publish a fresh runtime. Errors
        // leave the previous runtime in place.
        let settings = settings.clone();
        let storage = storage.clone();
        let reload = reload.clone();
        tokio::spawn(async move {
            loop {
                reload.notified().await;
                let manifest = match load_manifest(&settings, &storage) {
                    Ok(m) => m,
                    Err(e) => {
                        tracing::error!(error = %e, "Failed to load effects manifest on reload");
                        continue;
                    }
                };
                if let Err(e) = manifest.validate() {
                    tracing::error!(error = %e, "Invalid effects manifest on reload");
                    continue;
                }
                let runtime = match build_registry(&manifest)
                    .map(|(m, r)| crate::effects::EffectRuntime::new(m, r))
                {
                    Ok(r) => Arc::new(r),
                    Err(e) => {
                        tracing::error!(error = %e, "Failed to rebuild effect runtime on reload");
                        continue;
                    }
                };
                let _ = runtime_tx.send(runtime);
            }
        });
    }

    loop {
        // Drain mode: stop pulling new messages but stay responsive to
        // shutdown.
        if draining.load(Ordering::Relaxed) {
            tokio::select! {
                _ = shutdown.notified() => break,
                _ = tokio::time::sleep(Duration::from_millis(50)) => continue,
            };
        }

        let next = tokio::select! {
            _ = shutdown.notified() => break,
            msg = messages.next() => msg,
        };

        // Stream ended => consumer closed; exit cleanly.
        let Some(msg) = next else { break };
        let msg = match msg {
            Ok(m) => m,
            Err(e) => {
                tracing::error!(error = %e, "JetStream message stream error");
                continue;
            }
        };

        // Snapshot the most recently reloaded runtime for this message.
        let runtime = runtime_rx.borrow().clone();
        handle_effect_message(
            &settings,
            &storage,
            runtime,
            &publisher,
            &metrics,
            &tenant_gate,
            draining.load(Ordering::Relaxed),
            msg,
        )
        .await;
    }

    Ok(())
}
|
||||
|
||||
/// Per-tenant consumer loop: one durable consumer filtered to
/// `tenant.{tenant}.effect.*.*`, processed until `shutdown` or the
/// per-tenant `stop` signal fires.
#[allow(clippy::too_many_arguments)]
async fn run_effect_worker_for_tenant(
    settings: Arc<Settings>,
    jetstream: JetStreamClient,
    storage: KvClient,
    runtime_rx: watch::Receiver<Arc<crate::effects::EffectRuntime>>,
    publisher: JetStreamPublisher,
    metrics: Arc<Metrics>,
    tenant_gate: Arc<TenantGate>,
    tenant: String,
    shutdown: Arc<tokio::sync::Notify>,
    stop: Arc<tokio::sync::Notify>,
    draining: Arc<AtomicBool>,
) -> Result<(), RunnerError> {
    // Durable name embeds the tenant so each tenant keeps its own
    // delivery cursor.
    let durable_name = format!("{}_effects_{}", settings.consumer_durable_prefix, tenant);
    let filter_subject = format!("tenant.{}.effect.*.*", tenant);

    let consumer = jetstream
        .effect_command_consumer(
            &settings,
            ConsumerOptions {
                durable_name,
                filter_subject,
                deliver_policy: DeliverPolicy::All,
            },
        )
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string()))?;

    let mut messages = consumer
        .messages()
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string()))?;

    loop {
        // Back off (50ms poll) while the tenant gate declines new work
        // for this tenant, staying responsive to shutdown/stop.
        if !tenant_gate.should_acquire_processing_work(&tenant, draining.load(Ordering::Relaxed)) {
            tokio::select! {
                _ = shutdown.notified() => break,
                _ = stop.notified() => break,
                _ = tokio::time::sleep(Duration::from_millis(50)) => continue,
            };
        }

        // Global drain: stop pulling new messages.
        if draining.load(Ordering::Relaxed) {
            tokio::select! {
                _ = shutdown.notified() => break,
                _ = stop.notified() => break,
                _ = tokio::time::sleep(Duration::from_millis(50)) => continue,
            };
        }

        let next = tokio::select! {
            _ = shutdown.notified() => break,
            _ = stop.notified() => break,
            msg = messages.next() => msg,
        };

        // Stream ended => consumer closed; exit cleanly.
        let Some(msg) = next else { break };
        let msg = match msg {
            Ok(m) => m,
            Err(e) => {
                tracing::error!(error = %e, "JetStream message stream error");
                continue;
            }
        };

        // Snapshot the most recently reloaded runtime for this message.
        let runtime = runtime_rx.borrow().clone();
        handle_effect_message(
            &settings,
            &storage,
            runtime,
            &publisher,
            &metrics,
            &tenant_gate,
            draining.load(Ordering::Relaxed),
            msg,
        )
        .await;
    }

    Ok(())
}
|
||||
|
||||
/// Decode, gate, and process a single JetStream effect message,
/// acking/naking according to the outcome.
#[allow(clippy::too_many_arguments)]
async fn handle_effect_message(
    settings: &Settings,
    storage: &KvClient,
    runtime: Arc<crate::effects::EffectRuntime>,
    publisher: &JetStreamPublisher,
    metrics: &Metrics,
    tenant_gate: &TenantGate,
    global_draining: bool,
    msg: async_nats::jetstream::Message,
) {
    let info = match msg.info() {
        Ok(i) => i,
        Err(e) => {
            // Without delivery info we cannot reason about redelivery
            // counts; ack to drop the message rather than loop on it.
            tracing::error!(error = %e, "Failed to parse JetStream message info");
            let _ = msg.ack().await;
            return;
        }
    };
    let delivered = info.delivered.max(0) as u64;

    let cmd: EffectCommandEnvelope = match serde_json::from_slice(&msg.payload) {
        Ok(c) => c,
        Err(e) => {
            tracing::error!(error = %e, "Failed to decode effect command envelope");
            // Poison message: once max_deliver is exhausted, write a
            // dead-letter record and Term-ack so it is never redelivered.
            if delivered >= settings.max_deliver.max(1) as u64 {
                // NOTE(review): the tenant segment of this key is a
                // hard-coded "" because the envelope failed to decode;
                // confirm whether the tenant could instead be recovered
                // from the message subject.
                let key = format!(
                    "deadletter:{}:effect_decode:{}",
                    "",
                    Utc::now().timestamp_millis()
                );
                let record = serde_json::json!({
                    "reason": "decode_error",
                    "delivered": delivered,
                    "payload": Value::Null,
                    "timestamp": Utc::now(),
                });
                let _ = storage.put_deadletter(&key, &record);
                metrics.inc_deadletter_written();
                let _ = msg.ack_with(AckKind::Term).await;
            }
            // Below max_deliver: neither ack nor nak — redelivery relies
            // on the consumer's ack timeout.
            return;
        }
    };

    // Tenant not currently accepting work here: nak with a short delay
    // so the message is retried later (possibly elsewhere).
    if !tenant_gate.should_acquire_processing_work(cmd.tenant_id.as_str(), global_draining) {
        let _ = msg
            .ack_with(AckKind::Nak(Some(Duration::from_millis(250))))
            .await;
        return;
    }
    // Work token held for the duration of processing — presumably a
    // guard released on drop; see TenantGate.
    let _work = tenant_gate.begin_work(cmd.tenant_id.as_str());

    match process_effect_command(settings, storage, runtime.as_ref(), publisher, metrics, cmd).await
    {
        Ok(ProcessDecision::Ack) => {
            // Crash-injection hook: panic after dedupe-mark but before
            // ack, for at-least-once delivery tests.
            if settings.test_effect_crash_after_dedupe_before_ack {
                panic!("test_effect_crash_after_dedupe_before_ack");
            }
            let _ = msg.ack().await;
        }
        Err(e) => {
            // No ack on failure: the message will be redelivered after
            // the ack timeout elapses.
            tracing::error!(error = %e, "Effect processing failed");
        }
    }
}
|
||||
|
||||
/// Outcome of successfully processing an effect command. Currently the
/// only decision is to ack the message; failures are conveyed via `Err`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ProcessDecision {
    Ack,
}
|
||||
|
||||
/// Abstraction over publishing effect results, so tests can substitute
/// a fake publisher for JetStream.
trait EffectResultPublisher: Send + Sync {
    fn publish(
        &self,
        subject: String,
        result: EffectResultEnvelope,
    ) -> BoxFuture<'static, Result<(), RunnerError>>;
}
|
||||
|
||||
/// Production [`EffectResultPublisher`] backed by a JetStream client.
#[derive(Clone)]
struct JetStreamPublisher {
    jetstream: JetStreamClient,
}

impl JetStreamPublisher {
    /// Wrap an existing JetStream client.
    fn new(jetstream: JetStreamClient) -> Self {
        Self { jetstream }
    }
}
|
||||
|
||||
impl EffectResultPublisher for JetStreamPublisher {
    fn publish(
        &self,
        subject: String,
        result: EffectResultEnvelope,
    ) -> BoxFuture<'static, Result<(), RunnerError>> {
        // Clone the client into the future so it can outlive `&self`.
        let jetstream = self.jetstream.clone();
        Box::pin(async move { jetstream.publish_effect_result(subject, &result).await })
    }
}
|
||||
|
||||
async fn process_effect_command(
|
||||
settings: &Settings,
|
||||
storage: &KvClient,
|
||||
runtime: &crate::effects::EffectRuntime,
|
||||
publisher: &dyn EffectResultPublisher,
|
||||
metrics: &Metrics,
|
||||
cmd: EffectCommandEnvelope,
|
||||
) -> Result<ProcessDecision, RunnerError> {
|
||||
let dedupe_key = DedupeEffectKey::new(&cmd.tenant_id, &cmd.command_id);
|
||||
if storage.is_deduped_effect(&dedupe_key)? {
|
||||
return Ok(ProcessDecision::Ack);
|
||||
}
|
||||
|
||||
let timeout = Duration::from_millis(settings.effect_timeout_ms.max(1));
|
||||
let mut last_error = None;
|
||||
|
||||
let mut attempt = 0usize;
|
||||
while attempt < settings.effect_retry_max_attempts.max(1) {
|
||||
attempt += 1;
|
||||
let exec = tokio::time::timeout(timeout, runtime.execute(cmd.clone())).await;
|
||||
match exec {
|
||||
Ok(Ok(result)) => {
|
||||
metrics.inc_effect_exec_success();
|
||||
return publish_and_mark(storage, publisher, metrics, cmd, result).await;
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
metrics.inc_effect_exec_failed();
|
||||
last_error = Some(e.to_string());
|
||||
}
|
||||
Err(_) => {
|
||||
metrics.inc_effect_exec_timeout();
|
||||
last_error = Some("timeout".to_string());
|
||||
}
|
||||
}
|
||||
|
||||
if attempt < settings.effect_retry_max_attempts.max(1) {
|
||||
let backoff_ms = settings.effect_retry_backoff_ms.max(1) * (1u64 << (attempt - 1));
|
||||
tokio::time::sleep(Duration::from_millis(backoff_ms)).await;
|
||||
}
|
||||
}
|
||||
|
||||
let result = EffectResultEnvelope {
|
||||
tenant_id: cmd.tenant_id.clone(),
|
||||
command_id: cmd.command_id.clone(),
|
||||
effect_name: cmd.effect_name.clone(),
|
||||
result_type: EffectResultType::Failed,
|
||||
payload: serde_json::json!({ "error": last_error.unwrap_or_else(|| "failed".to_string()) }),
|
||||
timestamp: Utc::now(),
|
||||
metadata: cmd.metadata.clone(),
|
||||
};
|
||||
|
||||
publish_and_mark(storage, publisher, metrics, cmd, result).await
|
||||
}
|
||||
|
||||
async fn publish_and_mark(
|
||||
storage: &KvClient,
|
||||
publisher: &dyn EffectResultPublisher,
|
||||
metrics: &Metrics,
|
||||
cmd: EffectCommandEnvelope,
|
||||
mut result: EffectResultEnvelope,
|
||||
) -> Result<ProcessDecision, RunnerError> {
|
||||
if result.metadata.correlation_id.is_none() {
|
||||
result.metadata.correlation_id = cmd.metadata.correlation_id.clone();
|
||||
}
|
||||
if result.metadata.trace_id.is_none() {
|
||||
result.metadata.trace_id = cmd.metadata.trace_id.clone();
|
||||
}
|
||||
|
||||
let subject = format!(
|
||||
"tenant.{}.effect_result.{}.{}",
|
||||
cmd.tenant_id.as_str(),
|
||||
cmd.effect_name.as_str(),
|
||||
cmd.command_id.as_str()
|
||||
);
|
||||
|
||||
if let Err(e) = publisher.publish(subject, result).await {
|
||||
metrics.inc_effect_publish_failed();
|
||||
return Err(e);
|
||||
}
|
||||
let dedupe_key = DedupeEffectKey::new(&cmd.tenant_id, &cmd.command_id);
|
||||
storage.mark_deduped_effect(&dedupe_key)?;
|
||||
Ok(ProcessDecision::Ack)
|
||||
}
|
||||
|
||||
fn build_registry(
|
||||
manifest: &EffectsManifest,
|
||||
) -> Result<(HashMap<String, String>, ProviderRegistry), RunnerError> {
|
||||
let mut registry = ProviderRegistry::new();
|
||||
type ProviderFactory =
|
||||
Arc<dyn Fn(Value) -> Result<Arc<dyn EffectProvider>, RunnerError> + Send + Sync + 'static>;
|
||||
|
||||
let mut factories: HashMap<String, ProviderFactory> = HashMap::new();
|
||||
factories.insert(
|
||||
"noop".to_string(),
|
||||
Arc::new(|_cfg| Ok(Arc::new(NoopProvider))),
|
||||
);
|
||||
factories.insert(
|
||||
"email".to_string(),
|
||||
Arc::new(|cfg| {
|
||||
let provider = super::providers::email::EmailProvider::from_config_value(cfg)?;
|
||||
Ok(Arc::new(provider))
|
||||
}),
|
||||
);
|
||||
|
||||
let mut effect_to_provider = HashMap::new();
|
||||
for def in &manifest.effects {
|
||||
let factory = factories.get(&def.provider).ok_or_else(|| {
|
||||
RunnerError::RuntimeError(format!("Unknown effect provider: {}", def.provider))
|
||||
})?;
|
||||
let provider_instance = factory(def.config.clone())?;
|
||||
let provider_name = format!("{}__{}", def.provider, def.name);
|
||||
registry.register(provider_name.clone(), provider_instance);
|
||||
effect_to_provider.insert(def.name.clone(), provider_name);
|
||||
}
|
||||
|
||||
Ok((effect_to_provider, registry))
|
||||
}
|
||||
|
||||
fn load_manifest(settings: &Settings, storage: &KvClient) -> Result<EffectsManifest, RunnerError> {
|
||||
if let Some(m) = storage.get_effects_manifest_override()? {
|
||||
return Ok(m);
|
||||
}
|
||||
let manifest = EffectsManifest::from_file(&settings.effects_manifest_path)
|
||||
.map_err(|e| RunnerError::DecodeError(e.to_string()))?;
|
||||
manifest
|
||||
.validate()
|
||||
.map_err(|e| RunnerError::DecodeError(e.to_string()))?;
|
||||
Ok(manifest)
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct NoopProvider;
|
||||
|
||||
impl EffectProvider for NoopProvider {
|
||||
fn execute(
|
||||
&self,
|
||||
cmd: EffectCommandEnvelope,
|
||||
) -> BoxFuture<'static, Result<EffectResultEnvelope, RunnerError>> {
|
||||
Box::pin(async move {
|
||||
Ok(EffectResultEnvelope {
|
||||
tenant_id: cmd.tenant_id,
|
||||
command_id: cmd.command_id,
|
||||
effect_name: cmd.effect_name,
|
||||
result_type: EffectResultType::Succeeded,
|
||||
payload: cmd.payload,
|
||||
timestamp: Utc::now(),
|
||||
metadata: cmd.metadata,
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::effects::EffectRuntime;
    use crate::types::{CommandId, EffectName, MessageMetadata, TenantId};
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Test provider that counts invocations and echoes the command
    /// back as a successful result.
    #[derive(Clone)]
    struct CountingProvider {
        calls: Arc<AtomicUsize>,
    }

    impl EffectProvider for CountingProvider {
        fn execute(
            &self,
            cmd: EffectCommandEnvelope,
        ) -> BoxFuture<'static, Result<EffectResultEnvelope, RunnerError>> {
            let calls = self.calls.clone();
            Box::pin(async move {
                calls.fetch_add(1, Ordering::Relaxed);
                Ok(EffectResultEnvelope {
                    tenant_id: cmd.tenant_id,
                    command_id: cmd.command_id,
                    effect_name: cmd.effect_name,
                    result_type: EffectResultType::Succeeded,
                    payload: cmd.payload,
                    timestamp: Utc::now(),
                    metadata: cmd.metadata,
                })
            })
        }
    }

    /// Publisher stub that records publish attempts and optionally
    /// fails every publish.
    #[derive(Clone)]
    struct FakePublisher {
        fail: bool,
        published: Arc<AtomicUsize>,
    }

    impl EffectResultPublisher for FakePublisher {
        fn publish(
            &self,
            _subject: String,
            _result: EffectResultEnvelope,
        ) -> BoxFuture<'static, Result<(), RunnerError>> {
            let fail = self.fail;
            let published = self.published.clone();
            Box::pin(async move {
                // Count the attempt even when configured to fail.
                published.fetch_add(1, Ordering::Relaxed);
                if fail {
                    Err(RunnerError::StreamError("publish failed".to_string()))
                } else {
                    Ok(())
                }
            })
        }
    }

    // Runtime with a single "send_email" effect routed to the counting
    // provider.
    fn runtime_with_counting_provider(calls: Arc<AtomicUsize>) -> EffectRuntime {
        let mut registry = ProviderRegistry::new();
        registry.register("counting", Arc::new(CountingProvider { calls }));
        let mut map = HashMap::new();
        map.insert("send_email".to_string(), "counting".to_string());
        EffectRuntime::new(map, registry)
    }

    // Minimal settings: one attempt, short timeout and backoff.
    fn base_settings() -> Settings {
        Settings {
            effect_retry_max_attempts: 1,
            effect_timeout_ms: 1000,
            effect_retry_backoff_ms: 1,
            ..Default::default()
        }
    }

    /// A pre-marked dedupe key must short-circuit to Ack without ever
    /// invoking the provider.
    #[tokio::test]
    async fn idempotency_gate_prevents_double_execution_for_same_command_id() {
        let storage = KvClient::in_memory();
        let calls = Arc::new(AtomicUsize::new(0));
        let runtime = runtime_with_counting_provider(calls.clone());
        let publisher = FakePublisher {
            fail: false,
            published: Arc::new(AtomicUsize::new(0)),
        };
        let metrics = Metrics::default();
        let settings = base_settings();

        let cmd = EffectCommandEnvelope {
            tenant_id: TenantId::new("t1"),
            command_id: CommandId::new("c1"),
            effect_name: EffectName::new("send_email"),
            payload: serde_json::json!({"a": 1}),
            metadata: MessageMetadata::default(),
        };

        // Simulate a previously completed command.
        let dedupe_key = DedupeEffectKey::new(&cmd.tenant_id, &cmd.command_id);
        storage.mark_deduped_effect(&dedupe_key).unwrap();

        let decision =
            process_effect_command(&settings, &storage, &runtime, &publisher, &metrics, cmd)
                .await
                .unwrap();
        assert_eq!(decision, ProcessDecision::Ack);
        // Provider never ran: the gate fired before execution.
        assert_eq!(calls.load(Ordering::Relaxed), 0);
    }

    /// When publishing the result fails, the dedupe marker must NOT be
    /// written, so the command is re-executed on redelivery.
    #[tokio::test]
    async fn result_publish_failure_does_not_mark_command_as_completed() {
        let storage = KvClient::in_memory();
        let calls = Arc::new(AtomicUsize::new(0));
        let runtime = runtime_with_counting_provider(calls.clone());
        let published = Arc::new(AtomicUsize::new(0));
        let publisher = FakePublisher {
            fail: true,
            published: published.clone(),
        };
        let metrics = Metrics::default();
        let settings = base_settings();

        let cmd = EffectCommandEnvelope {
            tenant_id: TenantId::new("t1"),
            command_id: CommandId::new("c1"),
            effect_name: EffectName::new("send_email"),
            payload: serde_json::json!({"a": 1}),
            metadata: MessageMetadata::default(),
        };

        let res = process_effect_command(
            &settings,
            &storage,
            &runtime,
            &publisher,
            &metrics,
            cmd.clone(),
        )
        .await;
        assert!(res.is_err());

        // No dedupe marker, exactly one publish attempt, one execution.
        let dedupe_key = DedupeEffectKey::new(&cmd.tenant_id, &cmd.command_id);
        assert!(!storage.is_deduped_effect(&dedupe_key).unwrap());
        assert_eq!(published.load(Ordering::Relaxed), 1);
        assert_eq!(calls.load(Ordering::Relaxed), 1);
    }
}
|
||||
198
runner/src/gateway/mod.rs
Normal file
198
runner/src/gateway/mod.rs
Normal file
@@ -0,0 +1,198 @@
|
||||
/// gRPC metadata header carrying the tenant id.
pub const TENANT_ID_METADATA_KEY: &str = "x-tenant-id";
/// gRPC metadata header carrying the request correlation id.
pub const CORRELATION_ID_METADATA_KEY: &str = "x-correlation-id";
/// W3C trace-context header name (forwarded verbatim when present).
pub const TRACEPARENT_METADATA_KEY: &str = "traceparent";

/// Generated gRPC bindings for the `aggregate.gateway.v1` proto package.
pub mod proto {
    tonic::include_proto!("aggregate.gateway.v1");
}
|
||||
|
||||
/// Thin wrapper around the generated `CommandServiceClient` that attaches
/// tenant / correlation / trace metadata headers before each call.
#[derive(Clone)]
pub struct GatewayClient {
    // Underlying generated client over a shared tonic channel.
    inner: proto::command_service_client::CommandServiceClient<tonic::transport::Channel>,
}
|
||||
|
||||
// Manual Debug impl that hides the inner client/channel details and prints
// only `GatewayClient { .. }`.
impl std::fmt::Debug for GatewayClient {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("GatewayClient").finish_non_exhaustive()
    }
}
|
||||
|
||||
impl GatewayClient {
|
||||
pub async fn connect(endpoint: &str) -> Result<Self, crate::types::RunnerError> {
|
||||
let channel = tonic::transport::Endpoint::from_shared(endpoint.to_string())
|
||||
.map_err(|e| crate::types::RunnerError::RuntimeError(e.to_string()))?
|
||||
.connect()
|
||||
.await
|
||||
.map_err(|e| crate::types::RunnerError::RuntimeError(e.to_string()))?;
|
||||
let inner = proto::command_service_client::CommandServiceClient::new(channel);
|
||||
Ok(Self { inner })
|
||||
}
|
||||
|
||||
pub async fn submit_command(
|
||||
&mut self,
|
||||
request: proto::SubmitCommandRequest,
|
||||
) -> Result<proto::SubmitCommandResponse, tonic::Status> {
|
||||
let mut grpc_request = tonic::Request::new(request);
|
||||
|
||||
let tenant_id = grpc_request.get_ref().tenant_id.as_str();
|
||||
if !tenant_id.is_empty() {
|
||||
let value = tonic::metadata::MetadataValue::try_from(tenant_id).map_err(|e| {
|
||||
tonic::Status::invalid_argument(format!("invalid tenant_id metadata: {}", e))
|
||||
})?;
|
||||
grpc_request
|
||||
.metadata_mut()
|
||||
.insert(TENANT_ID_METADATA_KEY, value);
|
||||
}
|
||||
|
||||
let correlation_id = grpc_request
|
||||
.get_ref()
|
||||
.metadata
|
||||
.get("x-correlation-id")
|
||||
.or_else(|| grpc_request.get_ref().metadata.get("correlation_id"))
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(|s| s.to_string());
|
||||
if let Some(correlation_id) = correlation_id {
|
||||
let value =
|
||||
tonic::metadata::MetadataValue::try_from(correlation_id.as_str()).map_err(|e| {
|
||||
tonic::Status::invalid_argument(format!(
|
||||
"invalid correlation_id metadata: {}",
|
||||
e
|
||||
))
|
||||
})?;
|
||||
grpc_request
|
||||
.metadata_mut()
|
||||
.insert(CORRELATION_ID_METADATA_KEY, value);
|
||||
}
|
||||
|
||||
let traceparent = grpc_request
|
||||
.get_ref()
|
||||
.metadata
|
||||
.get("traceparent")
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(|s| s.to_string())
|
||||
.or_else(|| {
|
||||
grpc_request
|
||||
.get_ref()
|
||||
.metadata
|
||||
.get("trace_id")
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| s.len() == 32 && s.chars().all(|c| c.is_ascii_hexdigit()))
|
||||
.map(|trace_id| {
|
||||
let span_id = uuid::Uuid::new_v4().simple().to_string()[..16].to_string();
|
||||
format!("00-{trace_id}-{span_id}-01")
|
||||
})
|
||||
});
|
||||
if let Some(traceparent) = traceparent {
|
||||
let value =
|
||||
tonic::metadata::MetadataValue::try_from(traceparent.as_str()).map_err(|e| {
|
||||
tonic::Status::invalid_argument(format!("invalid traceparent metadata: {}", e))
|
||||
})?;
|
||||
grpc_request
|
||||
.metadata_mut()
|
||||
.insert(TRACEPARENT_METADATA_KEY, value);
|
||||
}
|
||||
|
||||
let resp = self.inner.submit_command(grpc_request).await?;
|
||||
Ok(resp.into_inner())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Unit check of the trace_id -> traceparent derivation format used by
    /// `submit_command` (W3C: `00-<32 hex trace id>-<16 hex span id>-01`).
    #[test]
    fn traceparent_is_derived_from_trace_id_when_present() {
        let req = proto::SubmitCommandRequest {
            tenant_id: "t1".to_string(),
            command_id: "c1".to_string(),
            aggregate_id: "a1".to_string(),
            aggregate_type: "User".to_string(),
            payload_json: "{}".to_string(),
            metadata: std::collections::HashMap::from([(
                "trace_id".to_string(),
                "0123456789abcdef0123456789abcdef".to_string(),
            )]),
        };
        let trace_id = req.metadata.get("trace_id").unwrap().as_str();
        let span_id = uuid::Uuid::new_v4().simple().to_string()[..16].to_string();
        let traceparent = format!("00-{trace_id}-{span_id}-01");
        assert!(traceparent.starts_with("00-0123456789abcdef0123456789abcdef-"));
        assert!(traceparent.ends_with("-01"));
    }

    #[tokio::test]
    async fn submit_command_propagates_correlation_and_traceparent_metadata_when_present() {
        use proto::command_service_server::CommandService;

        // Upstream stub that fails the RPC unless the expected gRPC metadata
        // headers were attached by the client under test.
        #[derive(Default)]
        struct Upstream;

        #[tonic::async_trait]
        impl CommandService for Upstream {
            async fn submit_command(
                &self,
                request: tonic::Request<proto::SubmitCommandRequest>,
            ) -> Result<tonic::Response<proto::SubmitCommandResponse>, tonic::Status> {
                let correlation = request
                    .metadata()
                    .get("x-correlation-id")
                    .and_then(|v| v.to_str().ok())
                    .unwrap_or("");
                if correlation != "corr-1" {
                    return Err(tonic::Status::failed_precondition("missing correlation"));
                }

                let traceparent = request
                    .metadata()
                    .get("traceparent")
                    .and_then(|v| v.to_str().ok())
                    .unwrap_or("");
                if traceparent != "00-0123456789abcdef0123456789abcdef-1111111111111111-01" {
                    return Err(tonic::Status::failed_precondition("missing traceparent"));
                }

                Ok(tonic::Response::new(proto::SubmitCommandResponse {
                    events: vec![],
                }))
            }
        }

        // NOTE(review): binding then dropping the listener to reserve a port
        // is inherently racy (another process may grab the port before the
        // server rebinds). Serving from the bound listener directly would
        // remove the race but needs an incoming-stream adapter; kept as-is
        // with a connect-retry below instead of a fixed sleep.
        let upstream_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
        let upstream_addr = upstream_listener.local_addr().unwrap();
        drop(upstream_listener);
        tokio::spawn(async move {
            tonic::transport::Server::builder()
                .add_service(proto::command_service_server::CommandServiceServer::new(
                    Upstream,
                ))
                .serve(upstream_addr)
                .await
                .unwrap();
        });

        // Retry connecting instead of sleeping a fixed 50ms, so the test does
        // not flake when the server needs longer to start listening.
        let endpoint = format!("http://{}", upstream_addr);
        let mut client = None;
        for _ in 0..100 {
            match GatewayClient::connect(&endpoint).await {
                Ok(c) => {
                    client = Some(c);
                    break;
                }
                Err(_) => tokio::time::sleep(std::time::Duration::from_millis(10)).await,
            }
        }
        let mut client = client.expect("upstream gRPC server never became reachable");

        let req = proto::SubmitCommandRequest {
            tenant_id: "t1".to_string(),
            command_id: "c1".to_string(),
            aggregate_id: "a1".to_string(),
            aggregate_type: "User".to_string(),
            payload_json: "{}".to_string(),
            metadata: std::collections::HashMap::from([
                ("correlation_id".to_string(), "corr-1".to_string()),
                (
                    "traceparent".to_string(),
                    "00-0123456789abcdef0123456789abcdef-1111111111111111-01".to_string(),
                ),
            ]),
        };

        client.submit_command(req).await.unwrap();
    }
}
|
||||
665
runner/src/http/mod.rs
Normal file
665
runner/src/http/mod.rs
Normal file
@@ -0,0 +1,665 @@
|
||||
use crate::config::Settings;
|
||||
use crate::effects::EffectsManifest;
|
||||
use crate::observability::Metrics;
|
||||
use crate::storage::KvClient;
|
||||
use crate::tenant_placement::TenantGate;
|
||||
use crate::types::TenantId;
|
||||
use axum::extract::{Path, State};
|
||||
use axum::http::StatusCode;
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::routing::{get, post};
|
||||
use axum::Json;
|
||||
use serde_json::json;
|
||||
use serde_json::Value;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Shared state handed (via `Arc`) to every HTTP handler in this module.
#[derive(Debug)]
pub struct AppState {
    /// Runtime configuration captured at startup.
    pub settings: Settings,
    // Process-wide drain flag; set by shutdown signal or POST /admin/drain.
    draining: Arc<AtomicBool>,
    // Per-tenant admission control (placement assignment + tenant draining).
    tenant_gate: Arc<TenantGate>,
    /// Metrics registry exported at GET /metrics.
    pub metrics: Arc<Metrics>,
    /// KV storage handle used for health probes, config, outbox inspection.
    pub storage: KvClient,
    // Notifier that tells workers config/manifest state changed.
    reload: Arc<tokio::sync::Notify>,
}
|
||||
|
||||
impl AppState {
    /// Bundles all shared handles into one state value for the router.
    pub fn new(
        settings: Settings,
        draining: Arc<AtomicBool>,
        tenant_gate: Arc<TenantGate>,
        metrics: Arc<Metrics>,
        storage: KvClient,
        reload: Arc<tokio::sync::Notify>,
    ) -> Self {
        Self {
            settings,
            draining,
            tenant_gate,
            metrics,
            storage,
            reload,
        }
    }

    /// True once process-wide draining has started (readiness then reports 503).
    pub fn is_draining(&self) -> bool {
        self.draining.load(Ordering::Relaxed)
    }

    /// Flips the process-wide drain flag; there is no way to un-drain.
    pub fn start_draining(&self) {
        self.draining.store(true, Ordering::Relaxed);
    }

    /// Wakes all tasks currently waiting on the reload notifier.
    // notify_waiters does not store a permit: only tasks already awaiting
    // the Notify are woken.
    pub fn notify_reload(&self) {
        self.reload.notify_waiters();
    }
}
|
||||
|
||||
/// Runs the HTTP admin/health server on `listener` until `shutdown` resolves.
///
/// # Panics
/// Panics if the axum serve loop itself returns an error.
pub async fn serve(
    listener: tokio::net::TcpListener,
    state: Arc<AppState>,
    shutdown: impl std::future::Future<Output = ()> + Send + 'static,
) {
    let app = router(state);
    axum::serve(listener, app)
        .with_graceful_shutdown(shutdown)
        .await
        .unwrap();
}
|
||||
|
||||
/// Builds the admin/health HTTP router.
///
/// Note: `/admin/config/:key` and `/admin/config/effects_manifest` coexist;
/// the static `effects_manifest` segment takes priority over the `:key`
/// capture for those paths.
fn router(state: Arc<AppState>) -> axum::Router {
    axum::Router::new()
        .route("/health", get(health))
        .route("/ready", get(ready))
        .route("/metrics", get(metrics))
        .route("/info", get(info))
        .route("/admin/drain", post(drain))
        .route("/admin/drain/status", get(drain_status))
        .route("/admin/reload", post(reload))
        .route("/admin/config/:key", get(get_config_value))
        .route("/admin/config/:key", post(set_config_value))
        .route("/admin/config/:key/delete", post(delete_config_value))
        .route("/admin/config/effects_manifest", get(get_effects_manifest))
        .route("/admin/config/effects_manifest", post(set_effects_manifest))
        .route(
            "/admin/config/effects_manifest/delete",
            post(clear_effects_manifest),
        )
        .route("/admin/replay", post(replay))
        .with_state(state)
}
|
||||
|
||||
/// GET /health — liveness of the two external dependencies.
///
/// Checks that storage accepts a write probe and that NATS is reachable.
/// NOTE(review): this opens a fresh NATS connection on every call (bounded
/// by a 1s timeout); frequent health polling pays that connection cost each
/// time — consider reusing a client if this becomes hot.
async fn health(State(state): State<Arc<AppState>>) -> Response {
    let storage_ok = state.storage.writable_probe().is_ok();
    let stream_ok = tokio::time::timeout(Duration::from_secs(1), async {
        async_nats::connect(&state.settings.nats_url)
            .await
            .map(|_| ())
    })
    .await
    // Outer Err = timeout; inner Err = connect failure. Both count as down.
    .is_ok_and(|r| r.is_ok());

    let ok = storage_ok && stream_ok;
    let status = if ok {
        StatusCode::OK
    } else {
        StatusCode::SERVICE_UNAVAILABLE
    };

    (
        status,
        Json(json!({ "ok": ok, "storage": storage_ok, "stream": stream_ok })),
    )
        .into_response()
}
|
||||
|
||||
/// Query parameters for GET /ready.
#[derive(Debug, serde::Deserialize)]
struct ReadyQuery {
    // Optional: when set, readiness is additionally gated on this tenant.
    #[serde(default)]
    tenant_id: Option<String>,
}

/// GET /ready — readiness probe.
///
/// Order of checks: process drain flag, then (optionally) per-tenant
/// admission via the tenant gate, then the full `/health` dependency check.
async fn ready(
    State(state): State<Arc<AppState>>,
    axum::extract::Query(q): axum::extract::Query<ReadyQuery>,
) -> Response {
    if state.is_draining() {
        return (
            StatusCode::SERVICE_UNAVAILABLE,
            Json(json!({ "ok": false, "draining": true })),
        )
            .into_response();
    }

    if let Some(tenant_id) = q.tenant_id {
        let tenant_id = tenant_id.trim().to_string();
        if tenant_id.is_empty() {
            return (
                StatusCode::BAD_REQUEST,
                Json(json!({ "ok": false, "error": "tenant_id required" })),
            )
                .into_response();
        }
        let accepting = state
            .tenant_gate
            .should_acquire_processing_work(&tenant_id, state.is_draining());
        if !accepting {
            // Report why the tenant is not being accepted.
            let assigned = state.tenant_gate.is_assigned(&tenant_id);
            let draining = state.tenant_gate.is_draining(&tenant_id);
            return (
                StatusCode::SERVICE_UNAVAILABLE,
                Json(json!({
                    "ok": false,
                    "tenant_id": tenant_id,
                    "accepting": false,
                    "assigned": assigned,
                    "draining_tenant": draining
                })),
            )
                .into_response();
        }
    }

    // Delegate to the health handler; any non-OK health answer is surfaced
    // as 503 here, forwarding health's own JSON body.
    let health = health(State(state.clone())).await;
    if health.status() != StatusCode::OK {
        return (StatusCode::SERVICE_UNAVAILABLE, health.into_body()).into_response();
    }

    (StatusCode::OK, Json(json!({ "ok": true }))).into_response()
}
|
||||
|
||||
/// GET /metrics — Prometheus text exposition.
///
/// Combines the registry export with gauges computed on demand from storage.
/// NOTE(review): outbox/schedule counts are capped at 50_000 scanned items,
/// so the gauges saturate rather than reflect true depth beyond that.
async fn metrics(State(state): State<Arc<AppState>>) -> impl IntoResponse {
    let draining = if state.is_draining() { 1 } else { 0 };
    let outbox_count = state
        .storage
        .list_outbox_all(50_000)
        .map(|v| v.len())
        .unwrap_or(0);

    // Clamp a (theoretically) negative timestamp to 0 before the u64 cast.
    let now_ms = chrono::Utc::now().timestamp_millis().max(0) as u64;
    let due_schedule_count = state
        .storage
        .scan_due_schedule_items_all(now_ms, 50_000)
        .map(|v| v.len())
        .unwrap_or(0);

    let body = format!(
        "{}runner_draining {}\nrunner_outbox_items {}\nrunner_schedule_due_items {}\n{}",
        state.metrics.export_prometheus(),
        draining,
        outbox_count,
        due_schedule_count,
        draining_metrics_snapshot(&state)
    );
    (StatusCode::OK, body)
}
|
||||
|
||||
/// GET /info — static service description plus current drain/placement state.
async fn info(State(state): State<Arc<AppState>>) -> impl IntoResponse {
    Json(json!({
        "service": "runner",
        "mode": format!("{:?}", state.settings.mode),
        "streams": {
            "aggregate_events": state.settings.aggregate_events_stream,
            "workflow_commands": state.settings.workflow_commands_stream,
            "workflow_events": state.settings.workflow_events_stream,
        },
        "draining": state.is_draining(),
        // Placement is "enabled" when the gate carries an assignment snapshot.
        "tenant_placement_enabled": state.tenant_gate.assigned_tenants_snapshot().is_some(),
    }))
}
|
||||
|
||||
/// Query parameters for POST /admin/drain and GET /admin/drain/status.
#[derive(Debug, serde::Deserialize)]
struct DrainQuery {
    // Absent on POST /admin/drain means "drain the whole process".
    #[serde(default)]
    tenant_id: Option<String>,
    // Optional bounded wait (ms) for the tenant drain to complete.
    #[serde(default)]
    wait_ms: Option<u64>,
}

/// POST /admin/drain — start draining the whole process (no tenant_id) or a
/// single tenant (tenant_id given, optionally waiting up to wait_ms).
async fn drain(
    State(state): State<Arc<AppState>>,
    axum::extract::Query(q): axum::extract::Query<DrainQuery>,
) -> Response {
    match q.tenant_id.as_deref() {
        None => {
            state.start_draining();
            (
                StatusCode::OK,
                Json(json!({ "ok": true, "draining": true })),
            )
                .into_response()
        }
        Some(tenant_id) => drain_tenant(state, tenant_id, q.wait_ms).await,
    }
}
|
||||
|
||||
/// GET /admin/drain/status?tenant_id=... — report per-tenant drain progress.
async fn drain_status(
    State(state): State<Arc<AppState>>,
    axum::extract::Query(q): axum::extract::Query<DrainQuery>,
) -> Response {
    // tenant_id is mandatory here (unlike POST /admin/drain, where its
    // absence means "drain the whole process").
    let Some(tenant_id) = q.tenant_id.as_deref() else {
        return (
            StatusCode::BAD_REQUEST,
            Json(json!({ "ok": false, "error": "tenant_id required" })),
        )
            .into_response();
    };
    tenant_drain_status(state, tenant_id).await
}
|
||||
|
||||
/// Starts draining a single tenant and optionally waits (up to `wait_ms`)
/// for its in-flight work and outbox to empty, then reports status.
async fn drain_tenant(state: Arc<AppState>, tenant_id: &str, wait_ms: Option<u64>) -> Response {
    let tenant_id = tenant_id.trim();
    if tenant_id.is_empty() {
        return (
            StatusCode::BAD_REQUEST,
            Json(json!({ "ok": false, "error": "tenant_id required" })),
        )
            .into_response();
    }

    state.tenant_gate.start_draining(tenant_id);
    // wait_ms == 0 (or absent) means fire-and-forget: report immediately.
    if let Some(wait_ms) = wait_ms.filter(|v| *v > 0) {
        let deadline = tokio::time::Instant::now() + Duration::from_millis(wait_ms);
        loop {
            let status = tenant_drain_state(&state, tenant_id);
            if status.drained {
                break;
            }
            if tokio::time::Instant::now() >= deadline {
                break;
            }
            // Block on the gate's in-flight counter for at most 250ms per
            // round (ignoring timeout errors), then re-poll; the extra 25ms
            // sleep keeps the loop from spinning on outbox-only backlog.
            let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
            let _ = state
                .tenant_gate
                .wait_inflight_zero(tenant_id, remaining.min(Duration::from_millis(250)))
                .await;
            tokio::time::sleep(Duration::from_millis(25)).await;
        }
    }

    tenant_drain_status(state, tenant_id).await
}
|
||||
|
||||
async fn tenant_drain_status(state: Arc<AppState>, tenant_id: &str) -> Response {
|
||||
let tenant_id = tenant_id.trim();
|
||||
if tenant_id.is_empty() {
|
||||
return (
|
||||
StatusCode::BAD_REQUEST,
|
||||
Json(json!({ "ok": false, "error": "tenant_id required" })),
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
|
||||
let status = tenant_drain_state(&state, tenant_id);
|
||||
let code = if status.drained {
|
||||
StatusCode::OK
|
||||
} else {
|
||||
StatusCode::ACCEPTED
|
||||
};
|
||||
|
||||
(
|
||||
code,
|
||||
Json(json!({
|
||||
"ok": true,
|
||||
"tenant_id": tenant_id,
|
||||
"draining_tenant": state.tenant_gate.is_draining(tenant_id),
|
||||
"assigned": state.tenant_gate.is_assigned(tenant_id),
|
||||
"in_flight": status.in_flight,
|
||||
"outbox_items": status.outbox_items,
|
||||
"drained": status.drained
|
||||
})),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
/// Point-in-time drain progress for one tenant.
struct TenantDrainState {
    // Messages currently being processed for the tenant.
    in_flight: usize,
    // Pending outbox entries under the tenant's key prefix (capped at 50k).
    outbox_items: usize,
    // True only when both counters are zero.
    drained: bool,
}

/// Samples in-flight and outbox counters for `tenant_id`.
// Storage errors are treated as "0 items"; a failing storage read therefore
// cannot block a drain from reporting complete.
fn tenant_drain_state(state: &AppState, tenant_id: &str) -> TenantDrainState {
    let in_flight = state.tenant_gate.inflight_count(tenant_id);
    let outbox_items = state
        .storage
        .list_outbox_prefix(&TenantId::new(tenant_id.to_string()), 50_000)
        .map(|v| v.len())
        .unwrap_or(0);
    TenantDrainState {
        in_flight,
        outbox_items,
        drained: in_flight == 0 && outbox_items == 0,
    }
}
|
||||
|
||||
fn draining_metrics_snapshot(state: &AppState) -> String {
|
||||
let mut buf = String::new();
|
||||
for tenant in state.tenant_gate.draining_tenants_snapshot() {
|
||||
let in_flight = state.tenant_gate.inflight_count(&tenant);
|
||||
let outbox_items = state
|
||||
.storage
|
||||
.list_outbox_prefix(&TenantId::new(tenant.clone()), 50_000)
|
||||
.map(|v| v.len())
|
||||
.unwrap_or(0);
|
||||
buf.push_str(&format!(
|
||||
"runner_tenant_draining_in_flight{{tenant_id=\"{}\"}} {}\n",
|
||||
tenant, in_flight
|
||||
));
|
||||
buf.push_str(&format!(
|
||||
"runner_tenant_draining_outbox_items{{tenant_id=\"{}\"}} {}\n",
|
||||
tenant, outbox_items
|
||||
));
|
||||
}
|
||||
buf
|
||||
}
|
||||
|
||||
async fn reload(State(state): State<Arc<AppState>>) -> impl IntoResponse {
|
||||
state.notify_reload();
|
||||
(StatusCode::OK, Json(json!({ "ok": true })))
|
||||
}
|
||||
|
||||
async fn get_config_value(State(state): State<Arc<AppState>>, Path(key): Path<String>) -> Response {
|
||||
if key.trim().is_empty() {
|
||||
return (
|
||||
StatusCode::BAD_REQUEST,
|
||||
Json(json!({ "ok": false, "error": "key required" })),
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
|
||||
match state.storage.get_config_value(&key) {
|
||||
Ok(Some(value)) => (
|
||||
StatusCode::OK,
|
||||
Json(json!({ "ok": true, "key": key, "value": value })),
|
||||
)
|
||||
.into_response(),
|
||||
Ok(None) => (
|
||||
StatusCode::NOT_FOUND,
|
||||
Json(json!({ "ok": false, "key": key, "error": "not found" })),
|
||||
)
|
||||
.into_response(),
|
||||
Err(e) => (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({ "ok": false, "error": e.to_string() })),
|
||||
)
|
||||
.into_response(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn set_config_value(
|
||||
State(state): State<Arc<AppState>>,
|
||||
Path(key): Path<String>,
|
||||
Json(value): Json<Value>,
|
||||
) -> Response {
|
||||
if key.trim().is_empty() {
|
||||
return (
|
||||
StatusCode::BAD_REQUEST,
|
||||
Json(json!({ "ok": false, "error": "key required" })),
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
|
||||
if let Err(e) = state.storage.put_config_value(&key, &value) {
|
||||
return (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({ "ok": false, "error": e.to_string() })),
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
|
||||
state.notify_reload();
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(json!({ "ok": true, "key": key, "stored": true })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn delete_config_value(
|
||||
State(state): State<Arc<AppState>>,
|
||||
Path(key): Path<String>,
|
||||
) -> Response {
|
||||
if key.trim().is_empty() {
|
||||
return (
|
||||
StatusCode::BAD_REQUEST,
|
||||
Json(json!({ "ok": false, "error": "key required" })),
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
|
||||
if let Err(e) = state.storage.delete_config_value(&key) {
|
||||
return (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({ "ok": false, "error": e.to_string() })),
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
state.notify_reload();
|
||||
(
|
||||
StatusCode::OK,
|
||||
Json(json!({ "ok": true, "key": key, "deleted": true })),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn get_effects_manifest(State(state): State<Arc<AppState>>) -> Response {
|
||||
let override_manifest = state.storage.get_effects_manifest_override();
|
||||
let override_manifest = match override_manifest {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
return (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({ "ok": false, "error": e.to_string() })),
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(m) = override_manifest {
|
||||
return (
|
||||
StatusCode::OK,
|
||||
Json(json!({ "ok": true, "source": "storage", "manifest": m })),
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
|
||||
let from_file = EffectsManifest::from_file(&state.settings.effects_manifest_path);
|
||||
match from_file {
|
||||
Ok(m) => (
|
||||
StatusCode::OK,
|
||||
Json(json!({ "ok": true, "source": "file", "manifest": m })),
|
||||
)
|
||||
.into_response(),
|
||||
Err(e) => (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({ "ok": false, "error": e.to_string() })),
|
||||
)
|
||||
.into_response(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn set_effects_manifest(
|
||||
State(state): State<Arc<AppState>>,
|
||||
Json(manifest): Json<EffectsManifest>,
|
||||
) -> Response {
|
||||
if let Err(e) = manifest.validate() {
|
||||
return (
|
||||
StatusCode::BAD_REQUEST,
|
||||
Json(json!({ "ok": false, "error": e })),
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
|
||||
if let Err(e) = state.storage.put_effects_manifest_override(&manifest) {
|
||||
return (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({ "ok": false, "error": e.to_string() })),
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
|
||||
state.notify_reload();
|
||||
(StatusCode::OK, Json(json!({ "ok": true, "stored": true }))).into_response()
|
||||
}
|
||||
|
||||
async fn clear_effects_manifest(State(state): State<Arc<AppState>>) -> Response {
|
||||
if let Err(e) = state.storage.clear_effects_manifest_override() {
|
||||
return (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({ "ok": false, "error": e.to_string() })),
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
state.notify_reload();
|
||||
(StatusCode::OK, Json(json!({ "ok": true, "cleared": true }))).into_response()
|
||||
}
|
||||
|
||||
/// How much saga state POST /admin/replay wipes before events are replayed.
#[derive(Debug, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
enum ReplayMode {
    // Only the stream checkpoint is removed.
    CheckpointOnly,
    // Checkpoint plus per-event dedupe markers (the default).
    CheckpointAndDedupe,
    // Checkpoint, dedupe, saga state, schedules, and the tenant's outbox.
    FullReset,
}

/// Request body for POST /admin/replay.
#[derive(Debug, serde::Deserialize)]
struct ReplayRequest {
    tenant_id: String,
    saga_name: String,
    #[serde(default = "default_replay_mode")]
    mode: ReplayMode,
    // Upper bound on keys deleted per prefix in one request.
    #[serde(default = "default_replay_max_keys")]
    max_keys: usize,
}

// serde default: replay wipes checkpoint + dedupe unless told otherwise.
fn default_replay_mode() -> ReplayMode {
    ReplayMode::CheckpointAndDedupe
}

// serde default for ReplayRequest::max_keys.
fn default_replay_max_keys() -> usize {
    100_000
}
|
||||
|
||||
/// POST /admin/replay — wipe saga state so events replay from scratch.
///
/// Always removes the checkpoint; additionally removes dedupe / saga /
/// schedule / outbox keys depending on `mode`. Responds with the number of
/// keys deleted by the mode-specific cleanup.
async fn replay(State(state): State<Arc<AppState>>, Json(req): Json<ReplayRequest>) -> Response {
    let tenant_id = req.tenant_id.trim();
    let saga_name = req.saga_name.trim();
    if tenant_id.is_empty() || saga_name.is_empty() {
        return (StatusCode::BAD_REQUEST, "tenant_id and saga_name required").into_response();
    }

    // NOTE(review): unlike the prefixes below, this one has no trailing ':'
    // and the limit is 1 — presumably the checkpoint is a single exact key;
    // confirm it cannot shadow a sibling saga whose name shares this prefix.
    // Its deletion result is ignored and not counted in `deleted_keys`.
    let checkpoint_prefix = format!("checkpoint:{}:{}", tenant_id, saga_name);
    let _ = state.storage.delete_prefix(&checkpoint_prefix, 1);

    let mut deleted = 0usize;
    match req.mode {
        ReplayMode::CheckpointOnly => {}
        ReplayMode::CheckpointAndDedupe => {
            let prefix = format!("dedupe:{}:event:{}:", tenant_id, saga_name);
            deleted += state
                .storage
                .delete_prefix(&prefix, req.max_keys)
                .unwrap_or(0);
        }
        ReplayMode::FullReset => {
            let dedupe_prefix = format!("dedupe:{}:event:{}:", tenant_id, saga_name);
            deleted += state
                .storage
                .delete_prefix(&dedupe_prefix, req.max_keys)
                .unwrap_or(0);
            let saga_prefix = format!("saga:{}:{}:", tenant_id, saga_name);
            deleted += state
                .storage
                .delete_prefix(&saga_prefix, req.max_keys)
                .unwrap_or(0);
            let schedule_prefix = format!("schedule:{}:{}:", tenant_id, saga_name);
            deleted += state
                .storage
                .delete_prefix(&schedule_prefix, req.max_keys)
                .unwrap_or(0);
            // NOTE(review): the outbox prefix is tenant-wide, not scoped to
            // saga_name — a full reset drops ALL of the tenant's pending
            // outbox entries; verify that is intended.
            let outbox_prefix = format!("outbox:{}:", tenant_id);
            deleted += state
                .storage
                .delete_prefix(&outbox_prefix, req.max_keys)
                .unwrap_or(0);
        }
    }

    (
        StatusCode::OK,
        Json(json!({
            "ok": true,
            "tenant_id": tenant_id,
            "saga_name": saga_name,
            "deleted_keys": deleted
        })),
    )
        .into_response()
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use axum::body::Body;
|
||||
use axum::http::Request;
|
||||
use tower::ServiceExt;
|
||||
|
||||
#[test]
|
||||
fn readiness_toggles_with_draining_flag() {
|
||||
let settings = Settings {
|
||||
nats_url: "nats://127.0.0.1:1".to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
let state = Arc::new(AppState::new(
|
||||
settings,
|
||||
Arc::new(AtomicBool::new(false)),
|
||||
Arc::new(TenantGate::new(None)),
|
||||
Arc::new(Metrics::default()),
|
||||
KvClient::in_memory(),
|
||||
Arc::new(tokio::sync::Notify::new()),
|
||||
));
|
||||
assert!(!state.is_draining());
|
||||
state.start_draining();
|
||||
assert!(state.is_draining());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn health_fails_when_storage_is_unwritable() {
|
||||
let settings = Settings {
|
||||
nats_url: "nats://127.0.0.1:1".to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let storage = KvClient::in_memory();
|
||||
storage.fail_next_txn();
|
||||
|
||||
let state = Arc::new(AppState::new(
|
||||
settings,
|
||||
Arc::new(AtomicBool::new(false)),
|
||||
Arc::new(TenantGate::new(None)),
|
||||
Arc::new(Metrics::default()),
|
||||
storage,
|
||||
Arc::new(tokio::sync::Notify::new()),
|
||||
));
|
||||
let app = router(state);
|
||||
|
||||
let resp = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/health")
|
||||
.body(Body::empty())
|
||||
.unwrap(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE);
|
||||
|
||||
let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
|
||||
.await
|
||||
.unwrap();
|
||||
let json: serde_json::Value = serde_json::from_slice(&body).unwrap();
|
||||
assert_eq!(json["storage"], false);
|
||||
}
|
||||
}
|
||||
15
runner/src/lib.rs
Normal file
15
runner/src/lib.rs
Normal file
@@ -0,0 +1,15 @@
|
||||
pub mod config;
|
||||
pub mod effects;
|
||||
pub mod gateway;
|
||||
pub mod http;
|
||||
pub mod observability;
|
||||
pub mod outbox;
|
||||
pub mod saga;
|
||||
pub mod schedule;
|
||||
pub mod storage;
|
||||
pub mod stream;
|
||||
pub mod tenant_placement;
|
||||
pub mod types;
|
||||
|
||||
pub use config::Settings;
|
||||
pub use types::*;
|
||||
315
runner/src/main.rs
Normal file
315
runner/src/main.rs
Normal file
@@ -0,0 +1,315 @@
|
||||
use runner::config::Settings;
|
||||
use runner::effects::run_effect_worker;
|
||||
use runner::http;
|
||||
use runner::observability::Observability;
|
||||
use runner::outbox::OutboxRelay;
|
||||
use runner::saga::{run_saga_worker, SagaPrograms, SagaRuntime};
|
||||
use runner::schedule::Scheduler;
|
||||
use runner::storage::KvClient;
|
||||
use runner::stream::JetStreamClient;
|
||||
use runner::tenant_placement::{start_tenant_filter, TenantGate};
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
match std::env::args().nth(1).as_deref() {
|
||||
Some("-h") | Some("--help") => {
|
||||
print_help();
|
||||
return;
|
||||
}
|
||||
Some("serve") | None => serve().await,
|
||||
Some(other) => {
|
||||
eprintln!("Unknown command: {}", other);
|
||||
print_help();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn serve() {
|
||||
let settings = load_settings();
|
||||
if let Err(e) = settings.validate() {
|
||||
eprintln!("Invalid configuration: {}", e);
|
||||
std::process::exit(2);
|
||||
}
|
||||
|
||||
let observability = Observability::default();
|
||||
observability.init();
|
||||
let metrics = observability.metrics();
|
||||
|
||||
tracing::info!(settings = ?settings, "Runner starting");
|
||||
|
||||
let shutdown = Arc::new(tokio::sync::Notify::new());
|
||||
let reload = Arc::new(tokio::sync::Notify::new());
|
||||
let draining = Arc::new(AtomicBool::new(false));
|
||||
|
||||
let storage = match KvClient::open(settings.storage_path.clone()) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
eprintln!("Failed to open storage: {}", e);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
let tenant_filter = match start_tenant_filter(&settings).await {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
tracing::error!(error = %e, "Failed to initialize tenant filter");
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let tenant_gate = Arc::new(TenantGate::new(tenant_filter.clone()));
|
||||
|
||||
let state = Arc::new(http::AppState::new(
|
||||
settings.clone(),
|
||||
draining.clone(),
|
||||
tenant_gate.clone(),
|
||||
metrics.clone(),
|
||||
storage.clone(),
|
||||
reload.clone(),
|
||||
));
|
||||
|
||||
let http_listener = tokio::net::TcpListener::bind(settings.http_addr.as_str())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let http_shutdown = shutdown.clone();
|
||||
let http_state = state.clone();
|
||||
let http_task = tokio::spawn(async move {
|
||||
http::serve(http_listener, http_state, async move {
|
||||
http_shutdown.notified().await
|
||||
})
|
||||
.await
|
||||
});
|
||||
|
||||
let signal_shutdown = shutdown.clone();
|
||||
let signal_draining = draining.clone();
|
||||
tokio::spawn(async move {
|
||||
#[cfg(unix)]
|
||||
{
|
||||
use tokio::signal::unix::{signal, SignalKind};
|
||||
let mut sigterm = signal(SignalKind::terminate()).ok();
|
||||
let mut sigint = signal(SignalKind::interrupt()).ok();
|
||||
tokio::select! {
|
||||
_ = tokio::signal::ctrl_c() => {},
|
||||
_ = async { if let Some(s) = &mut sigterm { let _ = s.recv().await; } } => {},
|
||||
_ = async { if let Some(s) = &mut sigint { let _ = s.recv().await; } } => {},
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(unix))]
|
||||
{
|
||||
let _ = tokio::signal::ctrl_c().await;
|
||||
}
|
||||
|
||||
signal_draining.store(true, Ordering::Relaxed);
|
||||
signal_shutdown.notify_waiters();
|
||||
});
|
||||
|
||||
let mut tasks = Vec::new();
|
||||
|
||||
match settings.mode {
|
||||
runner::config::RunnerMode::Saga => {
|
||||
let programs = Arc::new(match SagaPrograms::load(&settings) {
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
tracing::error!(error = %e, "Failed to load saga manifest/programs");
|
||||
std::process::exit(1);
|
||||
}
|
||||
});
|
||||
let saga_runtime = SagaRuntime::default();
|
||||
tasks.push(tokio::spawn(run_saga_worker(
|
||||
settings.clone(),
|
||||
storage.clone(),
|
||||
programs.clone(),
|
||||
saga_runtime.clone(),
|
||||
metrics.clone(),
|
||||
tenant_gate.clone(),
|
||||
tenant_filter.clone(),
|
||||
shutdown.clone(),
|
||||
draining.clone(),
|
||||
)));
|
||||
let outbox_settings = settings.clone();
|
||||
let outbox_storage = storage.clone();
|
||||
let outbox_shutdown = shutdown.clone();
|
||||
let outbox_draining = draining.clone();
|
||||
let outbox_metrics = metrics.clone();
|
||||
let outbox_tenant_gate = tenant_gate.clone();
|
||||
tasks.push(tokio::spawn(async move {
|
||||
let js = JetStreamClient::connect(&outbox_settings)
|
||||
.await
|
||||
.map_err(|e| runner::types::RunnerError::StreamError(e.to_string()))?;
|
||||
OutboxRelay
|
||||
.run(
|
||||
outbox_settings,
|
||||
outbox_storage,
|
||||
js,
|
||||
outbox_metrics,
|
||||
outbox_tenant_gate,
|
||||
outbox_shutdown,
|
||||
outbox_draining,
|
||||
)
|
||||
.await
|
||||
}));
|
||||
let scheduler_settings = settings.clone();
|
||||
let scheduler_storage = storage.clone();
|
||||
let scheduler_shutdown = shutdown.clone();
|
||||
let scheduler_draining = draining.clone();
|
||||
let scheduler_metrics = metrics.clone();
|
||||
let scheduler_tenant_gate = tenant_gate.clone();
|
||||
tasks.push(tokio::spawn(async move {
|
||||
Scheduler
|
||||
.run(
|
||||
scheduler_settings,
|
||||
scheduler_storage,
|
||||
programs,
|
||||
saga_runtime,
|
||||
scheduler_metrics,
|
||||
scheduler_tenant_gate,
|
||||
scheduler_shutdown,
|
||||
scheduler_draining,
|
||||
)
|
||||
.await
|
||||
}));
|
||||
}
|
||||
runner::config::RunnerMode::Effect => {
|
||||
tasks.push(tokio::spawn(run_effect_worker(
|
||||
settings.clone(),
|
||||
storage.clone(),
|
||||
metrics.clone(),
|
||||
tenant_gate.clone(),
|
||||
tenant_filter.clone(),
|
||||
reload.clone(),
|
||||
shutdown.clone(),
|
||||
draining.clone(),
|
||||
)));
|
||||
}
|
||||
runner::config::RunnerMode::Combined => {
|
||||
let programs = Arc::new(match SagaPrograms::load(&settings) {
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
tracing::error!(error = %e, "Failed to load saga manifest/programs");
|
||||
std::process::exit(1);
|
||||
}
|
||||
});
|
||||
let saga_runtime = SagaRuntime::default();
|
||||
tasks.push(tokio::spawn(run_saga_worker(
|
||||
settings.clone(),
|
||||
storage.clone(),
|
||||
programs.clone(),
|
||||
saga_runtime.clone(),
|
||||
metrics.clone(),
|
||||
tenant_gate.clone(),
|
||||
tenant_filter.clone(),
|
||||
shutdown.clone(),
|
||||
draining.clone(),
|
||||
)));
|
||||
tasks.push(tokio::spawn(run_effect_worker(
|
||||
settings.clone(),
|
||||
storage.clone(),
|
||||
metrics.clone(),
|
||||
tenant_gate.clone(),
|
||||
tenant_filter.clone(),
|
||||
reload.clone(),
|
||||
shutdown.clone(),
|
||||
draining.clone(),
|
||||
)));
|
||||
let outbox_settings = settings.clone();
|
||||
let outbox_storage = storage.clone();
|
||||
let outbox_shutdown = shutdown.clone();
|
||||
let outbox_draining = draining.clone();
|
||||
let outbox_metrics = metrics.clone();
|
||||
let outbox_tenant_gate = tenant_gate.clone();
|
||||
tasks.push(tokio::spawn(async move {
|
||||
let js = JetStreamClient::connect(&outbox_settings)
|
||||
.await
|
||||
.map_err(|e| runner::types::RunnerError::StreamError(e.to_string()))?;
|
||||
OutboxRelay
|
||||
.run(
|
||||
outbox_settings,
|
||||
outbox_storage,
|
||||
js,
|
||||
outbox_metrics,
|
||||
outbox_tenant_gate,
|
||||
outbox_shutdown,
|
||||
outbox_draining,
|
||||
)
|
||||
.await
|
||||
}));
|
||||
let scheduler_settings = settings.clone();
|
||||
let scheduler_storage = storage.clone();
|
||||
let scheduler_shutdown = shutdown.clone();
|
||||
let scheduler_draining = draining.clone();
|
||||
let scheduler_metrics = metrics.clone();
|
||||
let scheduler_tenant_gate = tenant_gate.clone();
|
||||
tasks.push(tokio::spawn(async move {
|
||||
Scheduler
|
||||
.run(
|
||||
scheduler_settings,
|
||||
scheduler_storage,
|
||||
programs,
|
||||
saga_runtime,
|
||||
scheduler_metrics,
|
||||
scheduler_tenant_gate,
|
||||
scheduler_shutdown,
|
||||
scheduler_draining,
|
||||
)
|
||||
.await
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
let mut failed = None;
|
||||
for task in tasks {
|
||||
match task.await {
|
||||
Ok(Ok(())) => {}
|
||||
Ok(Err(e)) => {
|
||||
failed = Some(e);
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
failed = Some(runner::types::RunnerError::RuntimeError(e.to_string()));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
draining.store(true, Ordering::Relaxed);
|
||||
shutdown.notify_waiters();
|
||||
let _ = http_task.await;
|
||||
|
||||
if let Some(e) = failed {
|
||||
tracing::error!(error = %e, "Runner terminated with error");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
/// Prints CLI usage text to stdout.
fn print_help() {
    // Kept as a single literal so the layout is easy to audit at a glance.
    const HELP: &str = "runner\n\nUSAGE:\n    runner [COMMAND]\n\nCOMMANDS:\n    serve    Start the HTTP server (default)\n\nOPTIONS:\n    -h, --help    Print help\n";
    println!("{}", HELP);
}
|
||||
|
||||
fn load_settings() -> Settings {
|
||||
if let Ok(path) = std::env::var("RUNNER_CONFIG_PATH") {
|
||||
if let Ok(settings) = Settings::load_from_file_with_env_overrides(path) {
|
||||
return settings;
|
||||
}
|
||||
}
|
||||
|
||||
Settings::from_env().unwrap_or_default()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    // Smoke test: default settings expose the expected stream name and at
    // least one wildcard saga trigger filter.
    #[test]
    fn test_harness_runs() {
        let settings = runner::Settings::default();
        assert_eq!(settings.aggregate_events_stream, "AGGREGATE_EVENTS");
        assert!(settings
            .saga_trigger_subject_filters
            .iter()
            .any(|s| s == "tenant.*.aggregate.*.*"));
    }
}
|
||||
220
runner/src/observability/mod.rs
Normal file
220
runner/src/observability/mod.rs
Normal file
@@ -0,0 +1,220 @@
|
||||
use edge_logger_client::{Config as EdgeLoggerConfig, EdgeLoggerLayer};
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tracing_subscriber::prelude::*;
|
||||
|
||||
/// Bundles the service identity and the process-wide metrics registry;
/// cloning shares the same underlying `Metrics` via `Arc`.
#[derive(Clone)]
pub struct Observability {
    // Reported to the logging backend; see `edge_logger_layer_from_env`.
    service_name: String,
    // Shared counters; handed out via `metrics()`.
    metrics: Arc<Metrics>,
}
|
||||
|
||||
// Manual Debug: fields are elided (prints `Observability { .. }`) since
// neither the metrics registry nor the name is useful in debug output.
impl std::fmt::Debug for Observability {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Observability").finish_non_exhaustive()
    }
}
|
||||
|
||||
impl Observability {
    /// Creates an instance for the given service name with a fresh,
    /// zeroed metrics registry.
    pub fn new(service_name: impl Into<String>) -> Self {
        Self {
            service_name: service_name.into(),
            metrics: Arc::new(Metrics::default()),
        }
    }

    /// Installs the global tracing subscriber: a JSON fmt layer filtered by
    /// `RUST_LOG` (default "info"), plus the edge-logger layer when enabled
    /// via environment variables.
    pub fn init(&self) {
        let filter = std::env::var("RUST_LOG").unwrap_or_else(|_| "info".to_string());
        let env_filter = tracing_subscriber::EnvFilter::new(filter);

        let fmt_layer = tracing_subscriber::fmt::layer().json();

        let edge_layer = edge_logger_layer_from_env(&self.service_name);

        let registry = tracing_subscriber::registry()
            .with(env_filter)
            .with(fmt_layer);
        // Result deliberately ignored: a second init (e.g. in tests) must not
        // panic when a global subscriber is already set.
        let _ = match edge_layer {
            Some(layer) => registry.with(layer).try_init(),
            None => registry.try_init(),
        };
    }

    /// The service name this instance reports under.
    pub fn service_name(&self) -> &str {
        &self.service_name
    }

    /// A shared handle to the metrics registry (cheap `Arc` clone).
    pub fn metrics(&self) -> Arc<Metrics> {
        self.metrics.clone()
    }
}
|
||||
|
||||
impl Default for Observability {
|
||||
fn default() -> Self {
|
||||
Self::new("runner")
|
||||
}
|
||||
}
|
||||
|
||||
/// Process-wide monotonic counters, exported in Prometheus text format by
/// `export_prometheus`. All counters start at zero (`Default`).
#[derive(Debug, Default)]
pub struct Metrics {
    // Saga worker counters.
    saga_events_processed_total: AtomicU64,
    saga_events_skipped_checkpoint_total: AtomicU64,
    saga_events_skipped_dedupe_total: AtomicU64,
    saga_commit_failed_total: AtomicU64,

    // Scheduler counters.
    schedule_processed_total: AtomicU64,
    schedule_failed_total: AtomicU64,

    // Outbox relay counters.
    outbox_dispatch_success_total: AtomicU64,
    outbox_dispatch_failed_total: AtomicU64,

    // Effect worker counters.
    effect_exec_success_total: AtomicU64,
    effect_exec_failed_total: AtomicU64,
    effect_exec_timeout_total: AtomicU64,
    effect_result_publish_failed_total: AtomicU64,

    // Dead-letter sink counter.
    deadletter_written_total: AtomicU64,
}
|
||||
|
||||
// All increments use `Ordering::Relaxed`: these are independent counters with
// no ordering relationship to other memory, so relaxed atomics suffice.
impl Metrics {
    /// Saga trigger event fully processed.
    pub fn inc_saga_processed(&self) {
        self.saga_events_processed_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Saga event skipped because a checkpoint says it was already handled.
    pub fn inc_saga_skipped_checkpoint(&self) {
        self.saga_events_skipped_checkpoint_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Saga event skipped by the dedupe filter.
    pub fn inc_saga_skipped_dedupe(&self) {
        self.saga_events_skipped_dedupe_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Saga state commit failed.
    pub fn inc_saga_commit_failed(&self) {
        self.saga_commit_failed_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Scheduled item processed.
    pub fn inc_schedule_processed(&self) {
        self.schedule_processed_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Scheduled item failed.
    pub fn inc_schedule_failed(&self) {
        self.schedule_failed_total.fetch_add(1, Ordering::Relaxed);
    }

    /// Outbox item dispatched and deleted successfully.
    pub fn inc_outbox_dispatch_success(&self) {
        self.outbox_dispatch_success_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Outbox dispatch attempt failed (item stays in the outbox).
    pub fn inc_outbox_dispatch_failed(&self) {
        self.outbox_dispatch_failed_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Effect executed successfully.
    pub fn inc_effect_exec_success(&self) {
        self.effect_exec_success_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Effect execution returned an error.
    pub fn inc_effect_exec_failed(&self) {
        self.effect_exec_failed_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Effect execution exceeded its timeout.
    pub fn inc_effect_exec_timeout(&self) {
        self.effect_exec_timeout_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Publishing an effect result failed.
    pub fn inc_effect_publish_failed(&self) {
        self.effect_result_publish_failed_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// An item was written to the dead-letter store.
    pub fn inc_deadletter_written(&self) {
        self.deadletter_written_total
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Renders all counters in Prometheus text exposition format, one
    /// `name value` line per counter. Reads are Relaxed snapshots, so the
    /// lines are not guaranteed to be mutually consistent.
    pub fn export_prometheus(&self) -> String {
        format!(
            "runner_saga_events_processed_total {}\nrunner_saga_events_skipped_checkpoint_total {}\nrunner_saga_events_skipped_dedupe_total {}\nrunner_saga_commit_failed_total {}\nrunner_schedule_processed_total {}\nrunner_schedule_failed_total {}\nrunner_outbox_dispatch_success_total {}\nrunner_outbox_dispatch_failed_total {}\nrunner_effect_exec_success_total {}\nrunner_effect_exec_failed_total {}\nrunner_effect_exec_timeout_total {}\nrunner_effect_result_publish_failed_total {}\nrunner_deadletter_written_total {}\n",
            self.saga_events_processed_total.load(Ordering::Relaxed),
            self.saga_events_skipped_checkpoint_total
                .load(Ordering::Relaxed),
            self.saga_events_skipped_dedupe_total
                .load(Ordering::Relaxed),
            self.saga_commit_failed_total.load(Ordering::Relaxed),
            self.schedule_processed_total.load(Ordering::Relaxed),
            self.schedule_failed_total.load(Ordering::Relaxed),
            self.outbox_dispatch_success_total.load(Ordering::Relaxed),
            self.outbox_dispatch_failed_total.load(Ordering::Relaxed),
            self.effect_exec_success_total.load(Ordering::Relaxed),
            self.effect_exec_failed_total.load(Ordering::Relaxed),
            self.effect_exec_timeout_total.load(Ordering::Relaxed),
            self.effect_result_publish_failed_total
                .load(Ordering::Relaxed),
            self.deadletter_written_total.load(Ordering::Relaxed),
        )
    }
}
|
||||
|
||||
/// Builds the optional edge-logger tracing layer from environment variables.
///
/// Returns `None` unless `EDGE_LOGGER_ENABLED` is truthy ("1"/"true"/"yes",
/// case-insensitive) or `EDGE_LOGGER_SOCKET_PATH` is set. Other knobs:
/// `EDGE_LOGGER_ENVIRONMENT` (falls back to `ENVIRONMENT`, then "production"),
/// `EDGE_LOGGER_TENANT_ID` (default "default"), `EDGE_LOGGER_BATCH_SIZE`
/// (default 100), `EDGE_LOGGER_FLUSH_INTERVAL_MS` (default 1s).
fn edge_logger_layer_from_env(service_name: &str) -> Option<EdgeLoggerLayer> {
    let enabled = std::env::var("EDGE_LOGGER_ENABLED")
        .ok()
        .map(|v| matches!(v.trim().to_ascii_lowercase().as_str(), "1" | "true" | "yes"))
        .unwrap_or(false);

    // A socket path alone also turns the layer on, so operators can enable it
    // with a single variable.
    let socket_path = std::env::var("EDGE_LOGGER_SOCKET_PATH").ok();
    if !enabled && socket_path.is_none() {
        return None;
    }

    let environment = std::env::var("EDGE_LOGGER_ENVIRONMENT")
        .or_else(|_| std::env::var("ENVIRONMENT"))
        .unwrap_or_else(|_| "production".to_string());

    let tenant_id =
        std::env::var("EDGE_LOGGER_TENANT_ID").unwrap_or_else(|_| "default".to_string());

    // Unparseable values fall back to the defaults rather than erroring out.
    let batch_size = std::env::var("EDGE_LOGGER_BATCH_SIZE")
        .ok()
        .and_then(|v| v.parse::<usize>().ok())
        .unwrap_or(100);

    let flush_interval = std::env::var("EDGE_LOGGER_FLUSH_INTERVAL_MS")
        .ok()
        .and_then(|v| v.parse::<u64>().ok())
        .map(Duration::from_millis)
        .unwrap_or(Duration::from_secs(1));

    Some(EdgeLoggerLayer::new(EdgeLoggerConfig {
        socket_path: socket_path.unwrap_or_else(|| "/var/run/edge-logger/logger.sock".to_string()),
        service: service_name.to_string(),
        environment,
        tenant_id,
        batch_size,
        flush_interval,
    }))
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // The Prometheus exporter must reflect counter increments in its output.
    #[test]
    fn metrics_exporter_emits_key_metrics() {
        let metrics = Metrics::default();
        metrics.inc_saga_processed();
        metrics.inc_outbox_dispatch_failed();
        let body = metrics.export_prometheus();
        assert!(body.contains("runner_saga_events_processed_total 1"));
        assert!(body.contains("runner_outbox_dispatch_failed_total 1"));
    }
}
|
||||
3
runner/src/outbox/mod.rs
Normal file
3
runner/src/outbox/mod.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
mod relay;
|
||||
|
||||
pub use relay::OutboxRelay;
|
||||
448
runner/src/outbox/relay.rs
Normal file
448
runner/src/outbox/relay.rs
Normal file
@@ -0,0 +1,448 @@
|
||||
use crate::config::Settings;
|
||||
use crate::gateway::{proto as gateway_proto, GatewayClient};
|
||||
use crate::observability::Metrics;
|
||||
use crate::storage::KvClient;
|
||||
use crate::stream::JetStreamClient;
|
||||
use crate::tenant_placement::TenantGate;
|
||||
use crate::types::{RunnerError, WorkItem};
|
||||
use futures::future::BoxFuture;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::{Mutex, Semaphore};
|
||||
|
||||
/// Zero-sized relay that scans the persisted outbox and dispatches pending
/// work items; all state lives in the arguments passed to `run`.
#[derive(Clone)]
pub struct OutboxRelay;
|
||||
|
||||
// Manual Debug printing `OutboxRelay { .. }`; the struct is zero-sized, so
// there is nothing else to show.
impl std::fmt::Debug for OutboxRelay {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("OutboxRelay").finish_non_exhaustive()
    }
}
|
||||
|
||||
impl OutboxRelay {
    /// Creates the (stateless) relay.
    pub fn new() -> Self {
        Self
    }

    /// Runs the relay with the production dispatcher until `shutdown` fires.
    ///
    /// Connects a gateway client only when `aggregate_gateway_url` is
    /// configured; effect commands always go through `jetstream`. Errors from
    /// the initial gateway connection abort the run.
    #[allow(clippy::too_many_arguments)]
    pub async fn run(
        &self,
        settings: Settings,
        storage: KvClient,
        jetstream: JetStreamClient,
        metrics: Arc<Metrics>,
        tenant_gate: Arc<TenantGate>,
        shutdown: Arc<tokio::sync::Notify>,
        draining: Arc<AtomicBool>,
    ) -> Result<(), RunnerError> {
        let gateway = if let Some(url) = settings.aggregate_gateway_url.clone() {
            Some(Arc::new(Mutex::new(
                GatewayClient::connect(&url)
                    .await
                    .map_err(|e| RunnerError::RuntimeError(e.to_string()))?,
            )))
        } else {
            None
        };

        let dispatcher = DefaultOutboxDispatcher { jetstream, gateway };
        self.run_with_dispatcher(
            settings,
            storage,
            Arc::new(dispatcher),
            metrics,
            tenant_gate,
            shutdown,
            draining,
        )
        .await
    }
}
|
||||
|
||||
impl Default for OutboxRelay {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Delivery side of the outbox, abstracted so tests can substitute a fake
/// for the JetStream/gateway-backed dispatcher.
pub trait OutboxDispatcher: Send + Sync {
    // Returns a 'static future so the dispatch can be spawned as a task.
    fn dispatch(&self, item: WorkItem) -> BoxFuture<'static, Result<(), RunnerError>>;
}
|
||||
|
||||
/// Production dispatcher: effect commands are published to JetStream;
/// aggregate commands are submitted through the gRPC gateway.
#[derive(Clone)]
struct DefaultOutboxDispatcher {
    jetstream: JetStreamClient,
    // None when no gateway URL is configured; dispatching an AggregateCommand
    // then fails with a runtime error.
    gateway: Option<Arc<Mutex<GatewayClient>>>,
}
|
||||
|
||||
impl OutboxDispatcher for DefaultOutboxDispatcher {
    /// Routes one work item: effect commands to JetStream, aggregate commands
    /// to the gateway. Clones the handles up front so the returned future is
    /// 'static and can outlive `self`.
    fn dispatch(&self, item: WorkItem) -> BoxFuture<'static, Result<(), RunnerError>> {
        let jetstream = self.jetstream.clone();
        let gateway = self.gateway.clone();
        Box::pin(async move {
            match item {
                WorkItem::EffectCommand(cmd) => jetstream.publish_effect_command(&cmd).await,
                WorkItem::AggregateCommand(cmd) => {
                    let Some(gateway) = gateway else {
                        return Err(RunnerError::RuntimeError(
                            "Aggregate gateway URL not configured".to_string(),
                        ));
                    };

                    // Gateway proto carries the payload as a JSON string.
                    let payload_json = serde_json::to_string(&cmd.payload_json)
                        .map_err(|e| RunnerError::DecodeError(e.to_string()))?;
                    let metadata = to_string_metadata(&cmd.metadata);

                    let req = gateway_proto::SubmitCommandRequest {
                        tenant_id: cmd.tenant_id.as_str().to_string(),
                        command_id: cmd.command_id.as_str().to_string(),
                        aggregate_id: cmd.aggregate_id,
                        aggregate_type: cmd.aggregate_type,
                        payload_json,
                        metadata,
                    };

                    // The client is behind a Mutex because gRPC clients need
                    // &mut self; the response body is not needed here.
                    let mut client = gateway.lock().await;
                    let _ = client
                        .submit_command(req)
                        .await
                        .map_err(|e| RunnerError::StreamError(e.to_string()))?;
                    Ok(())
                }
            }
        })
    }
}
|
||||
|
||||
impl OutboxRelay {
    /// Core relay loop, parameterized over the dispatcher for testability.
    ///
    /// Each tick scans the outbox and spawns one task per item, bounded by a
    /// global semaphore (`outbox_max_in_flight`) and a per-tenant semaphore
    /// (`outbox_max_in_flight_per_tenant`). Items are deleted only after a
    /// successful dispatch (at-least-once delivery); failures leave the item
    /// in place to be retried on a later scan. Exits when `shutdown` fires.
    ///
    /// NOTE(review): a dispatched item stays in the outbox until its task
    /// deletes it, so a scan racing a slow dispatch could pick the same item
    /// up again — presumably `list_outbox_all`/dedupe downstream tolerates
    /// this; confirm.
    #[allow(clippy::too_many_arguments)]
    async fn run_with_dispatcher(
        &self,
        settings: Settings,
        storage: KvClient,
        dispatcher: Arc<dyn OutboxDispatcher>,
        metrics: Arc<Metrics>,
        tenant_gate: Arc<TenantGate>,
        shutdown: Arc<tokio::sync::Notify>,
        draining: Arc<AtomicBool>,
    ) -> Result<(), RunnerError> {
        // Scan cadence; interval of 0 is clamped to 1ms.
        let mut tick = tokio::time::interval(Duration::from_millis(
            settings.outbox_scan_interval_ms.max(1),
        ));
        tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);

        let global = Arc::new(Semaphore::new(settings.outbox_max_in_flight));
        let per_tenant = Arc::new(Mutex::new(HashMap::<String, Arc<Semaphore>>::new()));
        let mut join_set = tokio::task::JoinSet::<Result<(), RunnerError>>::new();

        loop {
            // While draining with nothing in flight, idle-wait for shutdown
            // instead of scanning for new work.
            if draining.load(Ordering::Relaxed) && join_set.is_empty() {
                tokio::select! {
                    _ = shutdown.notified() => break,
                    _ = tokio::time::sleep(Duration::from_millis(50)) => {}
                };
            }

            // Only pull a new batch if there is spare capacity and we are not
            // draining.
            if join_set.len() < settings.outbox_max_in_flight && !draining.load(Ordering::Relaxed) {
                let batch = storage.list_outbox_all(settings.outbox_batch_size)?;
                if !batch.is_empty() {
                    for (key, item) in batch {
                        if join_set.len() >= settings.outbox_max_in_flight {
                            break;
                        }

                        let tenant_id = work_item_tenant_id(&item).unwrap_or_default();
                        // Placement/drain gate may veto dispatch for this
                        // tenant; skip (item stays in the outbox).
                        if !tenant_gate.should_dispatch_outbox_work(
                            &tenant_id,
                            draining.load(Ordering::Relaxed),
                        ) {
                            continue;
                        }
                        let tenant_sem = get_tenant_semaphore(
                            per_tenant.clone(),
                            tenant_id.clone(),
                            settings.outbox_max_in_flight_per_tenant,
                        )
                        .await;

                        let storage = storage.clone();
                        let dispatcher = dispatcher.clone();
                        let global = global.clone();
                        let metrics = metrics.clone();
                        // Test hook: simulate a crash between dispatch and
                        // delete to exercise at-least-once recovery.
                        let crash_after_dispatch = settings.test_outbox_crash_after_dispatch;
                        let tenant_gate = tenant_gate.clone();
                        let tenant_id = tenant_id.clone();

                        join_set.spawn(async move {
                            // Permits are acquired inside the task; the task
                            // exists immediately but only proceeds when both
                            // the global and per-tenant limits allow.
                            let _g = global
                                .acquire_owned()
                                .await
                                .map_err(|e| RunnerError::RuntimeError(e.to_string()))?;
                            let _t = tenant_sem
                                .acquire_owned()
                                .await
                                .map_err(|e| RunnerError::RuntimeError(e.to_string()))?;
                            let _work = tenant_gate.begin_work(&tenant_id);

                            match dispatcher.dispatch(item).await {
                                Ok(()) => {
                                    if crash_after_dispatch {
                                        panic!("test_outbox_crash_after_dispatch");
                                    }
                                    // Delete only after successful dispatch.
                                    storage.delete_outbox_item(&key)?;
                                    metrics.inc_outbox_dispatch_success();
                                    Ok(())
                                }
                                Err(e) => {
                                    metrics.inc_outbox_dispatch_failed();
                                    Err(e)
                                }
                            }
                        });
                    }
                }
            }

            // Nothing in flight: just wait for the next tick or shutdown.
            if join_set.is_empty() {
                tokio::select! {
                    _ = shutdown.notified() => break,
                    _ = tick.tick() => {}
                };
                continue;
            }

            // In-flight work: also reap completed tasks, logging failures and
            // panics; errors are not fatal to the relay loop.
            tokio::select! {
                _ = shutdown.notified() => break,
                _ = tick.tick() => {},
                res = join_set.join_next() => {
                    if let Some(res) = res {
                        match res {
                            Ok(Ok(())) => {}
                            Ok(Err(e)) => {
                                tracing::error!(error = %e, "Outbox dispatch failed");
                            }
                            Err(e) => {
                                tracing::error!(error = %e, "Outbox task panicked");
                            }
                        }
                    }
                }
            }
        }

        Ok(())
    }
}
|
||||
|
||||
async fn get_tenant_semaphore(
|
||||
map: Arc<Mutex<HashMap<String, Arc<Semaphore>>>>,
|
||||
tenant_id: String,
|
||||
permits: usize,
|
||||
) -> Arc<Semaphore> {
|
||||
let mut map = map.lock().await;
|
||||
map.entry(tenant_id)
|
||||
.or_insert_with(|| Arc::new(Semaphore::new(permits)))
|
||||
.clone()
|
||||
}
|
||||
|
||||
fn work_item_tenant_id(item: &WorkItem) -> Option<String> {
|
||||
match item {
|
||||
WorkItem::AggregateCommand(cmd) => Some(cmd.tenant_id.as_str().to_string()),
|
||||
WorkItem::EffectCommand(cmd) => Some(cmd.tenant_id.as_str().to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
fn to_string_metadata(metadata: &crate::types::MessageMetadata) -> HashMap<String, String> {
|
||||
let mut map = HashMap::new();
|
||||
if let Some(correlation_id) = metadata.correlation_id.as_ref() {
|
||||
map.insert(
|
||||
"correlation_id".to_string(),
|
||||
correlation_id.as_str().to_string(),
|
||||
);
|
||||
}
|
||||
if let Some(trace_id) = metadata.trace_id.as_ref() {
|
||||
map.insert("trace_id".to_string(), trace_id.as_str().to_string());
|
||||
}
|
||||
for (k, v) in &metadata.extra {
|
||||
if let Some(s) = v.as_str() {
|
||||
map.insert(k.clone(), s.to_string());
|
||||
}
|
||||
}
|
||||
map
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{
        CommandId, EffectCommandEnvelope, EffectName, MessageMetadata, TenantId, WorkId,
    };
    use serde_json::json;
    use std::sync::atomic::{AtomicUsize, Ordering};

    // Test double: counts dispatch calls and optionally fails every call.
    #[derive(Clone)]
    struct FakeDispatcher {
        should_fail: bool,
        calls: Arc<AtomicUsize>,
    }

    impl OutboxDispatcher for FakeDispatcher {
        fn dispatch(&self, _item: WorkItem) -> BoxFuture<'static, Result<(), RunnerError>> {
            let calls = self.calls.clone();
            let should_fail = self.should_fail;
            Box::pin(async move {
                calls.fetch_add(1, Ordering::Relaxed);
                if should_fail {
                    Err(RunnerError::StreamError("fail".to_string()))
                } else {
                    Ok(())
                }
            })
        }
    }

    // Fast scan interval and small limits so the loop runs several times
    // within the short test windows below.
    fn test_settings() -> Settings {
        Settings {
            outbox_scan_interval_ms: 1,
            outbox_batch_size: 10,
            outbox_max_in_flight: 2,
            outbox_max_in_flight_per_tenant: 2,
            ..Default::default()
        }
    }

    // At-least-once guarantee: a failed dispatch must leave the item in the
    // outbox so it can be retried.
    #[tokio::test]
    async fn outbox_item_is_not_deleted_if_dispatch_fails() {
        let settings = test_settings();
        let storage = KvClient::in_memory();
        let tenant = TenantId::new("t1");
        let work_id = WorkId::new_v7();
        let key = storage
            .put_outbox_item(
                &tenant,
                "effect",
                &work_id,
                &WorkItem::EffectCommand(EffectCommandEnvelope {
                    tenant_id: tenant.clone(),
                    command_id: CommandId::new("c1"),
                    effect_name: EffectName::new("noop"),
                    payload: json!({"a": 1}),
                    metadata: MessageMetadata::default(),
                }),
            )
            .unwrap();

        let dispatcher = Arc::new(FakeDispatcher {
            should_fail: true,
            calls: Arc::new(AtomicUsize::new(0)),
        });

        let relay = OutboxRelay::new();
        let shutdown = Arc::new(tokio::sync::Notify::new());
        let draining = Arc::new(AtomicBool::new(false));

        // Run the relay in the background while the test observes storage.
        tokio::spawn({
            let relay = relay.clone();
            let settings = settings.clone();
            let storage = storage.clone();
            let shutdown = shutdown.clone();
            let draining = draining.clone();
            async move {
                let metrics = Arc::new(Metrics::default());
                let tenant_gate = Arc::new(TenantGate::new(None));
                let _ = relay
                    .run_with_dispatcher(
                        settings,
                        storage,
                        dispatcher,
                        metrics,
                        tenant_gate,
                        shutdown,
                        draining,
                    )
                    .await;
            }
        });

        // Give the relay time to attempt (and fail) the dispatch, then stop.
        tokio::time::sleep(Duration::from_millis(10)).await;
        shutdown.notify_waiters();
        tokio::time::sleep(Duration::from_millis(10)).await;

        // The failed item must still be listed under its original key.
        let items = storage.list_outbox_all(10).unwrap();
        assert!(items.iter().any(|(k, _)| k == &key));
    }

    // Happy path: a successful dispatch deletes the item, and the dispatcher
    // is invoked exactly once (no duplicate delivery from rescans).
    #[tokio::test]
    async fn dispatch_success_deletes_outbox_item_exactly_once() {
        let settings = test_settings();
        let storage = KvClient::in_memory();
        let tenant = TenantId::new("t1");
        let work_id = WorkId::new_v7();
        let key = storage
            .put_outbox_item(
                &tenant,
                "effect",
                &work_id,
                &WorkItem::EffectCommand(EffectCommandEnvelope {
                    tenant_id: tenant.clone(),
                    command_id: CommandId::new("c1"),
                    effect_name: EffectName::new("noop"),
                    payload: json!({"a": 1}),
                    metadata: MessageMetadata::default(),
                }),
            )
            .unwrap();

        let calls = Arc::new(AtomicUsize::new(0));
        let dispatcher = Arc::new(FakeDispatcher {
            should_fail: false,
            calls: calls.clone(),
        });

        let relay = OutboxRelay::new();
        let shutdown = Arc::new(tokio::sync::Notify::new());
        let draining = Arc::new(AtomicBool::new(false));

        tokio::spawn({
            let relay = relay.clone();
            let settings = settings.clone();
            let storage = storage.clone();
            let shutdown = shutdown.clone();
            let draining = draining.clone();
            async move {
                let metrics = Arc::new(Metrics::default());
                let tenant_gate = Arc::new(TenantGate::new(None));
                let _ = relay
                    .run_with_dispatcher(
                        settings,
                        storage,
                        dispatcher,
                        metrics,
                        tenant_gate,
                        shutdown,
                        draining,
                    )
                    .await;
            }
        });

        // Poll (with a 1s deadline) until the item disappears from storage.
        let start = std::time::Instant::now();
        loop {
            let items = storage.list_outbox_all(10).unwrap();
            if !items.iter().any(|(k, _)| k == &key) {
                break;
            }
            if start.elapsed() > Duration::from_secs(1) {
                panic!("outbox item was not deleted");
            }
            tokio::time::sleep(Duration::from_millis(10)).await;
        }

        shutdown.notify_waiters();
        tokio::time::sleep(Duration::from_millis(10)).await;

        // Exactly one dispatch call — no re-delivery after deletion.
        assert_eq!(calls.load(Ordering::Relaxed), 1);
    }
}
|
||||
122
runner/src/saga/manifest.rs
Normal file
122
runner/src/saga/manifest.rs
Normal file
@@ -0,0 +1,122 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::Path;
|
||||
|
||||
/// Top-level saga manifest: the collection of saga definitions loaded from
/// a YAML/TOML/JSON file. Missing fields default (empty saga list).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(default)]
pub struct SagaManifest {
    pub sagas: Vec<SagaDefinition>,
}
|
||||
|
||||
impl SagaManifest {
|
||||
pub fn validate(&self) -> Result<(), String> {
|
||||
for saga in &self.sagas {
|
||||
saga.validate()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn from_yaml(yaml: &str) -> Result<Self, serde_yaml::Error> {
|
||||
serde_yaml::from_str(yaml)
|
||||
}
|
||||
|
||||
pub fn from_toml(toml_str: &str) -> Result<Self, toml::de::Error> {
|
||||
toml::from_str(toml_str)
|
||||
}
|
||||
|
||||
pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
|
||||
serde_json::from_str(json)
|
||||
}
|
||||
|
||||
pub fn from_file(path: impl AsRef<Path>) -> Result<Self, SagaManifestLoadError> {
|
||||
let path = path.as_ref();
|
||||
let raw = std::fs::read_to_string(path)?;
|
||||
let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
|
||||
|
||||
match ext {
|
||||
"yaml" | "yml" => Ok(Self::from_yaml(&raw)?),
|
||||
"toml" => Ok(Self::from_toml(&raw)?),
|
||||
"json" => Ok(Self::from_json(&raw)?),
|
||||
_ => Err(SagaManifestLoadError::UnsupportedFormat {
|
||||
path: path.display().to_string(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A single saga: the subjects that trigger it and the program files run on
/// an event (and, optionally, for compensation).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(default)]
pub struct SagaDefinition {
    pub name: String,
    // Subject filters (wildcards allowed) that route events to this saga.
    pub trigger_subjects: Vec<String>,
    // Path to the program executed when a trigger event arrives.
    pub on_event: String,
    // Optional path to a compensation program.
    pub compensation: Option<String>,
}
|
||||
|
||||
impl SagaDefinition {
|
||||
pub fn validate(&self) -> Result<(), String> {
|
||||
if self.name.trim().is_empty() {
|
||||
return Err("Saga name is required".to_string());
|
||||
}
|
||||
if self.on_event.trim().is_empty() {
|
||||
return Err(format!(
|
||||
"Saga '{}' must specify on_event program",
|
||||
self.name
|
||||
));
|
||||
}
|
||||
if !Path::new(&self.on_event).exists() {
|
||||
return Err(format!(
|
||||
"Saga '{}' on_event program not found: {}",
|
||||
self.name, self.on_event
|
||||
));
|
||||
}
|
||||
if let Some(path) = &self.compensation {
|
||||
if !path.trim().is_empty() && !Path::new(path).exists() {
|
||||
return Err(format!(
|
||||
"Saga '{}' compensation program not found: {}",
|
||||
self.name, path
|
||||
));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors produced while loading a saga manifest from disk: I/O, one of the
/// three per-format parse failures, or an unrecognized file extension.
#[derive(Debug, thiserror::Error)]
pub enum SagaManifestLoadError {
    #[error("Failed to read manifest file: {0}")]
    Io(#[from] std::io::Error),
    #[error("Failed to parse YAML manifest: {0}")]
    Yaml(#[from] serde_yaml::Error),
    #[error("Failed to parse TOML manifest: {0}")]
    Toml(#[from] toml::de::Error),
    #[error("Failed to parse JSON manifest: {0}")]
    Json(#[from] serde_json::Error),
    #[error("Unsupported manifest format: {path}")]
    UnsupportedFormat { path: String },
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    // End-to-end: write a real program file, reference it from a YAML
    // manifest, and confirm parse + validate (which checks file existence)
    // both succeed.
    #[test]
    fn manifest_loads_and_validates() {
        let dir = tempdir().unwrap();
        let program_path = dir.path().join("saga.json");
        std::fs::write(&program_path, r#"{"specVersion":"1.1","id":"p","name":"p","inputs":[],"nodes":[],"edges":[],"outputNodeId":"x"}"#).unwrap();

        let yaml = format!(
            r#"
sagas:
  - name: billing
    trigger_subjects: ["tenant.*.aggregate.*.*"]
    on_event: "{}"
"#,
            program_path.to_string_lossy()
        );

        let manifest = SagaManifest::from_yaml(&yaml).unwrap();
        assert!(manifest.validate().is_ok());
    }
}
|
||||
7
runner/src/saga/mod.rs
Normal file
7
runner/src/saga/mod.rs
Normal file
@@ -0,0 +1,7 @@
|
||||
mod manifest;
|
||||
mod runtime;
|
||||
mod worker;
|
||||
|
||||
pub use manifest::{SagaDefinition, SagaManifest};
|
||||
pub use runtime::SagaRuntime;
|
||||
pub use worker::{run_saga_worker, AckDecision, SagaPrograms};
|
||||
194
runner/src/saga/runtime.rs
Normal file
194
runner/src/saga/runtime.rs
Normal file
@@ -0,0 +1,194 @@
|
||||
use crate::types::{AggregateEventEnvelope, RunnerError, WorkItem};
|
||||
use chrono::{DateTime, Utc};
|
||||
use runtime_function::engine::ExecutionOptions;
|
||||
use runtime_function::{Context, Engine, Program, Value as RtValue};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use std::collections::BTreeMap;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Executes saga programs on the runtime-function engine with fixed
/// gas-limit/timeout execution options (set once at construction).
#[derive(Clone)]
pub struct SagaRuntime {
    engine: Engine,
    // Same options the engine was built with; passed per-execution as well.
    options: ExecutionOptions,
}
|
||||
|
||||
// Manual Debug printing `SagaRuntime { .. }`; the engine has no useful
// Debug representation.
impl std::fmt::Debug for SagaRuntime {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("SagaRuntime").finish_non_exhaustive()
    }
}
|
||||
|
||||
impl SagaRuntime {
|
||||
pub fn new(gas_limit: u64, timeout: Duration) -> Self {
|
||||
let timeout_secs = timeout.as_secs().max(1);
|
||||
Self {
|
||||
engine: Engine::with_options(ExecutionOptions {
|
||||
gas_limit,
|
||||
timeout_secs,
|
||||
trace: false,
|
||||
}),
|
||||
options: ExecutionOptions {
|
||||
gas_limit,
|
||||
timeout_secs,
|
||||
trace: false,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn execute_on_event(
|
||||
&self,
|
||||
program: &Program,
|
||||
saga_state: &Value,
|
||||
event: &AggregateEventEnvelope,
|
||||
) -> Result<SagaExecutionOutput, RunnerError> {
|
||||
let event_json =
|
||||
serde_json::to_value(event).map_err(|e| RunnerError::DecodeError(e.to_string()))?;
|
||||
let ctx = deterministic_context(event.timestamp, event.event_id.to_string())
|
||||
.with_tenant_id(event.tenant_id.as_str());
|
||||
self.execute(program, saga_state, &event_json, ctx).await
|
||||
}
|
||||
|
||||
pub async fn execute(
|
||||
&self,
|
||||
program: &Program,
|
||||
saga_state: &Value,
|
||||
event: &Value,
|
||||
context: Context,
|
||||
) -> Result<SagaExecutionOutput, RunnerError> {
|
||||
let mut inputs = BTreeMap::new();
|
||||
inputs.insert(
|
||||
"saga_state".to_string(),
|
||||
to_runtime_value(saga_state.clone())?,
|
||||
);
|
||||
inputs.insert("event".to_string(), to_runtime_value(event.clone())?);
|
||||
|
||||
let result =
|
||||
self.engine
|
||||
.execute_with_options(program, inputs, context, self.options.clone());
|
||||
|
||||
if !result.success {
|
||||
return Err(RunnerError::RuntimeError(
|
||||
result
|
||||
.error
|
||||
.map(|e| e.to_string())
|
||||
.unwrap_or_else(|| "Saga execution failed".to_string()),
|
||||
));
|
||||
}
|
||||
|
||||
let output = result.output.ok_or_else(|| {
|
||||
RunnerError::RuntimeError("Saga execution produced no output".to_string())
|
||||
})?;
|
||||
|
||||
let json = serde_json::to_value(output)
|
||||
.map_err(|e| RunnerError::DecodeError(format!("Output encoding failed: {}", e)))?;
|
||||
serde_json::from_value(json)
|
||||
.map_err(|e| RunnerError::DecodeError(format!("Output decoding failed: {}", e)))
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SagaRuntime {
    /// Default limits: the runtime's standard gas limit and a 5-second
    /// timeout.
    fn default() -> Self {
        Self::new(runtime_function::DEFAULT_GAS_LIMIT, Duration::from_secs(5))
    }
}
|
||||
|
||||
/// Result of one saga program execution.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SagaExecutionOutput {
    /// Replacement saga state to persist after the run.
    pub new_saga_state: Value,
    /// Commands to enqueue via the outbox.
    pub work_items: Vec<WorkItem>,
    /// Timers to register for future wake-ups.
    pub schedules: Vec<SagaSchedule>,
}
|
||||
|
||||
/// A timer requested by a saga: re-invoke it with `payload` at `due_at_ms`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SagaSchedule {
    /// Due time as Unix epoch milliseconds.
    pub due_at_ms: u64,
    /// Opaque payload handed back to the saga when the timer fires.
    pub payload: Value,
}
|
||||
|
||||
fn to_runtime_value(value: Value) -> Result<RtValue, RunnerError> {
|
||||
serde_json::from_value::<RtValue>(value).map_err(|e| RunnerError::DecodeError(e.to_string()))
|
||||
}
|
||||
|
||||
/// Builds an execution context whose "now" and causation id come from the
/// triggering event rather than the wall clock, so replaying the same event
/// yields an identical execution.
fn deterministic_context(now: DateTime<Utc>, causation_id: String) -> Context {
    Context::new(now, causation_id)
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::TenantId;
    use chrono::TimeZone;
    use uuid::Uuid;

    /// Program whose output is a constant, valid `SagaExecutionOutput`.
    fn test_program() -> Program {
        let json = r#"
        {
            "specVersion": "1.1",
            "id": "saga_test",
            "name": "saga_test",
            "inputs": [
                { "name": "saga_state", "type": "Any" },
                { "name": "event", "type": "Any" }
            ],
            "nodes": [
                {
                    "id": "const",
                    "type": "Const",
                    "data": {
                        "value": {
                            "new_saga_state": { "x": 1 },
                            "work_items": [],
                            "schedules": []
                        }
                    }
                },
                { "id": "output", "type": "Output", "data": {} }
            ],
            "edges": [
                { "id": "e1", "source": "const", "sourceHandle": "out", "target": "output", "targetHandle": "value" }
            ],
            "outputNodeId": "output"
        }
        "#;
        serde_json::from_str(json).unwrap()
    }

    /// Fixed event envelope (fixed ids and timestamp) for determinism checks.
    fn test_event() -> AggregateEventEnvelope {
        AggregateEventEnvelope {
            tenant_id: TenantId::new("t1"),
            event_id: Uuid::parse_str("00000000-0000-0000-0000-000000000001").unwrap(),
            aggregate_id: "a1".to_string(),
            aggregate_type: "Account".to_string(),
            version: 1,
            event_type: "Created".to_string(),
            payload: serde_json::json!({"a": 1}),
            command_id: Uuid::parse_str("00000000-0000-0000-0000-000000000002").unwrap(),
            timestamp: Utc.with_ymd_and_hms(2026, 2, 9, 12, 0, 0).unwrap(),
        }
    }

    /// Saga execution must be deterministic: identical (program, state,
    /// event) inputs yield identical outputs.
    ///
    /// Uses `#[tokio::test]` instead of hand-constructing a
    /// `tokio::runtime::Runtime` + `block_on`, matching the style of the
    /// worker tests in this crate.
    #[tokio::test]
    async fn same_inputs_produce_same_outputs() {
        let program = test_program();
        let runtime = SagaRuntime::default();
        let event = test_event();
        let saga_state = serde_json::json!({"balance": 10});

        let out1 = runtime
            .execute_on_event(&program, &saga_state, &event)
            .await
            .unwrap();
        let out2 = runtime
            .execute_on_event(&program, &saga_state, &event)
            .await
            .unwrap();

        assert_eq!(out1.new_saga_state, out2.new_saga_state);
        assert_eq!(out1.work_items.len(), out2.work_items.len());
        assert_eq!(out1.schedules.len(), out2.schedules.len());
    }
}
|
||||
838
runner/src/saga/worker.rs
Normal file
838
runner/src/saga/worker.rs
Normal file
@@ -0,0 +1,838 @@
|
||||
use crate::config::Settings;
|
||||
use crate::observability::Metrics;
|
||||
use crate::saga::{SagaManifest, SagaRuntime};
|
||||
use crate::storage::KvClient;
|
||||
use crate::stream::{ConsumerOptions, JetStreamClient};
|
||||
use crate::tenant_placement::TenantGate;
|
||||
use crate::types::{
|
||||
AggregateEventEnvelope, CheckpointKey, CorrelationId, DedupeEventKey, EventId, RunnerError,
|
||||
SagaName, SagaStateKey, ScheduleKey, TenantId, WorkId, WorkItem,
|
||||
};
|
||||
use async_nats::jetstream::consumer::DeliverPolicy;
|
||||
use async_nats::jetstream::AckKind;
|
||||
use chrono::Utc;
|
||||
use futures::StreamExt;
|
||||
use runtime_function::Program;
|
||||
use serde_json::Value;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::watch;
|
||||
|
||||
/// How a processed JetStream message should be acknowledged.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AckDecision {
    /// Successfully handled: positive ack.
    Ack,
    /// Permanently failed (poison): terminate delivery, no redelivery.
    Term,
    /// Do not ack; the server will redeliver after the ack timeout.
    None,
}
|
||||
|
||||
/// The loaded saga manifest together with its compiled programs.
pub struct SagaPrograms {
    // Validated manifest describing each saga's triggers and program paths.
    manifest: SagaManifest,
    // Compiled, validated program per saga name.
    programs: HashMap<String, Program>,
}
|
||||
|
||||
impl std::fmt::Debug for SagaPrograms {
    // Manual impl printing only the type name.
    // NOTE(review): presumably because `Program` is not `Debug` (or its dump
    // would be huge) — confirm.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("SagaPrograms").finish_non_exhaustive()
    }
}
|
||||
|
||||
impl SagaPrograms {
|
||||
pub fn load(settings: &Settings) -> Result<Self, RunnerError> {
|
||||
let manifest = SagaManifest::from_file(&settings.saga_manifest_path)
|
||||
.map_err(|e| RunnerError::DecodeError(e.to_string()))?;
|
||||
manifest
|
||||
.validate()
|
||||
.map_err(|e| RunnerError::DecodeError(e.to_string()))?;
|
||||
|
||||
let mut programs = HashMap::new();
|
||||
for saga in &manifest.sagas {
|
||||
let raw = std::fs::read_to_string(&saga.on_event)
|
||||
.map_err(|e| RunnerError::DecodeError(e.to_string()))?;
|
||||
let program: Program =
|
||||
serde_json::from_str(&raw).map_err(|e| RunnerError::DecodeError(e.to_string()))?;
|
||||
program
|
||||
.validate()
|
||||
.map_err(|e| RunnerError::RuntimeError(e.to_string()))?;
|
||||
programs.insert(saga.name.clone(), program);
|
||||
}
|
||||
|
||||
Ok(Self { manifest, programs })
|
||||
}
|
||||
|
||||
pub fn manifest(&self) -> &SagaManifest {
|
||||
&self.manifest
|
||||
}
|
||||
|
||||
pub fn program_for(&self, saga_name: &str) -> Option<&Program> {
|
||||
self.programs.get(saga_name)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn run_saga_worker(
|
||||
settings: Settings,
|
||||
storage: KvClient,
|
||||
programs: Arc<SagaPrograms>,
|
||||
runtime: SagaRuntime,
|
||||
metrics: Arc<Metrics>,
|
||||
tenant_gate: Arc<TenantGate>,
|
||||
tenant_filter: Option<watch::Receiver<HashSet<String>>>,
|
||||
shutdown: Arc<tokio::sync::Notify>,
|
||||
draining: Arc<AtomicBool>,
|
||||
) -> Result<(), RunnerError> {
|
||||
if tenant_filter.is_none() && settings.tenant_allowlist.is_empty() {
|
||||
return run_saga_worker_single(
|
||||
settings,
|
||||
storage,
|
||||
programs,
|
||||
runtime,
|
||||
metrics,
|
||||
tenant_gate,
|
||||
shutdown,
|
||||
draining,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
let settings = Arc::new(settings);
|
||||
let jetstream = JetStreamClient::connect(&settings)
|
||||
.await
|
||||
.map_err(|e| RunnerError::StreamError(e.to_string()))?;
|
||||
|
||||
let mut tenant_rx = match tenant_filter {
|
||||
Some(rx) => rx,
|
||||
None => {
|
||||
let initial = settings
|
||||
.tenant_allowlist
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect::<HashSet<_>>();
|
||||
let (_tx, rx) = watch::channel(initial);
|
||||
rx
|
||||
}
|
||||
};
|
||||
|
||||
let mut tasks: HashMap<String, tokio::task::JoinHandle<()>> = HashMap::new();
|
||||
let mut stops: HashMap<String, Arc<tokio::sync::Notify>> = HashMap::new();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = shutdown.notified() => break,
|
||||
_ = tokio::time::sleep(Duration::from_millis(250)) => {},
|
||||
changed = tenant_rx.changed() => {
|
||||
if changed.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let tenants = tenant_rx.borrow().clone();
|
||||
|
||||
for tenant in tasks.keys().cloned().collect::<Vec<_>>() {
|
||||
if !tenants.contains(&tenant) {
|
||||
if let Some(n) = stops.remove(&tenant) {
|
||||
n.notify_waiters();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for tenant in tasks
|
||||
.iter()
|
||||
.filter_map(|(t, h)| {
|
||||
if h.is_finished() {
|
||||
Some(t.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
{
|
||||
if let Some(h) = tasks.remove(&tenant) {
|
||||
let _ = h.await;
|
||||
}
|
||||
stops.remove(&tenant);
|
||||
}
|
||||
|
||||
for tenant in tenants {
|
||||
if tasks.contains_key(&tenant) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let stop = Arc::new(tokio::sync::Notify::new());
|
||||
stops.insert(tenant.clone(), stop.clone());
|
||||
let tenant_key = tenant.clone();
|
||||
|
||||
let settings = settings.clone();
|
||||
let jetstream = jetstream.clone();
|
||||
let storage = storage.clone();
|
||||
let programs = programs.clone();
|
||||
let runtime = runtime.clone();
|
||||
let metrics = metrics.clone();
|
||||
let tenant_gate = tenant_gate.clone();
|
||||
let shutdown = shutdown.clone();
|
||||
let draining = draining.clone();
|
||||
|
||||
let handle = tokio::spawn(async move {
|
||||
let _ = run_saga_worker_for_tenant(
|
||||
settings,
|
||||
jetstream,
|
||||
storage,
|
||||
programs,
|
||||
runtime,
|
||||
metrics,
|
||||
tenant_gate,
|
||||
tenant,
|
||||
shutdown,
|
||||
stop,
|
||||
draining,
|
||||
)
|
||||
.await;
|
||||
});
|
||||
tasks.insert(tenant_key, handle);
|
||||
}
|
||||
}
|
||||
|
||||
for (_, n) in stops {
|
||||
n.notify_waiters();
|
||||
}
|
||||
for (_, h) in tasks {
|
||||
let _ = h.await;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Runs a single shared saga consumer covering all tenants.
///
/// Used when neither a dynamic tenant filter nor a static allowlist is
/// configured. Consumes aggregate events from JetStream and hands each
/// message to `handle_saga_message`.
#[allow(clippy::too_many_arguments)]
async fn run_saga_worker_single(
    settings: Settings,
    storage: KvClient,
    programs: Arc<SagaPrograms>,
    runtime: SagaRuntime,
    metrics: Arc<Metrics>,
    tenant_gate: Arc<TenantGate>,
    shutdown: Arc<tokio::sync::Notify>,
    draining: Arc<AtomicBool>,
) -> Result<(), RunnerError> {
    let jetstream = JetStreamClient::connect(&settings)
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string()))?;

    let durable_name = format!("{}_saga", settings.consumer_durable_prefix);
    // Single-consumer mode uses the first configured filter, falling back to
    // the catch-all aggregate-event subject.
    let filter_subject = settings
        .saga_trigger_subject_filters
        .first()
        .cloned()
        .unwrap_or_else(|| "tenant.*.aggregate.*.*".to_string());

    let consumer = jetstream
        .saga_trigger_consumer(
            &settings,
            ConsumerOptions {
                durable_name,
                filter_subject,
                deliver_policy: DeliverPolicy::All,
            },
        )
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string()))?;

    let mut messages = consumer
        .messages()
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string()))?;

    loop {
        // While draining, stop pulling new messages but stay responsive to
        // shutdown.
        if draining.load(Ordering::Relaxed) {
            tokio::select! {
                _ = shutdown.notified() => break,
                _ = tokio::time::sleep(Duration::from_millis(50)) => continue,
            };
        }

        let next = tokio::select! {
            _ = shutdown.notified() => break,
            msg = messages.next() => msg,
        };

        // Stream exhausted: the consumer was deleted or the connection closed.
        let Some(msg) = next else { break };
        match msg {
            Ok(m) => {
                handle_saga_message(
                    &settings,
                    &storage,
                    &programs,
                    &runtime,
                    &metrics,
                    &tenant_gate,
                    draining.load(Ordering::Relaxed),
                    m,
                )
                .await;
            }
            Err(e) => {
                tracing::error!(error = %e, "JetStream message stream error");
            }
        }
    }

    Ok(())
}
|
||||
|
||||
/// Consumes aggregate events for a single tenant on a dedicated durable
/// consumer (`<prefix>_saga_<tenant>` over `tenant.<tenant>.aggregate.*.*`).
///
/// Exits on global shutdown, on the per-tenant `stop` signal, or when the
/// message stream ends.
#[allow(clippy::too_many_arguments)]
async fn run_saga_worker_for_tenant(
    settings: Arc<Settings>,
    jetstream: JetStreamClient,
    storage: KvClient,
    programs: Arc<SagaPrograms>,
    runtime: SagaRuntime,
    metrics: Arc<Metrics>,
    tenant_gate: Arc<TenantGate>,
    tenant: String,
    shutdown: Arc<tokio::sync::Notify>,
    stop: Arc<tokio::sync::Notify>,
    draining: Arc<AtomicBool>,
) -> Result<(), RunnerError> {
    let durable_name = format!("{}_saga_{}", settings.consumer_durable_prefix, tenant);
    let filter_subject = format!("tenant.{}.aggregate.*.*", tenant);

    let consumer = jetstream
        .saga_trigger_consumer(
            &settings,
            ConsumerOptions {
                durable_name,
                filter_subject,
                deliver_policy: DeliverPolicy::All,
            },
        )
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string()))?;

    let mut messages = consumer
        .messages()
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string()))?;

    loop {
        // Back off while the tenant gate refuses new work for this tenant,
        // staying responsive to shutdown/stop.
        if !tenant_gate.should_acquire_processing_work(&tenant, draining.load(Ordering::Relaxed)) {
            tokio::select! {
                _ = shutdown.notified() => break,
                _ = stop.notified() => break,
                _ = tokio::time::sleep(Duration::from_millis(50)) => continue,
            };
        }

        // While draining, stop pulling new messages.
        if draining.load(Ordering::Relaxed) {
            tokio::select! {
                _ = shutdown.notified() => break,
                _ = stop.notified() => break,
                _ = tokio::time::sleep(Duration::from_millis(50)) => continue,
            };
        }

        let next = tokio::select! {
            _ = shutdown.notified() => break,
            _ = stop.notified() => break,
            msg = messages.next() => msg,
        };

        // Stream exhausted: consumer deleted or connection closed.
        let Some(msg) = next else { break };
        match msg {
            Ok(m) => {
                handle_saga_message(
                    &settings,
                    &storage,
                    &programs,
                    &runtime,
                    &metrics,
                    &tenant_gate,
                    draining.load(Ordering::Relaxed),
                    m,
                )
                .await;
            }
            Err(e) => {
                tracing::error!(error = %e, "JetStream message stream error");
            }
        }
    }

    Ok(())
}
|
||||
|
||||
/// Decodes one JetStream message, runs saga processing, and applies the
/// resulting ack decision.
///
/// Undecodable payloads and runtime failures are routed through
/// `handle_poison`, which dead-letters them once the delivery count reaches
/// `max_deliver`. Storage errors leave the message unacked so the server
/// redelivers it.
#[allow(clippy::too_many_arguments)]
async fn handle_saga_message(
    settings: &Settings,
    storage: &KvClient,
    programs: &SagaPrograms,
    runtime: &SagaRuntime,
    metrics: &Metrics,
    tenant_gate: &TenantGate,
    global_draining: bool,
    msg: async_nats::jetstream::Message,
) {
    let info = match msg.info() {
        Ok(i) => i,
        Err(e) => {
            // Without delivery metadata we cannot reason about redelivery
            // counts; ack to drop the message.
            tracing::error!(error = %e, "Failed to parse JetStream message info");
            let _ = msg.ack().await;
            return;
        }
    };

    let delivered = info.delivered.max(0) as u64;
    let sequence = info.stream_sequence;

    let envelope: AggregateEventEnvelope = match serde_json::from_slice(&msg.payload) {
        Ok(e) => e,
        Err(e) => {
            // Undecodable payload: the tenant is unknown, so the dead-letter
            // record is filed under the default tenant id.
            tracing::error!(error = %e, "Failed to decode aggregate event envelope");
            match handle_poison(
                storage,
                settings,
                metrics,
                delivered,
                TenantId::default(),
                "decode_error",
                &Value::Null,
            ) {
                AckDecision::Term => {
                    let _ = msg.ack_with(AckKind::Term).await;
                }
                AckDecision::Ack => {
                    let _ = msg.ack().await;
                }
                AckDecision::None => {}
            }
            return;
        }
    };

    // Tenant gate refusal: NAK with a short delay so the message is retried
    // once the tenant becomes eligible again.
    if !tenant_gate.should_acquire_processing_work(envelope.tenant_id.as_str(), global_draining) {
        let _ = msg
            .ack_with(AckKind::Nak(Some(Duration::from_millis(250))))
            .await;
        return;
    }
    // Guard tracking in-flight work for this tenant; released on drop.
    let _work = tenant_gate.begin_work(envelope.tenant_id.as_str());

    let subject = msg.subject.to_string();

    let decision = process_aggregate_event(
        settings, storage, programs, runtime, metrics, &envelope, &subject, sequence, delivered,
    )
    .await;

    match decision {
        Ok(AckDecision::Ack) => {
            let _ = msg.ack().await;
        }
        Ok(AckDecision::Term) => {
            let _ = msg.ack_with(AckKind::Term).await;
        }
        Ok(AckDecision::None) => {}
        Err(RunnerError::StorageError(e)) => {
            // Storage errors are treated as transient: no ack, so the server
            // redelivers after the ack timeout.
            tracing::error!(error = %e, "Storage error while processing saga message");
        }
        Err(e) => {
            tracing::error!(error = %e, "Saga message processing failed");
            let decision = handle_poison(
                storage,
                settings,
                metrics,
                delivered,
                envelope.tenant_id.clone(),
                "runtime_error",
                &serde_json::to_value(&envelope).unwrap_or(Value::Null),
            );
            match decision {
                AckDecision::Term => {
                    let _ = msg.ack_with(AckKind::Term).await;
                }
                AckDecision::Ack => {
                    let _ = msg.ack().await;
                }
                AckDecision::None => {}
            }
        }
    }
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn process_aggregate_event(
|
||||
settings: &Settings,
|
||||
storage: &KvClient,
|
||||
programs: &SagaPrograms,
|
||||
runtime: &SagaRuntime,
|
||||
metrics: &Metrics,
|
||||
envelope: &AggregateEventEnvelope,
|
||||
subject: &str,
|
||||
sequence: u64,
|
||||
_delivered: u64,
|
||||
) -> Result<AckDecision, RunnerError> {
|
||||
let mut any_processed = false;
|
||||
|
||||
for saga in &programs.manifest().sagas {
|
||||
if !saga
|
||||
.trigger_subjects
|
||||
.iter()
|
||||
.any(|pat| subject_matches(pat, subject))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let tenant_id = envelope.tenant_id.clone();
|
||||
let saga_name = SagaName::new(saga.name.clone());
|
||||
let correlation_id = CorrelationId::new(extract_correlation_id(envelope));
|
||||
|
||||
let checkpoint_key = CheckpointKey::new(&tenant_id, &saga_name);
|
||||
let checkpoint = storage.get_checkpoint(&checkpoint_key)?.unwrap_or(0);
|
||||
if sequence <= checkpoint {
|
||||
metrics.inc_saga_skipped_checkpoint();
|
||||
continue;
|
||||
}
|
||||
|
||||
let event_id = EventId::new(envelope.event_id.to_string());
|
||||
let dedupe_key = DedupeEventKey::new(&tenant_id, &saga_name, &event_id);
|
||||
if storage.is_deduped_event(&dedupe_key)? {
|
||||
metrics.inc_saga_skipped_dedupe();
|
||||
continue;
|
||||
}
|
||||
|
||||
let saga_key = SagaStateKey::new(&tenant_id, &saga_name, &correlation_id);
|
||||
let current_state = storage
|
||||
.get_saga_state(&saga_key)?
|
||||
.unwrap_or_else(|| serde_json::json!({}));
|
||||
|
||||
let program = programs
|
||||
.program_for(&saga.name)
|
||||
.ok_or_else(|| RunnerError::RuntimeError("Saga program missing".to_string()))?;
|
||||
|
||||
let output = runtime
|
||||
.execute_on_event(program, ¤t_state, envelope)
|
||||
.await?;
|
||||
|
||||
let outbox_items = output
|
||||
.work_items
|
||||
.into_iter()
|
||||
.map(|item| {
|
||||
let kind = match &item {
|
||||
WorkItem::AggregateCommand(_) => "aggregate",
|
||||
WorkItem::EffectCommand(_) => "effect",
|
||||
};
|
||||
let work_id = WorkId::new_v7();
|
||||
let key = format!(
|
||||
"outbox:{}:{}:{}",
|
||||
tenant_id.as_str(),
|
||||
kind,
|
||||
work_id.as_uuid()
|
||||
);
|
||||
(key, item)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let schedule_items = output
|
||||
.schedules
|
||||
.into_iter()
|
||||
.map(|s| {
|
||||
let key = ScheduleKey::new(&tenant_id, &saga_name, &correlation_id, s.due_at_ms);
|
||||
(key, s.payload)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
storage
|
||||
.commit_saga_processing(
|
||||
&saga_key,
|
||||
&output.new_saga_state,
|
||||
outbox_items,
|
||||
schedule_items,
|
||||
&checkpoint_key,
|
||||
sequence,
|
||||
Some(&dedupe_key),
|
||||
)
|
||||
.inspect_err(|_| metrics.inc_saga_commit_failed())?;
|
||||
|
||||
if settings.test_saga_crash_after_commit {
|
||||
panic!("test_saga_crash_after_commit");
|
||||
}
|
||||
|
||||
metrics.inc_saga_processed();
|
||||
any_processed = true;
|
||||
}
|
||||
|
||||
let _ = any_processed;
|
||||
Ok(AckDecision::Ack)
|
||||
}
|
||||
|
||||
fn extract_correlation_id(envelope: &AggregateEventEnvelope) -> String {
|
||||
envelope
|
||||
.payload
|
||||
.get("correlation_id")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| envelope.aggregate_id.clone())
|
||||
}
|
||||
|
||||
fn handle_poison(
|
||||
storage: &KvClient,
|
||||
settings: &Settings,
|
||||
metrics: &Metrics,
|
||||
delivered: u64,
|
||||
tenant_id: TenantId,
|
||||
reason: &str,
|
||||
payload: &Value,
|
||||
) -> AckDecision {
|
||||
if delivered < settings.max_deliver.max(1) as u64 {
|
||||
return AckDecision::None;
|
||||
}
|
||||
|
||||
let key = format!(
|
||||
"deadletter:{}:{}:{}",
|
||||
tenant_id.as_str(),
|
||||
reason,
|
||||
Utc::now().timestamp_millis()
|
||||
);
|
||||
let record = serde_json::json!({
|
||||
"tenant_id": tenant_id,
|
||||
"reason": reason,
|
||||
"delivered": delivered,
|
||||
"payload": payload,
|
||||
"timestamp": Utc::now(),
|
||||
});
|
||||
let _ = storage.put_deadletter(&key, &record);
|
||||
metrics.inc_deadletter_written();
|
||||
AckDecision::Term
|
||||
}
|
||||
|
||||
/// Returns whether a NATS-style subject `pattern` matches `subject`.
///
/// Token rules follow NATS semantics: `*` matches exactly one token and `>`
/// matches one or more trailing tokens. In particular `a.>` does NOT match
/// bare `a` — the previous revision accepted a trailing `>` with zero
/// remaining subject tokens, diverging from how the NATS server matches the
/// same pattern.
fn subject_matches(pattern: &str, subject: &str) -> bool {
    let p = pattern.split('.').collect::<Vec<_>>();
    let s = subject.split('.').collect::<Vec<_>>();

    let mut i = 0usize;
    let mut j = 0usize;
    while i < p.len() && j < s.len() {
        match p[i] {
            // `>` consumes the rest of the subject; at least one token is
            // guaranteed here because j < s.len().
            ">" => return true,
            // `*` matches any single token.
            "*" => {}
            other => {
                if other != s[j] {
                    return false;
                }
            }
        }
        i += 1;
        j += 1;
    }

    // Match only when pattern and subject were consumed together. A trailing
    // `>` with the subject already exhausted is not a match: per NATS, `>`
    // requires one or more tokens.
    i == p.len() && j == s.len()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::saga::SagaDefinition;
    use chrono::TimeZone;
    use std::collections::HashMap;
    use uuid::Uuid;

    /// Program whose output always contains exactly one effect-command work
    /// item, so outbox writes are observable in the tests below.
    fn test_program_with_one_outbox_item() -> Program {
        let json = r#"
        {
            "specVersion": "1.1",
            "id": "saga_test",
            "name": "saga_test",
            "inputs": [
                { "name": "saga_state", "type": "Any" },
                { "name": "event", "type": "Any" }
            ],
            "nodes": [
                {
                    "id": "const",
                    "type": "Const",
                    "data": {
                        "value": {
                            "new_saga_state": { "x": 1 },
                            "work_items": [
                                {
                                    "kind": "effect_command",
                                    "tenant_id": "t1",
                                    "command_id": "c1",
                                    "effect_name": "noop",
                                    "payload": { "a": 1 },
                                    "metadata": {}
                                }
                            ],
                            "schedules": []
                        }
                    }
                },
                { "id": "output", "type": "Output", "data": {} }
            ],
            "edges": [
                { "id": "e1", "source": "const", "sourceHandle": "out", "target": "output", "targetHandle": "value" }
            ],
            "outputNodeId": "output"
        }
        "#;
        serde_json::from_str(json).unwrap()
    }

    /// `SagaPrograms` with a single "billing" saga triggered by every
    /// aggregate event subject.
    fn saga_programs() -> SagaPrograms {
        let manifest = SagaManifest {
            sagas: vec![SagaDefinition {
                name: "billing".to_string(),
                trigger_subjects: vec!["tenant.*.aggregate.*.*".to_string()],
                // Never read: the program is inserted directly below.
                on_event: "unused.json".to_string(),
                compensation: None,
            }],
        };

        let mut programs = HashMap::new();
        let program = test_program_with_one_outbox_item();
        program.validate().unwrap();
        programs.insert("billing".to_string(), program);

        SagaPrograms { manifest, programs }
    }

    /// Envelope for tenant t1 with a fixed correlation id and timestamp.
    fn event_with_id(event_id: Uuid) -> AggregateEventEnvelope {
        AggregateEventEnvelope {
            tenant_id: TenantId::new("t1"),
            event_id,
            aggregate_id: "a1".to_string(),
            aggregate_type: "Account".to_string(),
            version: 1,
            event_type: "Created".to_string(),
            payload: serde_json::json!({"correlation_id": "corr1"}),
            command_id: Uuid::parse_str("00000000-0000-0000-0000-000000000002").unwrap(),
            timestamp: Utc.with_ymd_and_hms(2026, 2, 9, 12, 0, 0).unwrap(),
        }
    }

    #[tokio::test]
    async fn checkpoint_and_dedupe_gates_skip_already_processed_items() {
        let storage = KvClient::in_memory();
        let programs = saga_programs();
        let runtime = SagaRuntime::default();
        let metrics = Metrics::default();
        let subject = "tenant.t1.aggregate.Account.a1";

        // First delivery at sequence 10 writes one outbox item.
        let event1 =
            event_with_id(Uuid::parse_str("00000000-0000-0000-0000-000000000010").unwrap());
        process_aggregate_event(
            &Settings::default(),
            &storage,
            &programs,
            &runtime,
            &metrics,
            &event1,
            subject,
            10,
            1,
        )
        .await
        .unwrap();

        assert_eq!(storage.list_outbox_all(100).unwrap().len(), 1);

        // Redelivery of the same event id at a later sequence is deduped.
        process_aggregate_event(
            &Settings::default(),
            &storage,
            &programs,
            &runtime,
            &metrics,
            &event1,
            subject,
            11,
            1,
        )
        .await
        .unwrap();

        assert_eq!(storage.list_outbox_all(100).unwrap().len(), 1);

        // A new event id at an already-checkpointed sequence is skipped too.
        let event2 =
            event_with_id(Uuid::parse_str("00000000-0000-0000-0000-000000000011").unwrap());
        process_aggregate_event(
            &Settings::default(),
            &storage,
            &programs,
            &runtime,
            &metrics,
            &event2,
            subject,
            10,
            1,
        )
        .await
        .unwrap();

        assert_eq!(storage.list_outbox_all(100).unwrap().len(), 1);
    }

    #[tokio::test]
    async fn pipeline_does_not_advance_checkpoint_if_commit_fails() {
        let storage = KvClient::in_memory();
        let programs = saga_programs();
        let runtime = SagaRuntime::default();
        let metrics = Metrics::default();
        let subject = "tenant.t1.aggregate.Account.a1";

        // Force the next storage transaction to fail.
        storage.fail_next_txn();

        let event1 =
            event_with_id(Uuid::parse_str("00000000-0000-0000-0000-000000000010").unwrap());
        let res = process_aggregate_event(
            &Settings::default(),
            &storage,
            &programs,
            &runtime,
            &metrics,
            &event1,
            subject,
            10,
            1,
        )
        .await;
        assert!(res.is_err());

        // Neither checkpoint nor outbox may be touched on a failed commit —
        // the commit is all-or-nothing.
        let saga_name = SagaName::new("billing");
        let checkpoint_key = CheckpointKey::new(&TenantId::new("t1"), &saga_name);
        assert!(storage.get_checkpoint(&checkpoint_key).unwrap().is_none());
        assert_eq!(storage.list_outbox_all(100).unwrap().len(), 0);
    }

    #[test]
    fn quarantine_record_is_written_on_poison_handling_path() {
        let storage = KvClient::in_memory();
        let settings = Settings {
            max_deliver: 1,
            ..Default::default()
        };
        let metrics = Metrics::default();

        // Delivery count at the limit must terminate and dead-letter.
        let decision = handle_poison(
            &storage,
            &settings,
            &metrics,
            settings.max_deliver as u64,
            TenantId::new("t1"),
            "poison",
            &serde_json::json!({"x": 1}),
        );
        assert_eq!(decision, AckDecision::Term);

        let deadletters = storage.list_deadletters(10).unwrap();
        assert_eq!(deadletters.len(), 1);
        assert_eq!(deadletters[0].1["reason"], "poison");
        assert_eq!(deadletters[0].1["tenant_id"], "t1");
    }
}
|
||||
3
runner/src/schedule/mod.rs
Normal file
3
runner/src/schedule/mod.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
mod scheduler;
|
||||
|
||||
pub use scheduler::Scheduler;
|
||||
349
runner/src/schedule/scheduler.rs
Normal file
349
runner/src/schedule/scheduler.rs
Normal file
@@ -0,0 +1,349 @@
|
||||
use crate::config::Settings;
|
||||
use crate::observability::Metrics;
|
||||
use crate::saga::{SagaPrograms, SagaRuntime};
|
||||
use crate::storage::KvClient;
|
||||
use crate::tenant_placement::TenantGate;
|
||||
use crate::types::{
|
||||
CorrelationId, RunnerError, SagaName, SagaStateKey, ScheduleKey, TenantId, WorkId, WorkItem,
|
||||
};
|
||||
use chrono::{TimeZone, Utc};
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Stateless driver that periodically fires due saga schedule entries.
#[derive(Clone)]
pub struct Scheduler;
|
||||
|
||||
impl std::fmt::Debug for Scheduler {
    // NOTE(review): on a unit struct this is equivalent to a derived Debug
    // (modulo the trailing ".."); kept for stylistic consistency with the
    // other runner types.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Scheduler").finish_non_exhaustive()
    }
}
|
||||
|
||||
impl Scheduler {
    /// Creates a scheduler; it holds no state of its own.
    pub fn new() -> Self {
        Self
    }

    /// Scan loop: every `schedule_scan_interval_ms`, loads due schedule
    /// entries and re-invokes the owning saga with a synthetic
    /// `{"type": "schedule", ...}` event, committing new state, outbox items
    /// and follow-up schedules atomically while removing the fired entry.
    ///
    /// # Errors
    /// NOTE(review): any single failure inside the batch — an unparsable
    /// schedule key, an unknown saga, a runtime or commit error — propagates
    /// via `?` and terminates the whole scheduler loop, not just the one
    /// entry. Confirm this fail-fast behavior is intended.
    #[allow(clippy::too_many_arguments)]
    pub async fn run(
        &self,
        settings: Settings,
        storage: KvClient,
        programs: Arc<SagaPrograms>,
        runtime: SagaRuntime,
        metrics: Arc<Metrics>,
        tenant_gate: Arc<TenantGate>,
        shutdown: Arc<tokio::sync::Notify>,
        draining: Arc<AtomicBool>,
    ) -> Result<(), RunnerError> {
        let mut tick = tokio::time::interval(Duration::from_millis(
            settings.schedule_scan_interval_ms.max(1),
        ));
        // Skip missed ticks instead of bursting to catch up.
        tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);

        loop {
            tokio::select! {
                _ = shutdown.notified() => break,
                _ = tick.tick() => {}
            }

            // While draining, keep ticking but do not pick up new work.
            if draining.load(Ordering::Relaxed) {
                continue;
            }

            let now_ms = Utc::now().timestamp_millis().max(0) as u64;
            let due = storage.scan_due_schedule_items_all(now_ms, settings.schedule_batch_size)?;

            for (key, payload) in due {
                let (tenant_id, saga_name, correlation_id, due_at_ms) = parse_schedule_key(&key)
                    .ok_or_else(|| RunnerError::DecodeError("Invalid schedule key".to_string()))?;

                // Skip tenants the gate refuses; their entries stay due and
                // are retried on a later scan.
                if !tenant_gate.should_acquire_processing_work(
                    tenant_id.as_str(),
                    draining.load(Ordering::Relaxed),
                ) {
                    continue;
                }
                let _work = tenant_gate.begin_work(tenant_id.as_str());

                let saga_def = programs
                    .manifest()
                    .sagas
                    .iter()
                    .find(|s| s.name == saga_name.as_str())
                    .ok_or_else(|| RunnerError::RuntimeError("Unknown saga".to_string()))?;

                let program = programs
                    .program_for(&saga_def.name)
                    .ok_or_else(|| RunnerError::RuntimeError("Saga program missing".to_string()))?;

                let saga_key = SagaStateKey::new(&tenant_id, &saga_name, &correlation_id);
                let current_state = storage
                    .get_saga_state(&saga_key)?
                    .unwrap_or_else(|| serde_json::json!({}));

                // Synthetic event delivered to the saga for a fired timer.
                let event = serde_json::json!({
                    "type": "schedule",
                    "due_at_ms": due_at_ms,
                    "payload": payload
                });

                // Deterministic context: "now" is the schedule's due time
                // and the causation id encodes the full schedule identity.
                let ctx = runtime_function::Context::new(
                    Utc.timestamp_millis_opt(due_at_ms as i64)
                        .single()
                        .unwrap_or_else(|| Utc.timestamp_millis_opt(0).single().unwrap()),
                    format!(
                        "schedule:{}:{}:{}:{}",
                        tenant_id.as_str(),
                        saga_name.as_str(),
                        correlation_id.as_str(),
                        due_at_ms
                    ),
                )
                .with_tenant_id(tenant_id.as_str())
                .with_correlation_id(correlation_id.as_str());

                let output = runtime
                    .execute(program, &current_state, &event, ctx)
                    .await?;

                // Same outbox keying scheme as the saga worker: tenant, kind,
                // fresh v7 work id.
                let outbox_items = output
                    .work_items
                    .into_iter()
                    .map(|item| {
                        let kind = match &item {
                            WorkItem::AggregateCommand(_) => "aggregate",
                            WorkItem::EffectCommand(_) => "effect",
                        };
                        let work_id = WorkId::new_v7();
                        let key = format!(
                            "outbox:{}:{}:{}",
                            tenant_id.as_str(),
                            kind,
                            work_id.as_uuid()
                        );
                        (key, item)
                    })
                    .collect::<Vec<_>>();

                let new_schedule_items = output
                    .schedules
                    .into_iter()
                    .map(|s| {
                        let key =
                            ScheduleKey::new(&tenant_id, &saga_name, &correlation_id, s.due_at_ms);
                        (key, s.payload)
                    })
                    .collect::<Vec<_>>();

                // Key of the entry that just fired, removed in the commit.
                let schedule_key =
                    ScheduleKey::new(&tenant_id, &saga_name, &correlation_id, due_at_ms);

                storage
                    .commit_schedule_processing(
                        &saga_key,
                        &output.new_saga_state,
                        outbox_items,
                        &schedule_key,
                        new_schedule_items,
                    )
                    .inspect_err(|_| metrics.inc_schedule_failed())?;
                metrics.inc_schedule_processed();
            }
        }

        Ok(())
    }
}
|
||||
|
||||
impl Default for Scheduler {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_schedule_key(key: &str) -> Option<(TenantId, SagaName, CorrelationId, u64)> {
|
||||
let mut it = key.split(':');
|
||||
if it.next()? != "schedule" {
|
||||
return None;
|
||||
}
|
||||
let tenant = TenantId::new(it.next()?.to_string());
|
||||
let saga = SagaName::new(it.next()?.to_string());
|
||||
let corr = CorrelationId::new(it.next()?.to_string());
|
||||
let due = it.next()?.parse::<u64>().ok()?;
|
||||
Some((tenant, saga, corr, due))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Minimal flow-graph program: a Const node wired straight to Output,
    /// always emitting one effect-command work item and no schedules.
    fn program_json_with_one_outbox_item() -> &'static str {
        r#"
{
    "specVersion": "1.1",
    "id": "schedule_test",
    "name": "schedule_test",
    "inputs": [
        { "name": "saga_state", "type": "Any" },
        { "name": "event", "type": "Any" }
    ],
    "nodes": [
        {
            "id": "const",
            "type": "Const",
            "data": {
                "value": {
                    "new_saga_state": { "x": 1 },
                    "work_items": [
                        {
                            "kind": "effect_command",
                            "tenant_id": "t1",
                            "command_id": "c1",
                            "effect_name": "noop",
                            "payload": { "a": 1 },
                            "metadata": {}
                        }
                    ],
                    "schedules": []
                }
            }
        },
        { "id": "output", "type": "Output", "data": {} }
    ],
    "edges": [
        { "id": "e1", "source": "const", "sourceHandle": "out", "target": "output", "targetHandle": "value" }
    ],
    "outputNodeId": "output"
}
"#
    }

    /// Crash-recovery property: when the commit txn fails, the schedule item
    /// survives and no outbox entry appears; a restarted scheduler re-scans
    /// and delivers it exactly once.
    #[tokio::test]
    async fn restart_rescans_and_delivers_due_schedule_exactly_once() {
        let storage = KvClient::in_memory();
        let runtime = SagaRuntime::default();
        let metrics = Arc::new(Metrics::default());

        // Write the program + a one-saga manifest to a temp dir so
        // SagaPrograms::load can pick them up.
        let tmp = tempfile::tempdir().unwrap();
        let program_path = tmp.path().join("saga_on_event.json");
        let manifest_path = tmp.path().join("sagas.yaml");
        std::fs::write(&program_path, program_json_with_one_outbox_item()).unwrap();
        std::fs::write(
            &manifest_path,
            format!(
                "sagas:\n  - name: noop\n    trigger_subjects: [\"tenant.*.aggregate.*.*\"]\n    on_event: \"{}\"\n",
                program_path.to_string_lossy()
            ),
        )
        .unwrap();

        // Tight 1ms scan interval so the test converges quickly.
        let settings = Settings {
            saga_manifest_path: manifest_path.to_string_lossy().to_string(),
            schedule_scan_interval_ms: 1,
            schedule_batch_size: 10,
            ..Default::default()
        };
        let programs = Arc::new(SagaPrograms::load(&settings).unwrap());

        // Seed one schedule item that is already due.
        let tenant = TenantId::new("t1");
        let saga = SagaName::new("noop");
        let corr = CorrelationId::new("c1");
        let due_at_ms = Utc::now().timestamp_millis().max(0) as u64;
        let schedule_key = ScheduleKey::new(&tenant, &saga, &corr, due_at_ms);
        storage
            .put_schedule_item(&schedule_key, &json!({"x": 1}))
            .unwrap();

        // Failpoint: the first commit attempt will error, simulating a crash
        // mid-processing.
        storage.fail_next_txn();

        let shutdown = Arc::new(tokio::sync::Notify::new());
        let draining = Arc::new(AtomicBool::new(false));
        let task = tokio::spawn({
            let settings = settings.clone();
            let storage = storage.clone();
            let programs = programs.clone();
            let runtime = runtime.clone();
            let metrics = metrics.clone();
            let tenant_gate = Arc::new(TenantGate::new(None));
            let shutdown = shutdown.clone();
            async move {
                let scheduler = Scheduler;
                let _ = scheduler
                    .run(
                        settings,
                        storage,
                        programs,
                        runtime,
                        metrics,
                        tenant_gate,
                        shutdown,
                        draining,
                    )
                    .await;
            }
        });

        // Give the first run a chance to hit the failpoint, then stop it.
        tokio::time::sleep(Duration::from_millis(50)).await;
        shutdown.notify_waiters();
        let _ = tokio::time::timeout(Duration::from_secs(1), task).await;

        // Failed txn must not have partially applied: no outbox entries, and
        // the schedule item is still present for re-scan.
        assert_eq!(storage.list_outbox_all(100).unwrap().len(), 0);
        assert_eq!(
            storage
                .scan_due_schedule_items_all(due_at_ms + 1, 10)
                .unwrap()
                .len(),
            1
        );

        // "Restart": a second scheduler run over the same storage.
        let shutdown = Arc::new(tokio::sync::Notify::new());
        let draining = Arc::new(AtomicBool::new(false));
        let task = tokio::spawn({
            let storage = storage.clone();
            let shutdown = shutdown.clone();
            async move {
                let scheduler = Scheduler;
                let tenant_gate = Arc::new(TenantGate::new(None));
                let _ = scheduler
                    .run(
                        settings,
                        storage,
                        programs,
                        runtime,
                        metrics,
                        tenant_gate,
                        shutdown,
                        draining,
                    )
                    .await;
            }
        });

        // Poll until exactly one outbox item appears (bounded wait).
        let start = tokio::time::Instant::now();
        loop {
            let outbox = storage.list_outbox_all(100).unwrap().len();
            if outbox == 1 {
                break;
            }
            if start.elapsed() > Duration::from_secs(2) {
                panic!("timed out waiting for outbox");
            }
            tokio::time::sleep(Duration::from_millis(10)).await;
        }

        shutdown.notify_waiters();
        let _ = tokio::time::timeout(Duration::from_secs(1), task).await;

        // Exactly-once: one outbox item total, and the schedule item deleted.
        assert_eq!(storage.list_outbox_all(100).unwrap().len(), 1);
        assert_eq!(
            storage
                .scan_due_schedule_items_all(due_at_ms + 1, 10)
                .unwrap()
                .len(),
            0
        );
    }
}
|
||||
793
runner/src/storage/kv.rs
Normal file
793
runner/src/storage/kv.rs
Normal file
@@ -0,0 +1,793 @@
|
||||
use crate::types::{
|
||||
CheckpointKey, DedupeEffectKey, DedupeEventKey, RunnerError, SagaStateKey, ScheduleKey,
|
||||
TenantId, WorkId, WorkItem,
|
||||
};
|
||||
use edge_storage::{Config as EdgeConfig, EdgeStorage, KvStore, TableNames, Writer};
|
||||
use libmdbx::{NoWriteMap, WriteFlags, RW};
|
||||
use serde_json::Value;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Handle to the runner's embedded KV store (libmdbx via `edge_storage`).
///
/// Cheap to clone: all state is behind `Arc`s, so clones share the same
/// database environment.
#[derive(Clone)]
pub struct KvClient {
    // Owns the MDBX environment; kept alive for the life of all clones.
    storage: Arc<EdgeStorage>,
    // Read-path helper over the shared environment.
    kv: KvStore,
    // Test-only failpoint: when set, the next write txn returns an error
    // before touching the database (see `commit_kv_txn`).
    #[cfg(test)]
    fail_next_txn: Arc<std::sync::atomic::AtomicBool>,
}
|
||||
|
||||
/// Manual `Debug`: the inner storage handles are not `Debug`, so print only
/// the type name (renders as `KvClient { .. }`).
impl std::fmt::Debug for KvClient {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("KvClient").finish_non_exhaustive()
    }
}
|
||||
|
||||
impl KvClient {
    /// Opens (or creates) the MDBX environment at `storage_path`.
    pub fn open(storage_path: impl Into<String>) -> Result<Self, StorageInitError> {
        let config = EdgeConfig::new(storage_path.into());
        let storage = EdgeStorage::open(config.clone())?;
        let writer = Arc::new(Writer::new(storage.db().clone(), &config));
        let kv = KvStore::new(storage.db().clone(), writer);

        Ok(Self {
            storage: Arc::new(storage),
            kv,
            #[cfg(test)]
            fail_next_txn: Arc::new(std::sync::atomic::AtomicBool::new(false)),
        })
    }

    /// Test helper: opens a store backed by a fresh temp directory.
    #[cfg(test)]
    pub fn in_memory() -> Self {
        use tempfile::tempdir;
        let dir = tempdir().expect("failed to create temp dir");
        let path = dir.path().join("test.mdbx");
        // Deliberately leak the TempDir guard so the directory is not deleted
        // while the database is open (test-only; cleaned by the OS).
        std::mem::forget(dir);
        Self::open(path.to_string_lossy().to_string()).expect("failed to open storage")
    }

    /// Test failpoint: make the next write transaction fail before it starts.
    #[cfg(test)]
    pub fn fail_next_txn(&self) {
        self.fail_next_txn
            .store(true, std::sync::atomic::Ordering::SeqCst);
    }

    /// Reads the persisted saga state, `None` if never written.
    pub fn get_saga_state(&self, key: &SagaStateKey) -> Result<Option<Value>, RunnerError> {
        self.get_json(key.as_str().as_bytes())
    }

    /// Writes (upserts) the saga state.
    pub fn put_saga_state(&self, key: &SagaStateKey, value: &Value) -> Result<(), RunnerError> {
        self.put_json(key.as_str().as_bytes(), value)
    }

    /// Stores an arbitrary JSON value under the `config:` namespace.
    pub fn put_config_value(&self, key: &str, value: &Value) -> Result<(), RunnerError> {
        let storage_key = format!("config:{}", key);
        self.put_json(storage_key.as_bytes(), value)
    }

    /// Reads a value from the `config:` namespace.
    pub fn get_config_value(&self, key: &str) -> Result<Option<Value>, RunnerError> {
        let storage_key = format!("config:{}", key);
        self.get_json(storage_key.as_bytes())
    }

    /// Deletes a value from the `config:` namespace (no-op if absent).
    pub fn delete_config_value(&self, key: &str) -> Result<(), RunnerError> {
        let storage_key = format!("config:{}", key);
        self.delete_key(storage_key.as_bytes())
    }

    /// Persists a runtime override of the effects manifest
    /// (stored as `config:effects_manifest`).
    pub fn put_effects_manifest_override(
        &self,
        manifest: &crate::effects::EffectsManifest,
    ) -> Result<(), RunnerError> {
        let value =
            serde_json::to_value(manifest).map_err(|e| RunnerError::StorageError(e.to_string()))?;
        self.put_config_value("effects_manifest", &value)
    }

    /// Loads the effects-manifest override, `None` if no override is set.
    pub fn get_effects_manifest_override(
        &self,
    ) -> Result<Option<crate::effects::EffectsManifest>, RunnerError> {
        let value = self.get_config_value("effects_manifest")?;
        match value {
            Some(v) => serde_json::from_value(v)
                .map(Some)
                .map_err(|e| RunnerError::DecodeError(e.to_string())),
            None => Ok(None),
        }
    }

    /// Removes the effects-manifest override, reverting to the file manifest.
    pub fn clear_effects_manifest_override(&self) -> Result<(), RunnerError> {
        self.delete_config_value("effects_manifest")
    }

    /// Reads a stream checkpoint (JSON-encoded `u64` sequence number).
    pub fn get_checkpoint(&self, key: &CheckpointKey) -> Result<Option<u64>, RunnerError> {
        let bytes = self
            .kv
            .get(key.as_str().as_bytes())
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;
        let Some(bytes) = bytes else {
            return Ok(None);
        };
        serde_json::from_slice::<u64>(&bytes)
            .map(Some)
            .map_err(|e| RunnerError::DecodeError(format!("Failed to decode checkpoint: {}", e)))
    }

    /// Writes a stream checkpoint.
    pub fn put_checkpoint(&self, key: &CheckpointKey, value: u64) -> Result<(), RunnerError> {
        let bytes =
            serde_json::to_vec(&value).map_err(|e| RunnerError::StorageError(e.to_string()))?;
        self.commit_kv_txn(|txn, table| {
            txn.put(
                table,
                key.as_str().as_bytes(),
                bytes.as_slice(),
                WriteFlags::empty(),
            )?;
            Ok(())
        })
    }

    /// Inserts one outbox item and returns the storage key it was filed
    /// under (`outbox:<tenant>:<kind>:<work_id>`).
    /// NOTE(review): keys here render `work_id` via `Display`, while the
    /// scheduler formats `work_id.as_uuid()` — presumably identical output;
    /// confirm `WorkId: Display` matches its UUID form.
    pub fn put_outbox_item(
        &self,
        tenant_id: &TenantId,
        work_kind: &str,
        work_id: &WorkId,
        item: &WorkItem,
    ) -> Result<String, RunnerError> {
        let key = format!("outbox:{}:{}:{}", tenant_id.as_str(), work_kind, work_id);
        let bytes =
            serde_json::to_vec(item).map_err(|e| RunnerError::StorageError(e.to_string()))?;
        self.commit_kv_txn(|txn, table| {
            txn.put(table, key.as_bytes(), bytes.as_slice(), WriteFlags::empty())?;
            Ok(())
        })?;
        Ok(key)
    }

    /// Lists up to `max_items` outbox items for a single tenant, in key order.
    pub fn list_outbox_prefix(
        &self,
        tenant_id: &TenantId,
        max_items: usize,
    ) -> Result<Vec<(String, WorkItem)>, RunnerError> {
        let prefix = format!("outbox:{}:", tenant_id.as_str());
        let txn = self
            .storage
            .db()
            .begin_ro_txn()
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;

        let mut items = Vec::new();
        for res in self
            .kv
            .prefix_scan(&txn, prefix.as_bytes())
            .map_err(|e| RunnerError::StorageError(e.to_string()))?
        {
            let (k, v) = res.map_err(|e| RunnerError::StorageError(e.to_string()))?;
            let key = String::from_utf8_lossy(&k).to_string();
            let item: WorkItem =
                serde_json::from_slice(&v).map_err(|e| RunnerError::DecodeError(e.to_string()))?;
            items.push((key, item));
            if items.len() >= max_items {
                break;
            }
        }

        Ok(items)
    }

    /// Lists up to `max_items` outbox items across all tenants.
    pub fn list_outbox_all(
        &self,
        max_items: usize,
    ) -> Result<Vec<(String, WorkItem)>, RunnerError> {
        let txn = self
            .storage
            .db()
            .begin_ro_txn()
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;

        let mut items = Vec::new();
        for res in self
            .kv
            .prefix_scan(&txn, b"outbox:")
            .map_err(|e| RunnerError::StorageError(e.to_string()))?
        {
            let (k, v) = res.map_err(|e| RunnerError::StorageError(e.to_string()))?;
            let key = String::from_utf8_lossy(&k).to_string();
            let item: WorkItem =
                serde_json::from_slice(&v).map_err(|e| RunnerError::DecodeError(e.to_string()))?;
            items.push((key, item));
            if items.len() >= max_items {
                break;
            }
        }

        Ok(items)
    }

    /// Deletes an outbox item by its full storage key (no-op if absent).
    pub fn delete_outbox_item(&self, key: &str) -> Result<(), RunnerError> {
        self.delete_key(key.as_bytes())
    }

    /// Writes (upserts) a schedule item payload under its `ScheduleKey`.
    pub fn put_schedule_item(&self, key: &ScheduleKey, payload: &Value) -> Result<(), RunnerError> {
        self.put_json(key.as_str().as_bytes(), payload)
    }

    /// Scans one tenant's schedule items and returns those whose trailing
    /// `due_at_ms` key segment is <= `now_ms`, up to `max_items`.
    /// Keys with an unparseable due segment are treated as never-due.
    pub fn scan_due_schedule_items(
        &self,
        tenant_id: &TenantId,
        now_ms: u64,
        max_items: usize,
    ) -> Result<Vec<(String, Value)>, RunnerError> {
        let prefix = ScheduleKey::prefix_for_tenant(tenant_id);
        let txn = self
            .storage
            .db()
            .begin_ro_txn()
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;

        let mut due = Vec::new();
        for res in self
            .kv
            .prefix_scan(&txn, prefix.as_bytes())
            .map_err(|e| RunnerError::StorageError(e.to_string()))?
        {
            let (k, v) = res.map_err(|e| RunnerError::StorageError(e.to_string()))?;
            let key = String::from_utf8_lossy(&k).to_string();
            // The due time is the last `:`-separated key segment.
            let due_at = key
                .rsplit(':')
                .next()
                .and_then(|s| s.parse::<u64>().ok())
                .unwrap_or(u64::MAX);
            if due_at > now_ms {
                continue;
            }
            let payload: Value =
                serde_json::from_slice(&v).map_err(|e| RunnerError::DecodeError(e.to_string()))?;
            due.push((key, payload));
            if due.len() >= max_items {
                break;
            }
        }

        Ok(due)
    }

    /// Like `scan_due_schedule_items`, but across all tenants (full
    /// `schedule:` prefix).
    /// NOTE(review): body is duplicated with the tenant-scoped scan except
    /// for the prefix — candidate for a shared private helper.
    pub fn scan_due_schedule_items_all(
        &self,
        now_ms: u64,
        max_items: usize,
    ) -> Result<Vec<(String, Value)>, RunnerError> {
        let txn = self
            .storage
            .db()
            .begin_ro_txn()
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;

        let mut due = Vec::new();
        for res in self
            .kv
            .prefix_scan(&txn, b"schedule:")
            .map_err(|e| RunnerError::StorageError(e.to_string()))?
        {
            let (k, v) = res.map_err(|e| RunnerError::StorageError(e.to_string()))?;
            let key = String::from_utf8_lossy(&k).to_string();
            let due_at = key
                .rsplit(':')
                .next()
                .and_then(|s| s.parse::<u64>().ok())
                .unwrap_or(u64::MAX);
            if due_at > now_ms {
                continue;
            }
            let payload: Value =
                serde_json::from_slice(&v).map_err(|e| RunnerError::DecodeError(e.to_string()))?;
            due.push((key, payload));
            if due.len() >= max_items {
                break;
            }
        }

        Ok(due)
    }

    /// Deletes a schedule item (no-op if absent).
    pub fn delete_schedule_item(&self, key: &ScheduleKey) -> Result<(), RunnerError> {
        self.delete_key(key.as_str().as_bytes())
    }

    /// Records a dead-letter entry under the caller-supplied key.
    pub fn put_deadletter(&self, key: &str, record: &Value) -> Result<(), RunnerError> {
        self.put_json(key.as_bytes(), record)
    }

    /// Lists up to `max_items` dead-letter records (keys under `deadletter:`).
    pub fn list_deadletters(&self, max_items: usize) -> Result<Vec<(String, Value)>, RunnerError> {
        let txn = self
            .storage
            .db()
            .begin_ro_txn()
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;

        let mut items = Vec::new();
        for res in self
            .kv
            .prefix_scan(&txn, b"deadletter:")
            .map_err(|e| RunnerError::StorageError(e.to_string()))?
        {
            let (k, v) = res.map_err(|e| RunnerError::StorageError(e.to_string()))?;
            let key = String::from_utf8_lossy(&k).to_string();
            let record: Value =
                serde_json::from_slice(&v).map_err(|e| RunnerError::DecodeError(e.to_string()))?;
            items.push((key, record));
            if items.len() >= max_items {
                break;
            }
        }

        Ok(items)
    }

    /// Deletes up to `max_items` keys under `prefix` in a single write txn;
    /// returns the number of keys deleted. Callers must loop for prefixes
    /// holding more than `max_items` keys.
    pub fn delete_prefix(&self, prefix: &str, max_items: usize) -> Result<usize, RunnerError> {
        let keys = self.list_keys_with_prefix(prefix.as_bytes(), max_items)?;
        if keys.is_empty() {
            return Ok(0);
        }

        self.commit_kv_txn(|txn, table| {
            for key in &keys {
                // `del` result ignored: a concurrently removed key is fine.
                let _ = txn.del(table, key.as_bytes(), None)?;
            }
            Ok(())
        })?;

        Ok(keys.len())
    }

    /// Collects up to `max_items` keys under `prefix` (read-only txn).
    fn list_keys_with_prefix(
        &self,
        prefix: &[u8],
        max_items: usize,
    ) -> Result<Vec<String>, RunnerError> {
        let txn = self
            .storage
            .db()
            .begin_ro_txn()
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;

        let mut keys = Vec::new();
        for res in self
            .kv
            .prefix_scan(&txn, prefix)
            .map_err(|e| RunnerError::StorageError(e.to_string()))?
        {
            let (k, _) = res.map_err(|e| RunnerError::StorageError(e.to_string()))?;
            keys.push(String::from_utf8_lossy(&k).to_string());
            if keys.len() >= max_items {
                break;
            }
        }
        Ok(keys)
    }

    /// Health probe: proves the store is writable by putting and deleting a
    /// sentinel key in one transaction.
    pub fn writable_probe(&self) -> Result<(), RunnerError> {
        let key = b"__runner_health_probe";
        self.commit_kv_txn(|txn, table| {
            txn.put(table, key, b"1", WriteFlags::empty())?;
            let _ = txn.del(table, key, None)?;
            Ok(())
        })
    }

    /// Returns true if an event-dedupe marker exists for `key`.
    pub fn is_deduped_event(&self, key: &DedupeEventKey) -> Result<bool, RunnerError> {
        let bytes = self
            .kv
            .get(key.as_str().as_bytes())
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;
        Ok(bytes.is_some())
    }

    /// Writes an event-dedupe marker (value is an opaque "1").
    pub fn mark_deduped_event(&self, key: &DedupeEventKey) -> Result<(), RunnerError> {
        self.commit_kv_txn(|txn, table| {
            txn.put(table, key.as_str().as_bytes(), b"1", WriteFlags::empty())?;
            Ok(())
        })
    }

    /// Returns true if an effect-dedupe marker exists for `key`.
    pub fn is_deduped_effect(&self, key: &DedupeEffectKey) -> Result<bool, RunnerError> {
        let bytes = self
            .kv
            .get(key.as_str().as_bytes())
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;
        Ok(bytes.is_some())
    }

    /// Writes an effect-dedupe marker.
    pub fn mark_deduped_effect(&self, key: &DedupeEffectKey) -> Result<(), RunnerError> {
        self.commit_kv_txn(|txn, table| {
            txn.put(table, key.as_str().as_bytes(), b"1", WriteFlags::empty())?;
            Ok(())
        })
    }

    /// Atomically commits one saga-event processing step: new saga state,
    /// outbox items, new schedules, the stream checkpoint, and (optionally)
    /// an event-dedupe marker — all in a single MDBX write transaction so a
    /// crash leaves either everything or nothing applied.
    #[allow(clippy::too_many_arguments)]
    pub fn commit_saga_processing(
        &self,
        saga_key: &SagaStateKey,
        new_saga_state: &Value,
        outbox_items: Vec<(String, WorkItem)>,
        schedule_items: Vec<(ScheduleKey, Value)>,
        checkpoint_key: &CheckpointKey,
        checkpoint_sequence: u64,
        dedupe_event: Option<&DedupeEventKey>,
    ) -> Result<(), RunnerError> {
        // Serialize everything up front so the write closure is infallible
        // apart from MDBX errors.
        let saga_bytes = serde_json::to_vec(new_saga_state)
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;
        let checkpoint_bytes = serde_json::to_vec(&checkpoint_sequence)
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;
        let outbox_bytes = outbox_items
            .into_iter()
            .map(|(k, item)| {
                serde_json::to_vec(&item)
                    .map(|bytes| (k, bytes))
                    .map_err(|e| RunnerError::StorageError(e.to_string()))
            })
            .collect::<Result<Vec<_>, _>>()?;

        let schedule_bytes = schedule_items
            .into_iter()
            .map(|(k, payload)| {
                serde_json::to_vec(&payload)
                    .map(|bytes| (k, bytes))
                    .map_err(|e| RunnerError::StorageError(e.to_string()))
            })
            .collect::<Result<Vec<_>, _>>()?;

        self.commit_kv_txn(|txn, table| {
            txn.put(
                table,
                saga_key.as_str().as_bytes(),
                saga_bytes.as_slice(),
                WriteFlags::empty(),
            )?;

            for (key, bytes) in outbox_bytes {
                txn.put(table, key.as_bytes(), bytes.as_slice(), WriteFlags::empty())?;
            }

            for (key, bytes) in schedule_bytes {
                txn.put(
                    table,
                    key.as_str().as_bytes(),
                    bytes.as_slice(),
                    WriteFlags::empty(),
                )?;
            }

            txn.put(
                table,
                checkpoint_key.as_str().as_bytes(),
                checkpoint_bytes.as_slice(),
                WriteFlags::empty(),
            )?;

            if let Some(key) = dedupe_event {
                txn.put(table, key.as_str().as_bytes(), b"1", WriteFlags::empty())?;
            }

            Ok(())
        })
    }

    /// Atomically commits one schedule firing: new saga state, outbox items,
    /// follow-up schedules, and deletion of the fired schedule item — the
    /// invariant the scheduler's exactly-once behavior relies on.
    pub fn commit_schedule_processing(
        &self,
        saga_key: &SagaStateKey,
        new_saga_state: &Value,
        outbox_items: Vec<(String, WorkItem)>,
        schedule_key: &ScheduleKey,
        new_schedule_items: Vec<(ScheduleKey, Value)>,
    ) -> Result<(), RunnerError> {
        let saga_bytes = serde_json::to_vec(new_saga_state)
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;
        let outbox_bytes = outbox_items
            .into_iter()
            .map(|(k, item)| {
                serde_json::to_vec(&item)
                    .map(|bytes| (k, bytes))
                    .map_err(|e| RunnerError::StorageError(e.to_string()))
            })
            .collect::<Result<Vec<_>, _>>()?;

        let schedule_bytes = new_schedule_items
            .into_iter()
            .map(|(k, payload)| {
                serde_json::to_vec(&payload)
                    .map(|bytes| (k, bytes))
                    .map_err(|e| RunnerError::StorageError(e.to_string()))
            })
            .collect::<Result<Vec<_>, _>>()?;

        self.commit_kv_txn(|txn, table| {
            txn.put(
                table,
                saga_key.as_str().as_bytes(),
                saga_bytes.as_slice(),
                WriteFlags::empty(),
            )?;

            for (key, bytes) in outbox_bytes {
                txn.put(table, key.as_bytes(), bytes.as_slice(), WriteFlags::empty())?;
            }

            for (key, bytes) in schedule_bytes {
                txn.put(
                    table,
                    key.as_str().as_bytes(),
                    bytes.as_slice(),
                    WriteFlags::empty(),
                )?;
            }

            // Remove the fired item so it cannot be delivered again.
            let _ = txn.del(table, schedule_key.as_str().as_bytes(), None)?;
            Ok(())
        })
    }

    /// Reads and JSON-decodes a single value, `None` if the key is absent.
    fn get_json(&self, key: &[u8]) -> Result<Option<Value>, RunnerError> {
        let bytes = self
            .kv
            .get(key)
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;

        match bytes {
            Some(bytes) => serde_json::from_slice(&bytes)
                .map(Some)
                .map_err(|e| RunnerError::DecodeError(e.to_string())),
            None => Ok(None),
        }
    }

    /// JSON-encodes and upserts a single value.
    fn put_json(&self, key: &[u8], value: &Value) -> Result<(), RunnerError> {
        let bytes =
            serde_json::to_vec(value).map_err(|e| RunnerError::StorageError(e.to_string()))?;
        self.commit_kv_txn(|txn, table| {
            txn.put(table, key, bytes.as_slice(), WriteFlags::empty())?;
            Ok(())
        })
    }

    /// Deletes a single key; absence is not an error.
    fn delete_key(&self, key: &[u8]) -> Result<(), RunnerError> {
        self.commit_kv_txn(|txn, table| {
            let _ = txn.del(table, key, None)?;
            Ok(())
        })
    }

    /// Runs `f` inside a single read-write MDBX transaction on the KV table
    /// and commits it. Any error from `f` aborts the transaction (dropped
    /// uncommitted), so callers get all-or-nothing semantics.
    ///
    /// In test builds, the `fail_next_txn` failpoint (consumed via `swap`)
    /// makes exactly one call fail before the transaction is even opened.
    fn commit_kv_txn<F>(&self, f: F) -> Result<(), RunnerError>
    where
        F: FnOnce(
            &libmdbx::Transaction<'_, RW, NoWriteMap>,
            &libmdbx::Table<'_>,
        ) -> Result<(), libmdbx::Error>,
    {
        #[cfg(test)]
        {
            if self
                .fail_next_txn
                .swap(false, std::sync::atomic::Ordering::SeqCst)
            {
                return Err(RunnerError::StorageError("failpoint".to_string()));
            }
        }

        let txn = self
            .storage
            .db()
            .begin_rw_txn()
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;
        let table = txn
            .open_table(TableNames::KV_STORE)
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;
        if let Err(e) = f(&txn, &table) {
            return Err(RunnerError::StorageError(e.to_string()));
        }
        txn.commit()
            .map_err(|e| RunnerError::StorageError(e.to_string()))?;
        Ok(())
    }
}
|
||||
|
||||
/// Errors that can occur while opening the KV storage backend.
#[derive(Debug, thiserror::Error)]
pub enum StorageInitError {
    /// The underlying `edge_storage` environment failed to open.
    #[error("Failed to open storage: {0}")]
    OpenError(#[from] edge_storage::Error),
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::types::{CorrelationId, EventId, SagaName};
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
|
||||
fn saga_state_roundtrip_put_get() {
|
||||
let client = KvClient::in_memory();
|
||||
let tenant = TenantId::new("t1");
|
||||
let saga = SagaName::new("billing");
|
||||
let corr = CorrelationId::new("c1");
|
||||
let key = SagaStateKey::new(&tenant, &saga, &corr);
|
||||
|
||||
client.put_saga_state(&key, &json!({"a": 1})).unwrap();
|
||||
let loaded = client.get_saga_state(&key).unwrap().unwrap();
|
||||
assert_eq!(loaded["a"], 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn checkpoint_roundtrip_put_get() {
|
||||
let client = KvClient::in_memory();
|
||||
let tenant = TenantId::new("t1");
|
||||
let saga = SagaName::new("billing");
|
||||
let key = CheckpointKey::new(&tenant, &saga);
|
||||
|
||||
client.put_checkpoint(&key, 42).unwrap();
|
||||
let loaded = client.get_checkpoint(&key).unwrap().unwrap();
|
||||
assert_eq!(loaded, 42);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn atomicity_no_partial_commit_on_failure() {
|
||||
let client = KvClient::in_memory();
|
||||
let tenant = TenantId::new("t1");
|
||||
let saga = SagaName::new("billing");
|
||||
let corr = CorrelationId::new("c1");
|
||||
let saga_key = SagaStateKey::new(&tenant, &saga, &corr);
|
||||
let checkpoint_key = CheckpointKey::new(&tenant, &saga);
|
||||
|
||||
let preexisting = "preexisting".as_bytes().to_vec();
|
||||
client
|
||||
.commit_kv_txn(|txn, table| {
|
||||
txn.put(table, preexisting.as_slice(), b"1", WriteFlags::empty())?;
|
||||
Ok(())
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let txn_result = client.commit_kv_txn(|txn, table| {
|
||||
let saga_bytes = serde_json::to_vec(&json!({"x": 1})).unwrap();
|
||||
txn.put(
|
||||
table,
|
||||
saga_key.as_str().as_bytes(),
|
||||
saga_bytes.as_slice(),
|
||||
WriteFlags::empty(),
|
||||
)?;
|
||||
txn.put(
|
||||
table,
|
||||
preexisting.as_slice(),
|
||||
b"2",
|
||||
WriteFlags::NO_OVERWRITE,
|
||||
)?;
|
||||
let checkpoint_bytes = serde_json::to_vec(&123u64).unwrap();
|
||||
txn.put(
|
||||
table,
|
||||
checkpoint_key.as_str().as_bytes(),
|
||||
checkpoint_bytes.as_slice(),
|
||||
WriteFlags::empty(),
|
||||
)?;
|
||||
Ok(())
|
||||
});
|
||||
|
||||
assert!(txn_result.is_err());
|
||||
assert!(client.get_saga_state(&saga_key).unwrap().is_none());
|
||||
assert!(client.get_checkpoint(&checkpoint_key).unwrap().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn outbox_delete_removes_key() {
|
||||
let client = KvClient::in_memory();
|
||||
let tenant = TenantId::new("t1");
|
||||
let work_id = WorkId::new_v7();
|
||||
let key = client
|
||||
.put_outbox_item(
|
||||
&tenant,
|
||||
"effect",
|
||||
&work_id,
|
||||
&WorkItem::EffectCommand(crate::types::EffectCommandEnvelope {
|
||||
tenant_id: tenant.clone(),
|
||||
command_id: crate::types::CommandId::new("c1"),
|
||||
effect_name: crate::types::EffectName::new("noop"),
|
||||
payload: json!({"a": 1}),
|
||||
metadata: crate::types::MessageMetadata::default(),
|
||||
}),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(client.list_outbox_prefix(&tenant, 10).unwrap().len(), 1);
|
||||
client.delete_outbox_item(&key).unwrap();
|
||||
assert_eq!(client.list_outbox_prefix(&tenant, 10).unwrap().len(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn schedule_scan_only_returns_due_items() {
|
||||
let client = KvClient::in_memory();
|
||||
let tenant = TenantId::new("t1");
|
||||
let saga = SagaName::new("billing");
|
||||
let corr = CorrelationId::new("c1");
|
||||
|
||||
let due_key = ScheduleKey::new(&tenant, &saga, &corr, 100);
|
||||
let future_key = ScheduleKey::new(&tenant, &saga, &corr, 200);
|
||||
client
|
||||
.put_schedule_item(&due_key, &json!({"x": 1}))
|
||||
.unwrap();
|
||||
client
|
||||
.put_schedule_item(&future_key, &json!({"x": 2}))
|
||||
.unwrap();
|
||||
|
||||
let due = client.scan_due_schedule_items(&tenant, 150, 10).unwrap();
|
||||
assert_eq!(due.len(), 1);
|
||||
assert_eq!(due[0].1["x"], 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scheduler_is_tenant_scoped() {
|
||||
let client = KvClient::in_memory();
|
||||
let tenant_a = TenantId::new("t1");
|
||||
let tenant_b = TenantId::new("t2");
|
||||
let saga = SagaName::new("billing");
|
||||
let corr = CorrelationId::new("c1");
|
||||
|
||||
let key_a = ScheduleKey::new(&tenant_a, &saga, &corr, 100);
|
||||
let key_b = ScheduleKey::new(&tenant_b, &saga, &corr, 100);
|
||||
client
|
||||
.put_schedule_item(&key_a, &json!({"x": "a"}))
|
||||
.unwrap();
|
||||
client
|
||||
.put_schedule_item(&key_b, &json!({"x": "b"}))
|
||||
.unwrap();
|
||||
|
||||
let due_a = client.scan_due_schedule_items(&tenant_a, 200, 10).unwrap();
|
||||
assert_eq!(due_a.len(), 1);
|
||||
assert_eq!(due_a[0].1["x"], "a");
|
||||
}
|
||||
|
||||
#[test]
// Committing a due schedule item atomically persists new saga state,
// enqueues outbox work, and deletes the schedule entry so a later scan
// no longer sees it.
fn due_schedule_item_can_be_committed_and_deleted() {
    let client = KvClient::in_memory();
    let tenant = TenantId::new("t1");
    let saga = SagaName::new("billing");
    let corr = CorrelationId::new("c1");

    // Item due at t=100.
    let schedule_key = ScheduleKey::new(&tenant, &saga, &corr, 100);
    client
        .put_schedule_item(&schedule_key, &json!({"x": 1}))
        .unwrap();

    let saga_key = SagaStateKey::new(&tenant, &saga, &corr);
    // Hand-built outbox key; matches the "outbox:<tenant>:<kind>:<work-id>"
    // layout produced by OutboxKey::new.
    let outbox_key = format!("outbox:{}:effect:{}", tenant.as_str(), WorkId::new_v7());
    client
        .commit_schedule_processing(
            &saga_key,
            &json!({"state": 1}),
            vec![(
                outbox_key,
                WorkItem::EffectCommand(crate::types::EffectCommandEnvelope {
                    tenant_id: tenant.clone(),
                    command_id: crate::types::CommandId::new("c1"),
                    effect_name: crate::types::EffectName::new("noop"),
                    payload: json!({"a": 1}),
                    metadata: crate::types::MessageMetadata::default(),
                }),
            )],
            &schedule_key,
            Vec::new(),
        )
        .unwrap();

    // The schedule entry was consumed by the commit: nothing is due anymore.
    let due = client.scan_due_schedule_items(&tenant, 200, 10).unwrap();
    assert!(due.is_empty());
}
|
||||
|
||||
#[test]
// Marking an event as deduped must be observable on a subsequent lookup:
// a fresh store reports "not seen", and after marking it reports "seen".
fn dedupe_marker_roundtrip() {
    let store = KvClient::in_memory();
    let tenant = TenantId::new("t1");
    let saga = SagaName::new("billing");
    let event = EventId::new("e1");

    let marker = DedupeEventKey::new(&tenant, &saga, &event);

    // Nothing marked yet.
    assert!(!store.is_deduped_event(&marker).unwrap());

    // Mark, then observe the flag flip.
    store.mark_deduped_event(&marker).unwrap();
    assert!(store.is_deduped_event(&marker).unwrap());
}
|
||||
}
|
||||
3
runner/src/storage/mod.rs
Normal file
3
runner/src/storage/mod.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
mod kv;
|
||||
|
||||
pub use kv::{KvClient, StorageInitError};
|
||||
289
runner/src/stream/jetstream.rs
Normal file
289
runner/src/stream/jetstream.rs
Normal file
@@ -0,0 +1,289 @@
|
||||
use crate::config::Settings;
|
||||
use crate::types::{EffectCommandEnvelope, EffectResultEnvelope, RunnerError};
|
||||
use async_nats::jetstream::{
|
||||
self,
|
||||
consumer::pull::Config as PullConfig,
|
||||
consumer::{AckPolicy, DeliverPolicy, ReplayPolicy},
|
||||
stream::Config as StreamConfig,
|
||||
};
|
||||
|
||||
/// Handle over a NATS JetStream context plus the three streams the runner
/// reads from / writes to. Cloning is cheap (async-nats handles are shared).
#[derive(Debug, Clone)]
pub struct JetStreamClient {
    // Publishing context (used by publish_* methods).
    jetstream: jetstream::Context,
    // Aggregate domain events — the source of saga triggers.
    aggregate_events_stream: jetstream::stream::Stream,
    // Effect / workflow commands awaiting execution.
    workflow_commands_stream: jetstream::stream::Stream,
    // Effect results / workflow events; created eagerly so publishes land in
    // an existing stream, but not consumed through this handle yet.
    #[allow(dead_code)]
    workflow_events_stream: jetstream::stream::Stream,
}
|
||||
|
||||
/// Per-consumer knobs supplied by the caller; all other consumer settings
/// come from `Settings` (ack timeout, max in-flight, max deliver).
#[derive(Debug, Clone)]
pub struct ConsumerOptions {
    // Durable consumer name — reusing a name resumes the existing consumer.
    pub durable_name: String,
    // Subject filter limiting which messages this consumer receives.
    pub filter_subject: String,
    // Where the consumer starts (e.g. all, new-only, from sequence).
    pub deliver_policy: DeliverPolicy,
}
|
||||
|
||||
impl JetStreamClient {
|
||||
pub async fn connect(settings: &Settings) -> Result<Self, StreamInitError> {
|
||||
let client = async_nats::connect(&settings.nats_url)
|
||||
.await
|
||||
.map_err(|e| StreamInitError::Nats(e.to_string()))?;
|
||||
|
||||
let jetstream = jetstream::new(client);
|
||||
|
||||
let aggregate_events_subjects = if settings.saga_trigger_subject_filters.is_empty() {
|
||||
vec!["tenant.*.aggregate.*.*".to_string()]
|
||||
} else {
|
||||
settings.saga_trigger_subject_filters.clone()
|
||||
};
|
||||
let workflow_commands_subjects = vec![
|
||||
"tenant.*.effect.*.*".to_string(),
|
||||
"tenant.*.workflow.*.*".to_string(),
|
||||
];
|
||||
|
||||
let workflow_events_subjects = vec![
|
||||
"tenant.*.effect_result.*.*".to_string(),
|
||||
"tenant.*.workflow_event.*.*".to_string(),
|
||||
];
|
||||
|
||||
let mut last_err = None;
|
||||
for attempt in 0..30u64 {
|
||||
match try_init_streams(
|
||||
&jetstream,
|
||||
settings,
|
||||
aggregate_events_subjects.clone(),
|
||||
workflow_commands_subjects.clone(),
|
||||
workflow_events_subjects.clone(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((aggregate_events_stream, workflow_commands_stream, workflow_events_stream)) => {
|
||||
return Ok(Self {
|
||||
jetstream,
|
||||
aggregate_events_stream,
|
||||
workflow_commands_stream,
|
||||
workflow_events_stream,
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
last_err = Some(e);
|
||||
let backoff = std::time::Duration::from_millis(100 * (attempt + 1).min(20));
|
||||
tokio::time::sleep(backoff).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(last_err.unwrap_or(StreamInitError::Stream(
|
||||
"Stream initialization failed".to_string(),
|
||||
)))
|
||||
}
|
||||
|
||||
pub async fn saga_trigger_consumer(
|
||||
&self,
|
||||
settings: &Settings,
|
||||
options: ConsumerOptions,
|
||||
) -> Result<jetstream::consumer::PullConsumer, StreamInitError> {
|
||||
let consumer_config = PullConfig {
|
||||
durable_name: Some(options.durable_name.clone()),
|
||||
deliver_policy: options.deliver_policy,
|
||||
ack_policy: AckPolicy::Explicit,
|
||||
ack_wait: std::time::Duration::from_millis(settings.ack_timeout_ms),
|
||||
filter_subject: options.filter_subject,
|
||||
replay_policy: ReplayPolicy::Instant,
|
||||
max_ack_pending: settings.max_in_flight as i64,
|
||||
max_deliver: settings.max_deliver,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
self.aggregate_events_stream
|
||||
.get_or_create_consumer(&options.durable_name, consumer_config)
|
||||
.await
|
||||
.map_err(|e| StreamInitError::Consumer(e.to_string()))
|
||||
}
|
||||
|
||||
pub async fn effect_command_consumer(
|
||||
&self,
|
||||
settings: &Settings,
|
||||
options: ConsumerOptions,
|
||||
) -> Result<jetstream::consumer::PullConsumer, StreamInitError> {
|
||||
let consumer_config = PullConfig {
|
||||
durable_name: Some(options.durable_name.clone()),
|
||||
deliver_policy: options.deliver_policy,
|
||||
ack_policy: AckPolicy::Explicit,
|
||||
ack_wait: std::time::Duration::from_millis(settings.ack_timeout_ms),
|
||||
filter_subject: options.filter_subject,
|
||||
replay_policy: ReplayPolicy::Instant,
|
||||
max_ack_pending: settings.max_in_flight as i64,
|
||||
max_deliver: settings.max_deliver,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
self.workflow_commands_stream
|
||||
.get_or_create_consumer(&options.durable_name, consumer_config)
|
||||
.await
|
||||
.map_err(|e| StreamInitError::Consumer(e.to_string()))
|
||||
}
|
||||
|
||||
pub async fn publish_effect_result(
|
||||
&self,
|
||||
subject: String,
|
||||
result: &EffectResultEnvelope,
|
||||
) -> Result<(), RunnerError> {
|
||||
let payload =
|
||||
serde_json::to_vec(result).map_err(|e| RunnerError::DecodeError(e.to_string()))?;
|
||||
let mut headers = async_nats::HeaderMap::new();
|
||||
headers.insert("tenant-id", result.tenant_id.as_str());
|
||||
headers.insert("command-id", result.command_id.as_str());
|
||||
headers.insert("effect-name", result.effect_name.as_str());
|
||||
if let Some(correlation_id) = result.metadata.correlation_id.as_ref() {
|
||||
headers.insert("x-correlation-id", correlation_id.as_str());
|
||||
headers.insert("correlation-id", correlation_id.as_str());
|
||||
}
|
||||
if let Some(trace_id) = result.metadata.trace_id.as_ref() {
|
||||
headers.insert("trace-id", trace_id.as_str());
|
||||
if let Some(traceparent) = shared::traceparent_from_trace_id(trace_id) {
|
||||
headers.insert("traceparent", traceparent.as_str());
|
||||
}
|
||||
}
|
||||
if let Some(traceparent) = result
|
||||
.metadata
|
||||
.extra
|
||||
.get("traceparent")
|
||||
.and_then(|v| v.as_str())
|
||||
{
|
||||
headers.insert("traceparent", traceparent);
|
||||
if result.metadata.trace_id.is_none() {
|
||||
if let Some(trace_id) = shared::trace_id_from_traceparent(traceparent) {
|
||||
headers.insert("trace-id", trace_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.jetstream
|
||||
.publish_with_headers(subject, headers, payload.into())
|
||||
.await
|
||||
.map_err(|e| RunnerError::StreamError(e.to_string()))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn publish_effect_command(
|
||||
&self,
|
||||
cmd: &EffectCommandEnvelope,
|
||||
) -> Result<(), RunnerError> {
|
||||
let subject = format!(
|
||||
"tenant.{}.effect.{}.{}",
|
||||
cmd.tenant_id.as_str(),
|
||||
cmd.effect_name.as_str(),
|
||||
cmd.command_id.as_str()
|
||||
);
|
||||
|
||||
let payload =
|
||||
serde_json::to_vec(cmd).map_err(|e| RunnerError::DecodeError(e.to_string()))?;
|
||||
let mut headers = async_nats::HeaderMap::new();
|
||||
headers.insert("Nats-Msg-Id", cmd.command_id.as_str());
|
||||
headers.insert("tenant-id", cmd.tenant_id.as_str());
|
||||
headers.insert("command-id", cmd.command_id.as_str());
|
||||
headers.insert("effect-name", cmd.effect_name.as_str());
|
||||
if let Some(correlation_id) = cmd.metadata.correlation_id.as_ref() {
|
||||
headers.insert("x-correlation-id", correlation_id.as_str());
|
||||
headers.insert("correlation-id", correlation_id.as_str());
|
||||
}
|
||||
if let Some(trace_id) = cmd.metadata.trace_id.as_ref() {
|
||||
headers.insert("trace-id", trace_id.as_str());
|
||||
if let Some(traceparent) = shared::traceparent_from_trace_id(trace_id) {
|
||||
headers.insert("traceparent", traceparent.as_str());
|
||||
}
|
||||
}
|
||||
if let Some(traceparent) = cmd
|
||||
.metadata
|
||||
.extra
|
||||
.get("traceparent")
|
||||
.and_then(|v| v.as_str())
|
||||
{
|
||||
headers.insert("traceparent", traceparent);
|
||||
if cmd.metadata.trace_id.is_none() {
|
||||
if let Some(trace_id) = shared::trace_id_from_traceparent(traceparent) {
|
||||
headers.insert("trace-id", trace_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.jetstream
|
||||
.publish_with_headers(subject, headers, payload.into())
|
||||
.await
|
||||
.map_err(|e| RunnerError::StreamError(e.to_string()))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn try_init_streams(
|
||||
jetstream: &jetstream::Context,
|
||||
settings: &Settings,
|
||||
aggregate_events_subjects: Vec<String>,
|
||||
workflow_commands_subjects: Vec<String>,
|
||||
workflow_events_subjects: Vec<String>,
|
||||
) -> Result<
|
||||
(
|
||||
jetstream::stream::Stream,
|
||||
jetstream::stream::Stream,
|
||||
jetstream::stream::Stream,
|
||||
),
|
||||
StreamInitError,
|
||||
> {
|
||||
let aggregate_events_stream = ensure_stream(
|
||||
jetstream,
|
||||
&settings.aggregate_events_stream,
|
||||
aggregate_events_subjects,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let workflow_commands_stream = ensure_stream(
|
||||
jetstream,
|
||||
&settings.workflow_commands_stream,
|
||||
workflow_commands_subjects,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let workflow_events_stream = ensure_stream(
|
||||
jetstream,
|
||||
&settings.workflow_events_stream,
|
||||
workflow_events_subjects,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok((
|
||||
aggregate_events_stream,
|
||||
workflow_commands_stream,
|
||||
workflow_events_stream,
|
||||
))
|
||||
}
|
||||
|
||||
async fn ensure_stream(
|
||||
jetstream: &jetstream::Context,
|
||||
name: &str,
|
||||
subjects: Vec<String>,
|
||||
) -> Result<jetstream::stream::Stream, StreamInitError> {
|
||||
let config = StreamConfig {
|
||||
name: name.to_string(),
|
||||
subjects,
|
||||
max_messages: 10_000_000,
|
||||
max_bytes: -1,
|
||||
max_age: std::time::Duration::from_secs(365 * 24 * 60 * 60),
|
||||
duplicate_window: std::time::Duration::from_secs(120),
|
||||
..Default::default()
|
||||
};
|
||||
jetstream
|
||||
.get_or_create_stream(config)
|
||||
.await
|
||||
.map_err(|e| StreamInitError::Stream(e.to_string()))
|
||||
}
|
||||
|
||||
/// Errors raised while establishing NATS connectivity, streams, or consumers.
/// Variants carry the stringified underlying error.
#[derive(Debug, thiserror::Error)]
pub enum StreamInitError {
    /// Connection-level failure reaching the NATS server.
    #[error("Failed to connect to NATS: {0}")]
    Nats(String),
    /// Stream creation or lookup failure.
    #[error("Stream error: {0}")]
    Stream(String),
    /// Consumer creation or lookup failure.
    #[error("Consumer error: {0}")]
    Consumer(String),
}
|
||||
3
runner/src/stream/mod.rs
Normal file
3
runner/src/stream/mod.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
mod jetstream;
|
||||
|
||||
pub use jetstream::{ConsumerOptions, JetStreamClient, StreamInitError};
|
||||
285
runner/src/tenant_placement.rs
Normal file
285
runner/src/tenant_placement.rs
Normal file
@@ -0,0 +1,285 @@
|
||||
use crate::config::Settings;
|
||||
use crate::types::RunnerError;
|
||||
use async_nats::jetstream;
|
||||
use futures::StreamExt;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::{Arc, Mutex, RwLock};
|
||||
use std::time::Duration;
|
||||
use tokio::sync::watch;
|
||||
use tokio::sync::Notify;
|
||||
|
||||
/// Build the optional tenant-filter channel this shard consumes from.
///
/// Resolution order:
/// 1. Placement bucket + shard id configured: spawn a background watcher
///    over the placement KV bucket; the returned receiver updates live.
/// 2. Otherwise, a non-empty static allowlist: return a receiver pre-loaded
///    with that set. The sender (`_tx`) is dropped immediately, so the set
///    never changes; `changed()` on the receiver errors out and downstream
///    update loops simply stop, while the initial value stays readable.
/// 3. Otherwise `None`: no filtering — every tenant is accepted.
pub async fn start_tenant_filter(
    settings: &Settings,
) -> Result<Option<watch::Receiver<HashSet<String>>>, RunnerError> {
    if let (Some(bucket), Some(shard_id)) = (
        settings.tenant_placement_bucket.clone(),
        settings.shard_id.clone(),
    ) {
        let nats_url = settings.nats_url.clone();
        let (tx, rx) = watch::channel(HashSet::<String>::new());
        // Watcher runs for the process lifetime; failures are logged, and
        // receivers keep the last published set.
        tokio::spawn(async move {
            if let Err(e) = watch_tenant_placement(nats_url, bucket, shard_id, tx).await {
                tracing::error!(error = %e, "Tenant placement watcher failed");
            }
        });
        return Ok(Some(rx));
    }

    if !settings.tenant_allowlist.is_empty() {
        let initial = settings
            .tenant_allowlist
            .iter()
            .cloned()
            .collect::<HashSet<_>>();
        // Sender intentionally dropped: the allowlist is static.
        let (_tx, rx) = watch::channel(initial);
        return Ok(Some(rx));
    }

    Ok(None)
}
|
||||
|
||||
/// Watch the tenant-placement KV bucket and publish the set of tenants
/// assigned to `shard_id` whenever it changes.
///
/// Bucket layout (as consumed here): key = tenant id, value = shard id.
/// The watch starts from revision 1, so the full assignment history is
/// replayed to rebuild state before live updates arrive; `history: 1` on
/// the bucket keeps only the latest value per key.
///
/// Runs until the watch stream ends (returns `Ok(())`) or a stream/decode
/// error occurs. Uses its own NATS connection, independent of the main
/// client.
async fn watch_tenant_placement(
    nats_url: String,
    bucket: String,
    shard_id: String,
    tx: watch::Sender<HashSet<String>>,
) -> Result<(), RunnerError> {
    let client = async_nats::connect(&nats_url)
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string()))?;
    let js = jetstream::new(client);

    // Get the bucket, creating it on first use so watchers can start
    // before any placement has been written.
    let store = match js.get_key_value(bucket.clone()).await {
        Ok(store) => store,
        Err(_) => js
            .create_key_value(jetstream::kv::Config {
                bucket: bucket.clone(),
                history: 1,
                ..Default::default()
            })
            .await
            .map_err(|e| RunnerError::StreamError(e.to_string()))?,
    };

    // tenant -> shard, rebuilt from the replayed watch stream.
    let mut assignments: HashMap<String, String> = HashMap::new();
    // Last set published on `tx` (avoids redundant sends).
    let mut current: HashSet<String> = HashSet::new();

    let mut watch = store
        .watch_all_from_revision(1)
        .await
        .map_err(|e| RunnerError::StreamError(e.to_string()))?;

    while let Some(entry) = watch.next().await {
        let entry = entry.map_err(|e| RunnerError::StreamError(e.to_string()))?;

        match entry.operation {
            jetstream::kv::Operation::Put => {
                // Value is the shard id as UTF-8 text.
                let value = String::from_utf8(entry.value.to_vec())
                    .map_err(|e| RunnerError::DecodeError(e.to_string()))?;
                assignments.insert(entry.key, value);
            }
            jetstream::kv::Operation::Delete | jetstream::kv::Operation::Purge => {
                assignments.remove(&entry.key);
            }
        }

        // Recompute this shard's tenant set after every entry; publish only
        // on actual change.
        let next = assignments
            .iter()
            .filter_map(|(tenant, shard)| {
                if shard == &shard_id {
                    Some(tenant.clone())
                } else {
                    None
                }
            })
            .collect::<HashSet<_>>();

        if next != current {
            current = next.clone();
            // Send fails only when all receivers are gone; keep watching
            // anyway so state stays warm.
            let _ = tx.send(next);
        }
    }

    Ok(())
}
|
||||
|
||||
/// Runtime gate deciding which tenants this shard may process, plus
/// per-tenant in-flight accounting used while draining. Clones share state.
#[derive(Clone)]
pub struct TenantGate {
    // None = no filter configured (all tenants pass);
    // Some(set) = only listed tenants are assigned to this shard.
    assigned: Arc<RwLock<Option<HashSet<String>>>>,
    // Tenants currently being drained off this shard.
    draining: Arc<RwLock<HashSet<String>>>,
    // Per-tenant count of in-flight work items.
    inflight: Arc<Mutex<HashMap<String, usize>>>,
    // Per-tenant notifiers fired when an in-flight count drops to zero.
    inflight_notify: Arc<Mutex<HashMap<String, Arc<Notify>>>>,
}
|
||||
|
||||
// Manual Debug: all fields are lock-guarded shared state, so print only
// the type name instead of acquiring locks inside `fmt`.
impl std::fmt::Debug for TenantGate {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("TenantGate").finish_non_exhaustive()
    }
}
|
||||
|
||||
impl TenantGate {
    /// Build a gate from an optional tenant-filter receiver.
    ///
    /// With a filter, the current set is snapshotted immediately and a
    /// background task mirrors subsequent updates into `assigned`; the task
    /// exits when the sender side is dropped (e.g. a static allowlist).
    /// With no filter, `assigned` stays `None` and every tenant passes.
    pub fn new(tenant_filter: Option<watch::Receiver<HashSet<String>>>) -> Self {
        let assigned = Arc::new(RwLock::new(
            tenant_filter.as_ref().map(|rx| rx.borrow().clone()),
        ));

        if let Some(mut rx) = tenant_filter {
            let assigned = assigned.clone();
            tokio::spawn(async move {
                loop {
                    // Err => sender dropped; stop mirroring, keep last value.
                    if rx.changed().await.is_err() {
                        break;
                    }
                    let next = rx.borrow().clone();
                    if let Ok(mut g) = assigned.write() {
                        *g = Some(next);
                    }
                }
            });
        }

        Self {
            assigned,
            draining: Arc::new(RwLock::new(HashSet::new())),
            inflight: Arc::new(Mutex::new(HashMap::new())),
            inflight_notify: Arc::new(Mutex::new(HashMap::new())),
        }
    }

    /// True when this shard may handle `tenant_id`.
    /// Fails open: a poisoned lock or absent filter admits everyone.
    pub fn is_assigned(&self, tenant_id: &str) -> bool {
        let Ok(g) = self.assigned.read() else {
            return true;
        };
        match &*g {
            None => true,
            Some(tenants) => tenants.contains(tenant_id),
        }
    }

    /// Copy of the current assignment set; `None` = unfiltered (or the
    /// lock was poisoned).
    pub fn assigned_tenants_snapshot(&self) -> Option<HashSet<String>> {
        let Ok(g) = self.assigned.read() else {
            return None;
        };
        g.clone()
    }

    /// Copy of the tenants currently draining (empty on poisoned lock).
    pub fn draining_tenants_snapshot(&self) -> HashSet<String> {
        let Ok(g) = self.draining.read() else {
            return HashSet::new();
        };
        g.clone()
    }

    /// Mark a tenant as draining (idempotent).
    pub fn start_draining(&self, tenant_id: &str) {
        if let Ok(mut g) = self.draining.write() {
            g.insert(tenant_id.to_string());
        }
    }

    /// Clear a tenant's draining mark (idempotent).
    pub fn stop_draining(&self, tenant_id: &str) {
        if let Ok(mut g) = self.draining.write() {
            g.remove(tenant_id);
        }
    }

    /// True when the tenant is draining (fails closed on poisoned lock).
    pub fn is_draining(&self, tenant_id: &str) -> bool {
        let Ok(g) = self.draining.read() else {
            return false;
        };
        g.contains(tenant_id)
    }

    /// Current number of in-flight work items for a tenant.
    pub fn inflight_count(&self, tenant_id: &str) -> usize {
        let Ok(g) = self.inflight.lock() else {
            return 0;
        };
        g.get(tenant_id).copied().unwrap_or(0)
    }

    /// Gate for picking up NEW processing work: refuse during a global
    /// drain, during a per-tenant drain, or when the tenant is unassigned.
    pub fn should_acquire_processing_work(&self, tenant_id: &str, global_draining: bool) -> bool {
        if global_draining {
            return false;
        }
        if self.is_draining(tenant_id) {
            return false;
        }
        self.is_assigned(tenant_id)
    }

    /// Gate for dispatching already-committed outbox work: a draining
    /// tenant may still flush its outbox (unlike new processing work),
    /// but a global drain stops everything.
    pub fn should_dispatch_outbox_work(&self, tenant_id: &str, global_draining: bool) -> bool {
        if global_draining {
            return false;
        }
        self.is_assigned(tenant_id) || self.is_draining(tenant_id)
    }

    /// Register one in-flight item; the returned guard decrements on drop.
    pub fn begin_work(&self, tenant_id: &str) -> TenantWorkGuard {
        if let Ok(mut g) = self.inflight.lock() {
            *g.entry(tenant_id.to_string()).or_insert(0) += 1;
        }
        TenantWorkGuard {
            gate: self.clone(),
            tenant_id: tenant_id.to_string(),
        }
    }

    /// Wait until the tenant has zero in-flight items, or `timeout`
    /// elapses. Returns true on zero, false on timeout.
    ///
    /// Uses a per-tenant Notify but re-polls at least every 250ms to cover
    /// the race where the last guard drops between the count check and the
    /// `notified()` registration.
    pub async fn wait_inflight_zero(&self, tenant_id: &str, timeout: Duration) -> bool {
        let start = tokio::time::Instant::now();
        loop {
            if self.inflight_count(tenant_id) == 0 {
                return true;
            }
            let remaining = timeout.saturating_sub(start.elapsed());
            if remaining.is_zero() {
                return false;
            }

            let notify = {
                // Poisoned lock: degrade to plain polling.
                let Ok(mut g) = self.inflight_notify.lock() else {
                    tokio::time::sleep(Duration::from_millis(50)).await;
                    continue;
                };
                g.entry(tenant_id.to_string())
                    .or_insert_with(|| Arc::new(Notify::new()))
                    .clone()
            };

            // Bounded wait; timeout here is expected and just re-polls.
            let _ =
                tokio::time::timeout(remaining.min(Duration::from_millis(250)), notify.notified())
                    .await;
        }
    }
}
|
||||
|
||||
/// RAII token for one in-flight work item; created by
/// `TenantGate::begin_work`, decrements the tenant's count on drop.
pub struct TenantWorkGuard {
    // Shared gate whose inflight map this guard updates.
    gate: TenantGate,
    // Tenant this unit of work belongs to.
    tenant_id: String,
}
|
||||
|
||||
impl Drop for TenantWorkGuard {
    /// Decrement the tenant's in-flight count; when it reaches zero, remove
    /// the map entry and wake any `wait_inflight_zero` waiters.
    fn drop(&mut self) {
        // Compute the post-decrement count while holding the lock, then
        // notify AFTER releasing it (avoids waking waiters into contention).
        let count = if let Ok(mut g) = self.gate.inflight.lock() {
            match g.get_mut(&self.tenant_id) {
                Some(v) => {
                    // saturating_sub guards against a double-decrement bug
                    // ever underflowing the counter.
                    *v = v.saturating_sub(1);
                    let next = *v;
                    if next == 0 {
                        // Drop the entry so the map doesn't accumulate
                        // zero-count tenants forever.
                        g.remove(&self.tenant_id);
                    }
                    next
                }
                None => 0,
            }
        } else {
            // Poisoned lock: treat as zero so waiters aren't stranded.
            0
        };

        if count == 0 {
            if let Ok(g) = self.gate.inflight_notify.lock() {
                if let Some(n) = g.get(&self.tenant_id) {
                    n.notify_waiters();
                }
            }
        }
    }
}
|
||||
105
runner/src/types/envelope.rs
Normal file
105
runner/src/types/envelope.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
use crate::types::{CommandId, CorrelationId, EffectName, TenantId, TraceId};
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use std::collections::BTreeMap;
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Cross-cutting metadata carried on every envelope.
///
/// Unknown JSON fields are captured in `extra` via `#[serde(flatten)]`,
/// so round-tripping preserves metadata this version doesn't model.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(default)]
pub struct MessageMetadata {
    // Correlates messages belonging to one logical workflow instance.
    pub correlation_id: Option<CorrelationId>,
    // Distributed-tracing id, when one was propagated.
    pub trace_id: Option<TraceId>,
    // Extra metadata keys (e.g. "traceparent"); BTreeMap keeps
    // serialization order deterministic.
    #[serde(flatten)]
    pub extra: BTreeMap<String, Value>,
}
|
||||
|
||||
/// A domain event emitted by an aggregate; the input that triggers sagas.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AggregateEventEnvelope {
    pub tenant_id: TenantId,
    // Unique id of this event occurrence.
    pub event_id: Uuid,
    pub aggregate_id: String,
    pub aggregate_type: String,
    // Aggregate version after applying this event.
    pub version: u64,
    pub event_type: String,
    // Event-type-specific body.
    pub payload: Value,
    // Command that produced this event.
    pub command_id: Uuid,
    pub timestamp: DateTime<Utc>,
}
|
||||
|
||||
/// A command addressed to an aggregate, as received from the gateway.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GatewayCommandEnvelope {
    pub tenant_id: TenantId,
    pub command_id: CommandId,
    pub aggregate_id: String,
    pub aggregate_type: String,
    // Raw command body, passed through untyped.
    pub payload_json: Value,
    // Defaults to empty metadata when absent from the wire.
    #[serde(default)]
    pub metadata: MessageMetadata,
}
|
||||
|
||||
/// A request to execute a named side effect (e.g. send an email).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EffectCommandEnvelope {
    pub tenant_id: TenantId,
    // Also used as the NATS Nats-Msg-Id for publish deduplication.
    pub command_id: CommandId,
    pub effect_name: EffectName,
    // Effect-specific arguments, untyped.
    pub payload: Value,
    // Defaults to empty metadata when absent from the wire.
    #[serde(default)]
    pub metadata: MessageMetadata,
}
|
||||
|
||||
/// Terminal outcome of an effect execution.
/// Serialized in snake_case ("succeeded" / "failed" / "timed_out").
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EffectResultType {
    Succeeded,
    Failed,
    TimedOut,
}
|
||||
|
||||
/// Outcome report for an effect command, published back on the
/// workflow-events stream.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EffectResultEnvelope {
    pub tenant_id: TenantId,
    // Id of the command this result answers.
    pub command_id: CommandId,
    pub effect_name: EffectName,
    pub result_type: EffectResultType,
    // Result data on success, or error details on failure.
    pub payload: Value,
    pub timestamp: DateTime<Utc>,
    // Defaults to empty metadata when absent from the wire.
    #[serde(default)]
    pub metadata: MessageMetadata,
}
|
||||
|
||||
/// An outbox work item awaiting dispatch. Internally tagged with a "kind"
/// field (snake_case), so stored items remain self-describing.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum WorkItem {
    /// Command destined for an aggregate via the gateway path.
    AggregateCommand(GatewayCommandEnvelope),
    /// Command destined for an effect executor.
    EffectCommand(EffectCommandEnvelope),
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{CommandId, EffectName, TenantId};
    use serde_json::json;

    #[test]
    // Wire compatibility: unknown top-level fields are ignored, while
    // unknown metadata fields are retained in `metadata.extra` thanks to
    // #[serde(flatten)].
    fn envelope_decoding_ignores_unknown_fields() {
        let raw = json!({
            "tenant_id": "t1",
            "command_id": "c1",
            "effect_name": "send_email",
            "payload": {"to": "a@example.com"},
            "metadata": {"correlation_id": "corr", "trace_id": "trace", "extra_key": 123},
            "unknown_field": "ignored"
        });

        let decoded: EffectCommandEnvelope = serde_json::from_value(raw).unwrap();
        assert_eq!(decoded.tenant_id, TenantId::new("t1"));
        assert_eq!(decoded.command_id, CommandId::new("c1"));
        assert_eq!(decoded.effect_name, EffectName::new("send_email"));
        assert_eq!(
            decoded.metadata.correlation_id.as_ref().map(|v| v.as_str()),
            Some("corr")
        );
        // The unmodeled metadata key survived the round trip.
        assert_eq!(decoded.metadata.extra.get("extra_key"), Some(&json!(123)));
    }
}
|
||||
19
runner/src/types/error.rs
Normal file
19
runner/src/types/error.rs
Normal file
@@ -0,0 +1,19 @@
|
||||
use thiserror::Error;
|
||||
|
||||
/// Top-level error type for runner operations; each variant carries the
/// stringified underlying cause.
#[derive(Debug, Error)]
pub enum RunnerError {
    /// KV / persistence layer failure.
    #[error("Storage error: {0}")]
    StorageError(String),
    /// NATS / JetStream failure.
    #[error("Stream error: {0}")]
    StreamError(String),
    /// Payload (de)serialization failure.
    #[error("Decode error: {0}")]
    DecodeError(String),
    /// Catch-all for execution failures not covered above.
    #[error("Runtime error: {0}")]
    RuntimeError(String),
    /// Work rejected because the tenant is not assigned/allowed here.
    #[error("Tenant access error: {0}")]
    TenantAccessError(String),
    /// Retry/backoff or other policy violation.
    #[error("Policy error: {0}")]
    PolicyError(String),
}

/// Convenience alias used throughout the runner.
pub type RunnerResult<T> = Result<T, RunnerError>;
|
||||
204
runner/src/types/id.rs
Normal file
204
runner/src/types/id.rs
Normal file
@@ -0,0 +1,204 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::str::FromStr;
|
||||
use uuid::Uuid;
|
||||
|
||||
pub type TenantId = shared::TenantId;
|
||||
|
||||
/// Logical name of a saga definition. Opaque, unvalidated newtype over
/// String; serializes as a plain JSON string.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SagaName(String);

impl SagaName {
    /// Wrap any string-like value; no validation is performed.
    pub fn new(name: impl Into<String>) -> Self {
        Self(name.into())
    }

    /// Borrow the inner string.
    pub fn as_str(&self) -> &str {
        &self.0
    }
}

impl fmt::Display for SagaName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.0)
    }
}

// Parsing never fails (any string is a valid name), hence Infallible.
impl FromStr for SagaName {
    type Err = std::convert::Infallible;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(Self(s.to_string()))
    }
}

impl AsRef<str> for SagaName {
    fn as_ref(&self) -> &str {
        &self.0
    }
}
|
||||
|
||||
/// Name of a side-effect handler (e.g. "send_email"). Opaque, unvalidated
/// newtype over String; serializes as a plain JSON string.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct EffectName(String);

impl EffectName {
    /// Wrap any string-like value; no validation is performed.
    pub fn new(name: impl Into<String>) -> Self {
        Self(name.into())
    }

    /// Borrow the inner string.
    pub fn as_str(&self) -> &str {
        &self.0
    }
}

impl fmt::Display for EffectName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.0)
    }
}

// Parsing never fails (any string is a valid name), hence Infallible.
impl FromStr for EffectName {
    type Err = std::convert::Infallible;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(Self(s.to_string()))
    }
}

impl AsRef<str> for EffectName {
    fn as_ref(&self) -> &str {
        &self.0
    }
}
|
||||
|
||||
pub type CorrelationId = shared::CorrelationId;
|
||||
pub type TraceId = shared::TraceId;
|
||||
|
||||
/// Identifier of a command; also the deduplication key for effect
/// publishes. Opaque newtype over String.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct CommandId(String);

impl CommandId {
    /// Wrap any string-like value; no validation is performed.
    pub fn new(id: impl Into<String>) -> Self {
        Self(id.into())
    }

    /// Borrow the inner string.
    pub fn as_str(&self) -> &str {
        &self.0
    }
}

impl fmt::Display for CommandId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.0)
    }
}

// Parsing never fails (any string is a valid id), hence Infallible.
impl FromStr for CommandId {
    type Err = std::convert::Infallible;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(Self(s.to_string()))
    }
}

impl AsRef<str> for CommandId {
    fn as_ref(&self) -> &str {
        &self.0
    }
}
|
||||
|
||||
/// Identifier of an event occurrence, used for saga-level deduplication.
/// Opaque newtype over String.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct EventId(String);

impl EventId {
    /// Wrap any string-like value; no validation is performed.
    pub fn new(id: impl Into<String>) -> Self {
        Self(id.into())
    }

    /// Borrow the inner string.
    pub fn as_str(&self) -> &str {
        &self.0
    }
}

impl fmt::Display for EventId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.0)
    }
}

// Parsing never fails (any string is a valid id), hence Infallible.
impl FromStr for EventId {
    type Err = std::convert::Infallible;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(Self(s.to_string()))
    }
}

impl AsRef<str> for EventId {
    fn as_ref(&self) -> &str {
        &self.0
    }
}
|
||||
|
||||
/// Identifier for a unit of outbox work, backed by a UUIDv7 so ids sort
/// roughly by creation time.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct WorkId(Uuid);

impl WorkId {
    /// Generate a fresh time-ordered (v7) id.
    pub fn new_v7() -> Self {
        Self(Uuid::now_v7())
    }

    /// Borrow the underlying UUID.
    pub fn as_uuid(&self) -> &Uuid {
        &self.0
    }

    // NOTE(review): returns an owned String despite the `as_` prefix
    // (convention says `as_` = cheap borrow). Kept for caller
    // compatibility; `to_string()` via Display is equivalent.
    pub fn as_str(&self) -> String {
        self.0.to_string()
    }
}

impl fmt::Display for WorkId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.0)
    }
}

// Parsing accepts any UUID text form; errors come from uuid's parser.
impl FromStr for WorkId {
    type Err = uuid::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(Self(Uuid::parse_str(s)?))
    }
}

// Default generates a NEW id rather than a nil UUID — convenient for
// struct-update syntax, but note two defaults are never equal.
impl Default for WorkId {
    fn default() -> Self {
        Self::new_v7()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    // Serde round-trip for the shared TenantId alias, plus its empty default.
    fn tenant_id_roundtrips_and_defaults_to_empty() {
        let id = TenantId::new("acme");
        let json = serde_json::to_string(&id).unwrap();
        let decoded: TenantId = serde_json::from_str(&json).unwrap();
        assert_eq!(id, decoded);
        assert!(TenantId::default().is_empty());
    }

    #[test]
    // Compile-time guarantee that all id types can cross task/thread
    // boundaries (they are stored in shared maps and async tasks).
    fn ids_are_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<TenantId>();
        assert_send_sync::<SagaName>();
        assert_send_sync::<EffectName>();
        assert_send_sync::<CorrelationId>();
        assert_send_sync::<WorkId>();
        assert_send_sync::<CommandId>();
        assert_send_sync::<EventId>();
    }
}
|
||||
149
runner/src/types/keys.rs
Normal file
149
runner/src/types/keys.rs
Normal file
@@ -0,0 +1,149 @@
|
||||
use crate::types::{CommandId, CorrelationId, EventId, SagaName, TenantId, WorkId};
|
||||
|
||||
/// KV key for a saga instance's state:
/// `saga:<tenant>:<saga>:<correlation>`. The format is persisted — do not
/// change it without a data migration.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SagaStateKey(String);

impl SagaStateKey {
    pub fn new(tenant_id: &TenantId, saga_name: &SagaName, correlation_id: &CorrelationId) -> Self {
        Self(format!(
            "saga:{}:{}:{}",
            tenant_id.as_str(),
            saga_name.as_str(),
            correlation_id.as_str()
        ))
    }

    pub fn as_str(&self) -> &str {
        &self.0
    }
}
|
||||
|
||||
/// KV key for a saga consumer checkpoint: `checkpoint:<tenant>:<saga>`.
/// Persisted format — do not change without a migration.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CheckpointKey(String);

impl CheckpointKey {
    pub fn new(tenant_id: &TenantId, saga_name: &SagaName) -> Self {
        Self(format!(
            "checkpoint:{}:{}",
            tenant_id.as_str(),
            saga_name.as_str()
        ))
    }

    pub fn as_str(&self) -> &str {
        &self.0
    }
}
|
||||
|
||||
/// KV key for a pending outbox item:
/// `outbox:<tenant>:<work-kind>:<work-id>`. With UUIDv7 work ids, keys for
/// one tenant+kind sort roughly by creation time. Persisted format — do
/// not change without a migration.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct OutboxKey(String);

impl OutboxKey {
    pub fn new(tenant_id: &TenantId, work_kind: &str, work_id: &WorkId) -> Self {
        Self(format!(
            "outbox:{}:{}:{}",
            tenant_id.as_str(),
            work_kind,
            work_id
        ))
    }

    /// Prefix matching every outbox key of one tenant (for range scans).
    pub fn prefix_for_tenant(tenant_id: &TenantId) -> String {
        format!("outbox:{}:", tenant_id.as_str())
    }

    pub fn as_str(&self) -> &str {
        &self.0
    }
}
|
||||
|
||||
/// KV key for a scheduled (timer) item:
/// `schedule:<tenant>:<saga>:<correlation>:<due_at_ms>`.
/// Persisted format — do not change without a migration.
///
/// NOTE(review): `due_at_ms` is rendered as an unpadded decimal, so
/// lexicographic key order differs from numeric due-time order (e.g.
/// "100" sorts before "99"). Confirm `scan_due_schedule_items` compares
/// the timestamp numerically rather than relying on key ordering.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ScheduleKey(String);

impl ScheduleKey {
    pub fn new(
        tenant_id: &TenantId,
        saga_name: &SagaName,
        correlation_id: &CorrelationId,
        due_at_ms: u64,
    ) -> Self {
        Self(format!(
            "schedule:{}:{}:{}:{}",
            tenant_id.as_str(),
            saga_name.as_str(),
            correlation_id.as_str(),
            due_at_ms
        ))
    }

    /// Prefix matching every schedule key of one tenant (for range scans).
    pub fn prefix_for_tenant(tenant_id: &TenantId) -> String {
        format!("schedule:{}:", tenant_id.as_str())
    }

    pub fn as_str(&self) -> &str {
        &self.0
    }
}
|
||||
|
||||
/// KV key marking an event as already processed by a saga:
/// `dedupe:<tenant>:<saga>:<event>`. Persisted format — do not change
/// without a migration.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct DedupeEventKey(String);

impl DedupeEventKey {
    pub fn new(tenant_id: &TenantId, saga_name: &SagaName, event_id: &EventId) -> Self {
        Self(format!(
            "dedupe:{}:{}:{}",
            tenant_id.as_str(),
            saga_name.as_str(),
            event_id.as_str()
        ))
    }

    pub fn as_str(&self) -> &str {
        &self.0
    }
}
|
||||
|
||||
/// KV key marking an effect command as already executed:
/// `dedupe:<tenant>:effect:<command>`. Shares the `dedupe:` namespace with
/// `DedupeEventKey`, disambiguated by the literal `effect` segment.
/// Persisted format — do not change without a migration.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct DedupeEffectKey(String);

impl DedupeEffectKey {
    pub fn new(tenant_id: &TenantId, command_id: &CommandId) -> Self {
        Self(format!(
            "dedupe:{}:effect:{}",
            tenant_id.as_str(),
            command_id.as_str()
        ))
    }

    pub fn as_str(&self) -> &str {
        &self.0
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{CorrelationId, SagaName, TenantId, WorkId};

    #[test]
    // Pins the persisted key layouts: if any of these assertions change,
    // existing stored data becomes unreadable — treat as a breaking change.
    fn key_composition_is_stable() {
        let tenant = TenantId::new("t1");
        let saga = SagaName::new("billing");
        let corr = CorrelationId::new("c1");
        let work_id = WorkId::new_v7();

        let saga_key = SagaStateKey::new(&tenant, &saga, &corr);
        assert_eq!(saga_key.as_str(), "saga:t1:billing:c1");

        let cp_key = CheckpointKey::new(&tenant, &saga);
        assert_eq!(cp_key.as_str(), "checkpoint:t1:billing");

        // Work id is random, so only the prefix is asserted.
        let outbox_key = OutboxKey::new(&tenant, "effect", &work_id);
        assert!(outbox_key.as_str().starts_with("outbox:t1:effect:"));
        assert_eq!(OutboxKey::prefix_for_tenant(&tenant), "outbox:t1:");

        let schedule_key = ScheduleKey::new(&tenant, &saga, &corr, 123);
        assert_eq!(schedule_key.as_str(), "schedule:t1:billing:c1:123");
        assert_eq!(ScheduleKey::prefix_for_tenant(&tenant), "schedule:t1:");
    }
}
|
||||
9
runner/src/types/mod.rs
Normal file
9
runner/src/types/mod.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
mod envelope;
|
||||
mod error;
|
||||
mod id;
|
||||
mod keys;
|
||||
|
||||
pub use envelope::*;
|
||||
pub use error::*;
|
||||
pub use id::*;
|
||||
pub use keys::*;
|
||||
Reference in New Issue
Block a user