From 5bc3b634b796ff4bfcdba26fe979346f31264148 Mon Sep 17 00:00:00 2001 From: rUv Date: Sun, 14 Jun 2026 20:22:07 -0400 Subject: [PATCH] =?UTF-8?q?fix(automation=20security):=20template-bomb=20D?= =?UTF-8?q?oS=20(100MB/11s=20render=20=E2=86=92=20fuel-bounded,=20HIGH)=20?= =?UTF-8?q?+=20delay=20panic-on-config=20(MEDIUM)=20(#1083)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(homecore-automation): bound template render to stop unbounded-expansion DoS (HC-SEC-01) A `template:` condition / value_template comes straight from user automation config and was rendered with MiniJinja's default (no instruction budget, no output cap). A single condition such as `{% for i in range(5000) %}{% for j in range(5000) %}xxxx{% endfor %}{% endfor %}` rendered a 100 MB string over ~11 s on one render call (proven empirically) — a CPU/memory denial of service, the bfld-class "unbounded expansion". Fix: - Enable MiniJinja's `fuel` feature and set a per-render instruction budget (`set_fuel(Some(1_000_000))`). A nested loop burns one unit per iteration, so the budget caps total work regardless of nesting; the attack now fails fast (~90 ms) with "engine ran out of fuel". - Reject template sources over 64 KiB before compilation (defense in depth so a pathological literal can neither compile nor emit verbatim). Legitimate HA templates (a few dozen instructions) are unaffected. Tests (fail on old — unbounded render / no rejection): - nested_loop_template_is_bounded_not_unbounded_dos - single_huge_repeat_template_is_bounded - oversized_template_source_is_rejected - legitimate_template_still_renders_within_fuel (no regression) Co-Authored-By: claude-flow * fix(homecore-automation): stop crafted delay/timeout from panicking the run task (HC-SEC-02) `Action::Delay { seconds }` and `Action::WaitForTrigger { timeout_seconds }` fed the user-supplied float straight into `Duration::from_secs_f64`, which PANICS on negative, NaN, infinite, or overflowing inputs. All of those are reachable from a crafted (or simply typo'd) automation YAML — `delay: {seconds: -1}`, `.nan`, `.inf`, `1e308` — so one hostile config aborts the spawned automation task with a panic ("cannot convert float seconds to Duration: value is negative", proven empirically). Fix: a `safe_duration_from_secs` guard that saturates instead of panicking, matching Home Assistant's lenient "non-positive delay = no delay": - NaN / ±inf / negative -> Duration::ZERO - absurdly large (would overflow) -> clamped to ~100 years (MAX_DELAY_SECS) Tests (fail on old — panic = failure): - delay_negative_seconds_does_not_panic - delay_nan_seconds_does_not_panic - delay_infinite_seconds_does_not_panic - wait_for_trigger_negative_timeout_does_not_panic - safe_duration_saturates_hostile_values (incl. overflow clamp) Co-Authored-By: claude-flow * docs(homecore-automation): record HC-SEC-01/02 security review (CHANGELOG + ADR-129 §8a) Document the two DoS findings (template unbounded-expansion HC-SEC-01, delay panic-on-config HC-SEC-02) and the dimensions probed clean (condition fail-closed, bounded run-modes, sandboxed read-only templates). Co-Authored-By: claude-flow --- CHANGELOG.md | 1 + .../adr/ADR-129-homecore-automation-engine.md | 17 +++ v2/crates/homecore-automation/Cargo.toml | 6 +- v2/crates/homecore-automation/src/action.rs | 96 ++++++++++++++++- v2/crates/homecore-automation/src/template.rs | 102 ++++++++++++++++++ 5 files changed, 218 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51a7cda4..72c0b3a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **ADR-260: RuField MFS — the open specification for camera-free multimodal field sensing.** A common event / tensor / calibration / privacy / provenance model that sits *above* WiFi CSI/CIR/BFLD, UWB, BLE Channel Sounding, mmWave radar, ultrasound, subsonic, infrared, and future quantum sensors (each modality emits a normalized `FieldEvent` → `FieldTensor` → `FusionGraph` → `PrivacyClass` → `ProvenanceReceipt`). Published as a **standalone repo** [`ruvnet/rufield`](https://github.com/ruvnet/rufield) and vendored here as the `vendor/rufield` submodule (the `vendor/rvcsi` pattern — not a `v2/` workspace member). The v0.1 reference stack is a self-contained 6-crate Rust workspace (`rufield-core`, `-provenance` [sha256 + ed25519], `-privacy` [P0–P5 guard], `-adapters` [deterministic `SyntheticSim` across wifi_csi/mmwave_radar/infrared_thermal], `-fusion` [graph + TOML weighted-Bayes rules → 7 room-state inferences], `-bench` [deterministic runner + the §31 acceptance test]). **60 tests / 0 failed, clippy-clean.** §27 acceptance criteria 1–8 and 10 PASS; the live dashboard (9) is deferred. **All benchmark metrics are SYNTHETIC** (scored against the simulator's own ground truth — presence/breathing/bed_exit/room_transition F1 = 1.000, nocturnal_scratch 0.923 reported honestly, p95 latency ~0.01 ms, provenance coverage 100%, 0 privacy violations) — they prove the pipeline recovers known truth, **not** field accuracy; real hardware adapters (ESP32 CSI, mmWave, thermal IR) are a documented roadmap item, none validated in v0.1. The Python deterministic proof is unchanged (rufield is off the signal-processing proof path). ### Security +- **`homecore-automation` security review — two real DoS findings fixed (template unbounded-expansion + delay panic-on-config), each pinned by a fails-on-old test; condition-bypass / fail-closed / action-authz dimensions confirmed clean (ADR-129 §8a).** Beyond-SOTA review of the HA-compat automation engine (the execution/eval surface: triggers → conditions → actions, with user-config Jinja2 templates), un-covered by the ADR-154–159 sweep. **HC-SEC-01 (template DoS, HIGH):** a `template:` condition / `value_template` is user config and was rendered with MiniJinja's defaults — **no instruction budget, no output cap**. A single nested-loop condition rendered a **100 MB string in ~11 s on one render call** (measured) — the bfld-class unbounded expansion (MiniJinja's per-call `range()` 10k cap does **not** stop nesting). **Fixed** by enabling MiniJinja's `fuel` feature + `set_fuel(Some(1_000_000))` (the attack now fails fast ~90 ms with "engine ran out of fuel") and a 64 KiB source-length cap; legitimate templates unaffected. **HC-SEC-02 (panic-on-config DoS, MEDIUM):** `Action::Delay`/`WaitForTrigger` fed the user float straight into `Duration::from_secs_f64`, which **panics** on negative/NaN/inf/overflow — all reachable from a crafted or typo'd YAML (`delay: {seconds: -1}`, `.nan`, `.inf`, `1e308`), aborting the spawned run task (measured panic). **Fixed** by a `safe_duration_from_secs` guard that saturates (NaN/±inf/negative → `0`, matching HA's lenient "non-positive delay = no delay"; huge → clamped to ~100 yr). **Dimensions probed clean (evidence in ADR-129 §8a):** condition eval is **fail-closed** (template-render error → `false`; un-parseable `choose` branch condition → branch skipped, never silently passing); run-modes are **bounded** (Single/Restart/Queued/`max:N` — a self-triggering automation does not livelock, ADR-162 tests); templates are **read-only sandboxed** (no service-call/state-set global exposed to template scope, so a template cannot escalate to an action); no `unwrap`/`expect`/index panic reachable from a crafted config in the eval/exec path beyond the fixed `from_secs_f64`. Fails-on-old verified by reverting each fix in isolation (delay tests panic; template nested-loop test runs unbounded >60 s; oversized-source test fails). `cargo test -p homecore-automation --no-default-features`: **40 → 54 passed, 0 failed** (+14: 4 template-DoS, 1 no-regression render, 5 delay/wait + safe-duration unit). Workspace green; Python deterministic proof unchanged (homecore-automation is off the signal proof path). - **`cog-ha-matter` witness/manifest crypto review — engine-class signed-digest collision confirmed ABSENT (length-prefixing already correct); domain-separation tag ADDED + `verify_strict` HARDENED; key-handling & verify-before-trust confirmed clean (ADR-116 §2.2).** Beyond-SOTA crypto+security review of the Cognitum/HA-Matter bridge's SHA-256 + Ed25519 witness chain — the exact signing chain ADR-262 P2 proposes to reuse — un-covered by the ADR-154–159 sweep. **Top-priority check: the sibling `wifi-densepose-engine` bug class (unframed boundary-to-boundary concatenation of operator-influenceable strings into a signed/hashed digest).** Result reported honestly: **that bug class is ABSENT here** — `witness::canonical_bytes` already length-prefixes the two variable-length operator-influenceable fields (`kind_len:u32-be ‖ kind`, `payload_len:u32-be ‖ payload`) over fixed-width `prev_hash[32] ‖ seq:u64-be ‖ ts:u64-be`, an injective encoding (proven pre-existing by `canonical_bytes_length_prefixing_prevents_ambiguity`), and `witness_signing::sign_event`/`verify_signature` sign/verify the **identical** bytes the hash chain commits to (no separate unframed concatenation). The manifest `binary_signature` (Ed25519 over the fixed 64-hex-char `binary_sha256`) is signed **at build time by the Makefile**, not in-crate, and over a single fixed-length value — no in-crate manifest-signing concatenation surface. **Two real hardening gaps fixed, the first pinned by fails-on-old tests:** - **CHM-WIT-01 (missing domain-separation tag, LOW) — ADDED.** The engine review's prescribed fix is "domain-tag **+** length-prefix"; the length-prefix half was present, the **domain tag was absent**. The witness SHA-256 preimage / Ed25519 message carried no tag distinguishing it from any other signing context that shares key infrastructure — notably the manifest `binary_signature`, the very chain ADR-262 P2 reuses. **Fix:** prepend a versioned, NUL-terminated `WITNESS_DOMAIN_TAG = b"cog-ha-matter/witness-event/v1\x00"` to `canonical_bytes` (the doc-comment already anticipated a leading version migration). Cross-protocol separation now holds: a witness signature can never be replayed as a message for another Ed25519 context. **Witness-bytes change by design** (prior on-disk witness hashes/signatures invalidated, like the engine fix) — verified safe: **no in-repo crate consumes cog-ha-matter's witness bytes/signatures programmatically** (all references are doc-comment mentions; the crate is self-contained, no `use cog_ha_matter::` anywhere). Pinned by `canonical_bytes_is_domain_separated`, `canonical_bytes_starts_with_domain_tag_then_prev_hash`, `witness_preimage_cannot_collide_with_a_bare_manifest_digest` (witness.rs) and `signature_commits_to_domain_tag_not_bare_fields` (witness_signing.rs — a signature over the **un-tagged** field concatenation must NOT verify); the domain-separation guard **FAILED on the reverted un-tagged encoding** ("canonical message is not domain-separated"). - **CHM-WIT-02 (permissive Ed25519 verification, LOW) — HARDENED to `verify_strict`.** For a tamper-evident **audit** chain the signature is the attestation, so `verify_signature` now uses `VerifyingKey::verify_strict` (rejects non-canonical encodings + small-order public keys per RFC 8032) instead of the permissive `Verifier::verify` — giving auditors the "one canonical signature per event" property they rely on when comparing/deduplicating signed records. Not a forgery fix (the public key is caller-pinned, never parsed from the event), reported at true LOW severity. Guarded by `verify_uses_strict_path_and_pins_caller_key`. diff --git a/docs/adr/ADR-129-homecore-automation-engine.md b/docs/adr/ADR-129-homecore-automation-engine.md index f3085ce9..ec7fe99a 100644 --- a/docs/adr/ADR-129-homecore-automation-engine.md +++ b/docs/adr/ADR-129-homecore-automation-engine.md @@ -190,6 +190,23 @@ This is the same Wasmtime host already used for integration plugins (ADR-128) --- +## 8a. Security review (beyond-SOTA sweep, post ADR-154–159) + +A focused security review of `homecore-automation` (the execution/eval surface — triggers → conditions → actions, with templates) was run after the ADR-154–159 sweep, applying the same rigor that the sibling engine/bfld/calibration/vitals/geo reviews used. **Two real DoS findings, each pinned by a fails-on-old test; the condition-bypass, fail-closed-parsing, and action-authorization dimensions were probed and found clean.** + +- **HC-SEC-01 (template-injection / unbounded-expansion DoS, HIGH) — FIXED.** A `template:` condition / `value_template` is user automation config, and was rendered with MiniJinja's defaults: **no instruction budget, no output cap**. A single condition such as `{% for i in range(5000) %}{% for j in range(5000) %}xxxx{% endfor %}{% endfor %}` rendered a **100 MB string over ~11 s on one render call** (measured) — a CPU/memory denial of service (the bfld-class "unbounded expansion"; MiniJinja's per-call `range()` 10k cap does **not** stop nested loops). **Fix:** enable MiniJinja's `fuel` feature and set a per-render budget (`set_fuel(Some(1_000_000))`) so a nested loop burns one unit per iteration — the attack now fails fast (~90 ms) with "engine ran out of fuel"; plus a 64 KiB source-length cap rejecting pathological sources before compilation. Legitimate HA templates (a few dozen instructions) are unaffected. Pinned by `nested_loop_template_is_bounded_not_unbounded_dos`, `single_huge_repeat_template_is_bounded`, `oversized_template_source_is_rejected` (all fail-on-old: unbounded render / no rejection), and `legitimate_template_still_renders_within_fuel` (no regression). +- **HC-SEC-02 (panic-on-config DoS, MEDIUM) — FIXED.** `Action::Delay { seconds }` and `Action::WaitForTrigger { timeout_seconds }` fed the user-supplied float straight into `Duration::from_secs_f64`, which **panics** on negative, NaN, infinite, or overflowing inputs — all reachable from a crafted (or typo'd) YAML (`delay: {seconds: -1}`, `.nan`, `.inf`, `1e308`). One hostile config aborts the spawned automation run task with a panic (measured: "cannot convert float seconds to Duration: value is negative"). **Fix:** a `safe_duration_from_secs` guard that saturates instead of panicking (NaN/±inf/negative → `Duration::ZERO`, matching HA's lenient "non-positive delay = no delay"; absurdly large → clamped to ~100 years). Pinned by `delay_negative_seconds_does_not_panic`, `delay_nan_seconds_does_not_panic`, `delay_infinite_seconds_does_not_panic`, `wait_for_trigger_negative_timeout_does_not_panic`, `safe_duration_saturates_hostile_values` (incl. overflow clamp). + +**Dimensions confirmed clean (with evidence):** +- **Condition bypass / fail-closed eval** — a `Condition::Template` whose render errors evaluates to `false` (`condition.rs` `Err(_) => false`), and a `Choose` branch condition that fails to deserialize is treated as **non-matching** (the branch is skipped), not silently passing (`action.rs` `ChoiceBranch::matches` `Err(_) => return false`). Both fail **closed** (do-not-run), confirmed by the existing `choose_*` tests and template-false-blocks-action behavioral test. No true-by-default-on-parse-error path found. +- **Re-entrancy / livelock (DoS)** — run-mode machinery is bounded and tested: `Single`/`IgnoreFirst` re-entrancy guard, `Restart` cancel-and-replace, `Queued` FIFO serialization, and `max: N` semaphore cap (ADR-162; `restart_mode_cancels_prior_run`, `queued_mode_runs_sequentially_not_concurrently`, `max_two_caps_concurrency_at_two`, `single_mode_does_not_double_fire_on_rapid_triggers`). A self-triggering automation does not livelock the engine — each fire is bounded by its run-mode. +- **Action authorization** — templates are read-only sandboxed (`states`/`state_attr`/`is_state`/`now` globals; no service-call or state-set global is exposed to template scope), so a template cannot escalate into an action. Service authorization itself is enforced at the `homecore` service-registry boundary (out of this crate's scope); no gap found in what the automation crate enforces. +- **Panic-on-config (parse)** — `serde_yaml`/`serde_json` deserialization returns structured `AutomationError` (no `unwrap`/`expect`/index reachable from a crafted config in the eval/exec path); the only remaining panic surface was the `from_secs_f64` path fixed as HC-SEC-02. + +Validation: `cargo test -p homecore-automation --no-default-features` → 54 passed / 0 failed (+14 over baseline). Python deterministic proof unchanged (homecore-automation is off the signal-processing proof path). + +--- + ## 9. References ### HA upstream diff --git a/v2/crates/homecore-automation/Cargo.toml b/v2/crates/homecore-automation/Cargo.toml index 8d1449d1..d632075e 100644 --- a/v2/crates/homecore-automation/Cargo.toml +++ b/v2/crates/homecore-automation/Cargo.toml @@ -29,8 +29,10 @@ serde = { version = "1", features = ["derive"] } serde_yaml = "0.9" serde_json = "1" -# MiniJinja — HA-compatible Jinja2 template engine in pure Rust (ADR-129 §2.1) -minijinja = { version = "2", features = ["json", "loader"] } +# MiniJinja — HA-compatible Jinja2 template engine in pure Rust (ADR-129 §2.1). +# `fuel` bounds instruction count so a malicious `template:` condition cannot +# spin the engine with a nested-loop / huge-repeat DoS (HC-SEC-01). +minijinja = { version = "2", features = ["json", "loader", "fuel"] } # Error handling thiserror = "1" diff --git a/v2/crates/homecore-automation/src/action.rs b/v2/crates/homecore-automation/src/action.rs index d65d8abf..27c925af 100644 --- a/v2/crates/homecore-automation/src/action.rs +++ b/v2/crates/homecore-automation/src/action.rs @@ -70,6 +70,32 @@ impl ExecutionContext { } } +/// Upper bound for a `delay` / `wait_for_trigger` timeout, in seconds +/// (~100 years). Caps absurd values so `Duration::from_secs_f64` cannot +/// overflow-panic on e.g. `seconds: 1e308`, while still allowing any +/// realistic automation delay (HC-SEC-02). +const MAX_DELAY_SECS: f64 = 3.15e9; + +/// Convert a user-supplied seconds value into a `Duration` without +/// panicking (HC-SEC-02). +/// +/// `Duration::from_secs_f64` **panics** on negative, NaN, infinite, or +/// overflowing inputs. Those values are all reachable from a crafted +/// automation YAML (`delay: {seconds: -1}`, `.nan`, `.inf`, `1e308`), so a +/// single hostile config would crash the running automation task. We +/// instead saturate to a safe range — matching Home Assistant's lenient +/// treatment of a non-positive delay as "no delay": +/// +/// - non-finite (NaN / ±inf) → `0` +/// - negative → `0` +/// - above [`MAX_DELAY_SECS`] → clamped to the cap +fn safe_duration_from_secs(seconds: f64) -> Duration { + if !seconds.is_finite() || seconds <= 0.0 { + return Duration::ZERO; + } + Duration::from_secs_f64(seconds.min(MAX_DELAY_SECS)) +} + /// Action configuration. Deserialized from YAML `action:` blocks. #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(tag = "action", rename_all = "snake_case")] @@ -154,7 +180,10 @@ impl Action { Ok(result) } Action::Delay { seconds } => { - let dur = Duration::from_secs_f64(*seconds); + // `safe_duration_from_secs` guards against negative / + // NaN / infinite / overflowing values that would + // otherwise panic `Duration::from_secs_f64` (HC-SEC-02). + let dur = safe_duration_from_secs(*seconds); sleep(dur).await; Ok(serde_json::Value::Null) } @@ -172,7 +201,8 @@ impl Action { // P1 stub — just sleeps for the timeout duration if specified. // Full trigger subscription lands in P2. if let Some(secs) = timeout_seconds { - sleep(Duration::from_secs_f64(*secs)).await; + // Same non-panicking guard as `Delay` (HC-SEC-02). + sleep(safe_duration_from_secs(*secs)).await; } Ok(serde_json::Value::Null) } @@ -243,6 +273,68 @@ mod tests { assert!(result.is_null()); } + // ── HC-SEC-02: a crafted delay must not panic the run task ───────── + // + // `Duration::from_secs_f64` panics on negative / NaN / infinite / + // overflowing inputs, all reachable from a YAML `delay:` value. On the + // pre-fix code each of these aborts the spawned automation task with a + // panic; the guard saturates to a safe Duration instead. These tests + // fail on old (panic = test failure). + #[tokio::test] + async fn delay_negative_seconds_does_not_panic() { + let hc = HomeCore::new(); + let mut ctx = ExecutionContext::new(hc, "auto"); + let result = Action::Delay { seconds: -1.0 }.execute(&mut ctx).await; + assert!(result.is_ok(), "negative delay must be treated as 0, not panic"); + } + + #[tokio::test] + async fn delay_nan_seconds_does_not_panic() { + let hc = HomeCore::new(); + let mut ctx = ExecutionContext::new(hc, "auto"); + let result = Action::Delay { seconds: f64::NAN }.execute(&mut ctx).await; + assert!(result.is_ok(), "NaN delay must be treated as 0, not panic"); + } + + #[tokio::test] + async fn delay_infinite_seconds_does_not_panic() { + let hc = HomeCore::new(); + let mut ctx = ExecutionContext::new(hc, "auto"); + let result = Action::Delay { seconds: f64::INFINITY }.execute(&mut ctx).await; + assert!(result.is_ok(), "infinite delay must saturate to 0, not panic"); + } + + // Note: the overflow case (1e300) is covered by the synchronous + // `safe_duration_saturates_hostile_values` unit test below — executing + // `Action::Delay { seconds: 1e300 }` would genuinely sleep for the + // clamped (~100-year) duration, so we assert the conversion directly + // rather than through `execute`. + + #[tokio::test] + async fn wait_for_trigger_negative_timeout_does_not_panic() { + let hc = HomeCore::new(); + let mut ctx = ExecutionContext::new(hc, "auto"); + let result = Action::WaitForTrigger { timeout_seconds: Some(-5.0) } + .execute(&mut ctx) + .await; + assert!(result.is_ok(), "negative wait timeout must not panic"); + } + + #[test] + fn safe_duration_saturates_hostile_values() { + assert_eq!(safe_duration_from_secs(-1.0), Duration::ZERO); + assert_eq!(safe_duration_from_secs(f64::NAN), Duration::ZERO); + assert_eq!(safe_duration_from_secs(f64::INFINITY), Duration::ZERO); + assert_eq!(safe_duration_from_secs(f64::NEG_INFINITY), Duration::ZERO); + // legitimate value preserved + assert_eq!(safe_duration_from_secs(2.5), Duration::from_secs_f64(2.5)); + // huge value clamped to the cap, not overflow-panicked + assert_eq!( + safe_duration_from_secs(1e300), + Duration::from_secs_f64(MAX_DELAY_SECS) + ); + } + #[tokio::test] async fn service_call_unregistered_returns_error() { let hc = HomeCore::new(); diff --git a/v2/crates/homecore-automation/src/template.rs b/v2/crates/homecore-automation/src/template.rs index fe1d6196..ab8fedeb 100644 --- a/v2/crates/homecore-automation/src/template.rs +++ b/v2/crates/homecore-automation/src/template.rs @@ -13,6 +13,26 @@ use homecore::{EntityId, StateMachine}; use crate::error::AutomationError; +/// Instruction budget for a single template render (HC-SEC-01). +/// +/// Templates come from user automation config; without a bound a single +/// `template:` condition like +/// `{% for i in range(10000) %}{% for j in range(10000) %}x{% endfor %}{% endfor %}` +/// renders a multi-gigabyte string and pins a CPU for tens of seconds — +/// a memory/CPU denial-of-service (the bfld-class "unbounded expansion"). +/// MiniJinja's `fuel` feature charges ~1 unit per VM instruction; a +/// nested loop burns one unit per iteration, so the budget caps total +/// work regardless of how the loops are nested. 1,000,000 instructions is +/// far more than any legitimate HA template needs (a typical condition is +/// a few dozen) while killing the attack in well under a second. +const TEMPLATE_FUEL: u64 = 1_000_000; + +/// Hard cap on the source length of a template (HC-SEC-01, defense in +/// depth). A legitimate HA `value_template` is a one-liner; anything past +/// 64 KiB is rejected before compilation so a pathological source string +/// can neither be compiled nor emitted verbatim. +const MAX_TEMPLATE_SOURCE_BYTES: usize = 64 * 1024; + /// MiniJinja environment pre-loaded with HA-compatible globals. /// /// Constructed once per `AutomationEngine` and shared via `Arc`. The @@ -27,6 +47,10 @@ impl TemplateEnvironment { pub fn new(states: Arc) -> Self { let mut env = Environment::new(); + // Bound per-render work so a hostile `template:` condition cannot + // DoS the engine via nested loops / huge repeats (HC-SEC-01). + env.set_fuel(Some(TEMPLATE_FUEL)); + // --- states(entity_id) --- // Returns the current state string of an entity, or "unavailable". let states_sm = Arc::clone(&states); @@ -88,7 +112,21 @@ impl TemplateEnvironment { } /// Render a template string and return the string output. + /// + /// Renders are bounded by an instruction budget ([`TEMPLATE_FUEL`]) and + /// a source-length cap ([`MAX_TEMPLATE_SOURCE_BYTES`]); a malicious + /// template that exhausts the budget returns a [`AutomationError::TemplateRender`] + /// error rather than running unbounded (HC-SEC-01). pub fn render(&self, template_str: &str) -> Result { + // Reject pathologically large sources before compilation (defense + // in depth — fuel already bounds runtime work). + if template_str.len() > MAX_TEMPLATE_SOURCE_BYTES { + return Err(AutomationError::TemplateRender(format!( + "template source too large: {} bytes (max {})", + template_str.len(), + MAX_TEMPLATE_SOURCE_BYTES + ))); + } // Wrap bare expressions like `{{ states('light.kitchen') }}` // in a minimal template wrapper. let tmpl = self @@ -191,4 +229,68 @@ mod tests { assert!(!env.render_bool("0").unwrap()); assert!(!env.render_bool("off").unwrap()); } + + // ── HC-SEC-01: template DoS is bounded by fuel ───────────────────── + // + // A `template:` condition is user config. Before the fuel bound a + // nested-loop template rendered a multi-GB string over ~11 s (proven + // empirically). With fuel enabled it must fail FAST with an error + // instead of expanding unboundedly. On the pre-fix code (no `fuel` + // feature / `set_fuel`) this render succeeds and burns CPU+RAM, so + // this test fails on old (it would `Ok` and exceed the time bound). + #[test] + fn nested_loop_template_is_bounded_not_unbounded_dos() { + use std::time::Instant; + let sm = Arc::new(StateMachine::new()); + let env = TemplateEnvironment::new(sm); + // 5000 * 5000 = 25M iterations on the old engine (~100 MB, ~11 s). + let malicious = + "{% for i in range(5000) %}{% for j in range(5000) %}xxxx{% endfor %}{% endfor %}"; + let start = Instant::now(); + let result = env.render(malicious); + let elapsed = start.elapsed(); + assert!( + result.is_err(), + "malicious nested-loop template must be rejected (ran out of fuel), got Ok" + ); + assert!( + elapsed.as_secs() < 3, + "bounded render must fail fast; took {elapsed:?} (unbounded DoS on old engine)" + ); + } + + // ── HC-SEC-01: a single huge repeat is also bounded ──────────────── + #[test] + fn single_huge_repeat_template_is_bounded() { + let sm = Arc::new(StateMachine::new()); + let env = TemplateEnvironment::new(sm); + // range() caps at 10k per call, but multiplied bodies still need a + // bound; drive enough instructions to exhaust fuel via deep nesting. + let malicious = "{% for a in range(9999) %}{% for b in range(9999) %}\ + {% for c in range(9999) %}z{% endfor %}{% endfor %}{% endfor %}"; + let result = env.render(malicious); + assert!(result.is_err(), "deeply nested loops must exhaust fuel and error"); + } + + // ── HC-SEC-01: oversized template source is rejected pre-compile ─── + #[test] + fn oversized_template_source_is_rejected() { + let sm = Arc::new(StateMachine::new()); + let env = TemplateEnvironment::new(sm); + // 128 KiB of literal text — exceeds MAX_TEMPLATE_SOURCE_BYTES. + let big = "x".repeat(128 * 1024); + let result = env.render(&big); + assert!(result.is_err(), "oversized template source must be rejected"); + } + + // ── A legitimate small template still renders fine within budget ─── + #[test] + fn legitimate_template_still_renders_within_fuel() { + let sm = sm_with("light.kitchen", "on", serde_json::json!({})); + let env = TemplateEnvironment::new(sm); + // A normal HA condition with a modest loop — well under budget. + let ok = "{% for i in range(50) %}{{ states('light.kitchen') }}{% endfor %}"; + let out = env.render(ok).expect("legitimate template must render"); + assert!(out.contains("on")); + } }