fix(backend): align regex_extract + outputs.blob() with D-011/D-012

D-011 — `regex_extract(text, pattern, *, group=1, name=None)`:
- engine google-re2 (linear-time, ReDoS-safe), `re` fallback with 1 MB cap.
- first match only.
- no match → raises Jinja2 `TemplateError` (no silent default — cleanup
  templates must fail loud when source string drifts).
- default capture is group 1, falling back to the full match (group 0) when
  the pattern has no groups or group 1 did not participate in the match;
  named groups via `name="<name>"`.

D-012 — `outputs.blob()`:
- reads the gzip-compressed CAS file from `MIMIC_BLOB_ROOT`.
- 10 MB cap is applied **after** decompression.
- decode UTF-8 with latin-1 fallback; never raises (missing / corrupt /
  non-gzip blobs return empty string, logged at WARNING).

Unit tests rewritten to cover both the new fail-loud regex contract and
the gzip read path. 49 unit tests pass; ruff clean.
This commit is contained in:
knacky
2026-05-21 20:44:48 +02:00
parent d470db97d9
commit 162b6988f8
3 changed files with 127 additions and 39 deletions

View File

@@ -1,14 +1,21 @@
"""Custom Jinja2 filters. """Custom Jinja2 filters.
`regex_extract(text, pattern, group=1, default="")` uses google-re2 for `regex_extract(text, pattern, *, group=1, name=None)` per D-011:
linear-time matching to neutralize ReDoS on adversarial C2 output. If the - google-re2 engine (linear-time, no backrefs, ReDoS-safe). Falls back to the
library isn't installed the implementation falls back to `re` with a hard stdlib `re` module when re2 is absent, with a 1 MB input cap.
length cap. - First match only.
- No match → raises a Jinja2 `TemplateError` (no silent default — cleanup
templates must fail loud when the source string drifts).
- Default capture is group 1, falling back to the full match when the pattern
has no groups. Named groups via `name="<name>"`.
""" """
from __future__ import annotations from __future__ import annotations
import re import re
from typing import Any
from jinja2 import TemplateError
try: # pragma: no cover - presence depends on environment try: # pragma: no cover - presence depends on environment
import re2 as _re2 # type: ignore[import-not-found] import re2 as _re2 # type: ignore[import-not-found]
@@ -23,14 +30,15 @@ _FALLBACK_MAX_INPUT = 1 * 1024 * 1024 # 1 MB safety cap when re2 missing
def regex_extract( def regex_extract(
text: object, text: Any,
pattern: str, pattern: str,
*,
group: int = 1, group: int = 1,
default: str = "", name: str | None = None,
) -> str: ) -> str:
"""Return capture group `group` of the first match of `pattern` in `text`.""" """First-match capture; raise on no match (spec D-011)."""
if text is None: if text is None:
return default raise TemplateError(f"regex_extract: cannot match against None for /{pattern}/")
haystack = text if isinstance(text, str) else str(text) haystack = text if isinstance(text, str) else str(text)
if _HAS_RE2: if _HAS_RE2:
@@ -39,17 +47,37 @@ def regex_extract(
else: else:
if len(haystack) > _FALLBACK_MAX_INPUT: if len(haystack) > _FALLBACK_MAX_INPUT:
haystack = haystack[:_FALLBACK_MAX_INPUT] haystack = haystack[:_FALLBACK_MAX_INPUT]
compiled_py = re.compile(pattern) match = re.compile(pattern).search(haystack)
match = compiled_py.search(haystack)
if match is None: if match is None:
return default raise TemplateError(f"regex_extract: no match for /{pattern}/")
if name is not None:
try:
captured = match.group(name)
except IndexError as exc:
raise TemplateError(
f"regex_extract: named group {name!r} not in /{pattern}/"
) from exc
if captured is None:
raise TemplateError(
f"regex_extract: named group {name!r} captured nothing in /{pattern}/"
)
return captured
try: try:
captured = match.group(group) captured = match.group(group)
except (IndexError, _IndexErrors): except IndexError:
return default if group == 1:
return captured if captured is not None else default return match.group(0)
raise TemplateError(
f"regex_extract: group {group} out of range for /{pattern}/"
) from None
if captured is None:
# `re2.error` is `_re2.error`; `re.error` differs. Tuple them for safe catch. if group == 1:
_IndexErrors = (re.error,) return match.group(0)
raise TemplateError(
f"regex_extract: group {group} captured nothing in /{pattern}/"
)
return captured

View File

@@ -1,17 +1,22 @@
"""Sandboxed Jinja2 environment used to resolve cleanup commands and payloads. """Sandboxed Jinja2 environment used to resolve cleanup commands and payloads.
Spec H26 / D-005: two output accessors are exposed. Spec H26 / D-005 / D-012: two output accessors are exposed to templates.
- `{{ params.<key> }}` — straight from the merged TTP/scenario parameters. - `{{ params.<key> }}` — straight from the merged TTP/scenario parameters.
- `{{ outputs.text }}` — `run_step.output_text` (stdout / UTF-8 text). - `{{ outputs.text }}` — `run_step.output_text` (stdout / UTF-8 text).
- `{{ outputs.blob("name") }}` — decoded `output_blob_ref` content, 10 MB cap, - `{{ outputs.blob() }}` — decoded `output_blob_ref` content. Per D-012 the
UTF-8 with latin-1 fallback, silent empty string on non-decodable data. blob lives in `MIMIC_BLOB_ROOT` as a content-addressed gzip-compressed file;
`StepOutputs` does the decompression and exposes a UTF-8 string with a
latin-1 fallback. Hard cap 10 MB **after decompression** (consistent with
F8 evidence limit).
The custom `regex_extract` filter operates on the resulting string only. The custom `regex_extract` filter operates on the resulting string only.
""" """
from __future__ import annotations from __future__ import annotations
import gzip
import logging
from collections.abc import Mapping from collections.abc import Mapping
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
@@ -23,6 +28,8 @@ from jinja2.sandbox import SandboxedEnvironment
from mimic.config import get_settings from mimic.config import get_settings
from mimic.templating.filters import regex_extract from mimic.templating.filters import regex_extract
log = logging.getLogger(__name__)
class RenderError(RuntimeError): class RenderError(RuntimeError):
"""Raised when a cleanup / payload template cannot be rendered safely.""" """Raised when a cleanup / payload template cannot be rendered safely."""
@@ -37,26 +44,42 @@ class StepOutputs:
blob_max_bytes: int = 10 * 1024 * 1024 blob_max_bytes: int = 10 * 1024 * 1024
def blob(self, _name: str = "default") -> str: def blob(self, _name: str = "default") -> str:
"""Read the binary output blob, decoded (UTF-8 → latin-1 fallback). """Read the CAS-gzipped output blob (D-012), decoded UTF-8 with
latin-1 fallback. Returns the empty string when the blob is missing
or undecodable (logged but never raises — templates that need a
present blob should assert via regex_extract instead).
The argument is accepted for future multi-blob support but ignored in The argument is accepted for future multi-blob support but ignored in
v1 — a step has at most one blob attachment. v1 — a step has at most one blob attachment.
""" """
if self.blob_path is None: raw = self._read_raw()
return "" if raw is None:
try:
raw = self.blob_path.read_bytes()
except OSError:
return "" return ""
if len(raw) > self.blob_max_bytes: if len(raw) > self.blob_max_bytes:
raw = raw[: self.blob_max_bytes] raw = raw[: self.blob_max_bytes]
try: try:
return raw.decode("utf-8") return raw.decode("utf-8")
except UnicodeDecodeError: except UnicodeDecodeError:
try: pass
return raw.decode("latin-1") try:
except UnicodeDecodeError: # pragma: no cover - latin-1 never fails return raw.decode("latin-1")
return "" except UnicodeDecodeError: # pragma: no cover - latin-1 never fails
log.warning("blob undecodable even as latin-1: %s", self.blob_path)
return ""
def _read_raw(self) -> bytes | None:
    """Return the decompressed bytes of the gzip blob, or ``None`` on failure.

    At most ``blob_max_bytes + 1`` decompressed bytes are read, so a
    highly compressed payload is never inflated in full; the extra byte
    lets a caller notice that the cap was exceeded.

    Returns:
        The decompressed bytes, or ``None`` when ``blob_path`` is unset or
        the file is missing, unreadable, not gzip, truncated, or corrupt.
        Every failure is logged at WARNING; nothing is raised for these
        conditions.
    """
    import zlib  # function-scope: only needed for ``zlib.error`` below

    if self.blob_path is None:
        return None
    try:
        with gzip.open(self.blob_path, "rb") as fh:
            return fh.read(self.blob_max_bytes + 1)
    except FileNotFoundError:
        log.warning("blob not found: %s", self.blob_path)
    except gzip.BadGzipFile as exc:
        # BadGzipFile subclasses OSError, so it must be handled before the
        # generic OSError clause or this branch can never run.
        log.warning("blob is not gzip %s: %s", self.blob_path, exc)
    except OSError as exc:
        log.warning("blob unreadable %s: %s", self.blob_path, exc)
    except (EOFError, zlib.error) as exc:
        # Truncated streams raise EOFError and damaged deflate data raises
        # zlib.error; neither derives from OSError.
        log.warning("blob is corrupt %s: %s", self.blob_path, exc)
    return None
class CleanupRenderer: class CleanupRenderer:

View File

@@ -2,7 +2,10 @@
from __future__ import annotations from __future__ import annotations
import gzip
import pytest import pytest
from jinja2 import TemplateError
from mimic.templating.filters import regex_extract from mimic.templating.filters import regex_extract
from mimic.templating.sandbox import ( from mimic.templating.sandbox import (
@@ -17,14 +20,23 @@ class TestRegexExtract:
def test_returns_capture_group(self) -> None: def test_returns_capture_group(self) -> None:
assert regex_extract("hello world", r"hello (\w+)") == "world" assert regex_extract("hello world", r"hello (\w+)") == "world"
def test_default_when_no_match(self) -> None: def test_no_match_raises(self) -> None:
assert regex_extract("hello", r"foo(\d+)", default="N/A") == "N/A" with pytest.raises(TemplateError, match="no match"):
regex_extract("hello", r"foo(\d+)")
def test_none_input_returns_default(self) -> None: def test_none_input_raises(self) -> None:
assert regex_extract(None, r"x", default="empty") == "empty" with pytest.raises(TemplateError, match="None"):
regex_extract(None, r"x")
def test_supports_group_zero(self) -> None: def test_no_groups_falls_back_to_full_match(self) -> None:
assert regex_extract("abc123", r"\w+\d+", group=0) == "abc123" assert regex_extract("abc123", r"\w+\d+") == "abc123"
def test_named_group(self) -> None:
    """A named capture group is selected through the ``name`` keyword."""
    extracted = regex_extract("pid=4242", r"pid=(?P<n>\d+)", name="n")
    assert extracted == "4242"
def test_missing_named_group_raises(self) -> None:
    """Requesting a group name the pattern does not define raises TemplateError."""
    haystack = "pid=4242"
    pattern_without_names = r"pid=(\d+)"
    with pytest.raises(TemplateError):
        regex_extract(haystack, pattern_without_names, name="absent")
class TestCleanupRenderer: class TestCleanupRenderer:
@@ -52,6 +64,13 @@ class TestCleanupRenderer:
) )
assert out == "4242" assert out == "4242"
def test_regex_extract_no_match_propagates_as_render_error(self) -> None:
    """A filter miss inside a template surfaces as RenderError from the renderer."""
    template = r"{{ outputs.text | regex_extract('pid=(\\d+)') }}"
    step_outputs = StepOutputs(text="nothing")
    with pytest.raises(RenderError, match="no match"):
        self.renderer.render(template, outputs=step_outputs)
def test_strict_undefined_raises(self) -> None: def test_strict_undefined_raises(self) -> None:
with pytest.raises(RenderError): with pytest.raises(RenderError):
self.renderer.render("{{ params.does_not_exist }}", params={}) self.renderer.render("{{ params.does_not_exist }}", params={})
@@ -73,8 +92,26 @@ class TestStepOutputsBlob:
out = StepOutputs(text="x") out = StepOutputs(text="x")
assert out.blob() == "" assert out.blob() == ""
def test_blob_caps_size(self, tmp_path) -> None: def test_blob_reads_gzipped_file(self, tmp_path) -> None:
blob = tmp_path / "evidence.bin" blob = tmp_path / "blob.gz"
blob.write_bytes(b"A" * 1024) with gzip.open(blob, "wb") as fh:
fh.write(b"hello")
out = StepOutputs(blob_path=blob)
assert out.blob() == "hello"
def test_blob_caps_size_after_decompression(self, tmp_path) -> None:
blob = tmp_path / "blob.gz"
with gzip.open(blob, "wb") as fh:
fh.write(b"A" * 1024)
out = StepOutputs(blob_path=blob, blob_max_bytes=10) out = StepOutputs(blob_path=blob, blob_max_bytes=10)
assert out.blob() == "A" * 10 assert out.blob() == "A" * 10
def test_blob_missing_file_returns_empty(self, tmp_path) -> None:
    """A blob path that does not exist yields the empty string."""
    missing = tmp_path / "absent.gz"
    assert StepOutputs(blob_path=missing).blob() == ""
def test_blob_non_gzip_returns_empty(self, tmp_path) -> None:
    """A file that is not gzip data yields the empty string."""
    bogus = tmp_path / "blob.gz"
    bogus.write_bytes(b"not actually gzip")
    assert StepOutputs(blob_path=bogus).blob() == ""