fix(backend): align regex_extract + outputs.blob() with D-011/D-012

D-011 — `regex_extract(text, pattern, *, group=1, name=None)`:
- engine google-re2 (linear-time, ReDoS-safe); when re2 is absent, falls back
  to stdlib `re` with the input truncated to its first 1 Mi (1024*1024) characters.
- first match only.
- no match → raises Jinja2 `TemplateError` (no silent default — cleanup
  templates must fail loud when source string drifts).
- default capture is group 1 with fallback to group(0) when the pattern has
  no groups; named groups via `name="<name>"`.

D-012 — `outputs.blob()`:
- reads the gzip-compressed CAS file from `MIMIC_BLOB_ROOT`.
- 10 MB cap is applied **after** decompression.
- decode UTF-8 with latin-1 fallback; never raises (missing / corrupt /
  non-gzip blobs return empty string, logged at WARNING).

Unit tests rewritten to cover both the new fail-loud regex contract and
the gzip read path. 49 unit tests pass; ruff clean.
This commit is contained in:
knacky
2026-05-21 20:44:48 +02:00
parent d470db97d9
commit 162b6988f8
3 changed files with 127 additions and 39 deletions

View File

@@ -1,14 +1,21 @@
"""Custom Jinja2 filters.
`regex_extract(text, pattern, group=1, default="")` uses google-re2 for
linear-time matching to neutralize ReDoS on adversarial C2 output. If the
library isn't installed the implementation falls back to `re` with a hard
length cap.
`regex_extract(text, pattern, *, group=1, name=None)` per D-011:
- google-re2 engine (linear-time, no backrefs, ReDoS-safe). Falls back to the
stdlib `re` module when re2 is absent, with a 1 MB input cap.
- First match only.
- No match → raises a Jinja2 `TemplateError` (no silent default — cleanup
templates must fail loud when the source string drifts).
- Default capture is group 1, falling back to the full match when the pattern
has no groups. Named groups via `name="<name>"`.
"""
from __future__ import annotations
import re
from typing import Any
from jinja2 import TemplateError
try: # pragma: no cover - presence depends on environment
import re2 as _re2 # type: ignore[import-not-found]
@@ -23,14 +30,15 @@ _FALLBACK_MAX_INPUT = 1 * 1024 * 1024 # 1 MB safety cap when re2 missing
def regex_extract(
text: object,
text: Any,
pattern: str,
*,
group: int = 1,
default: str = "",
name: str | None = None,
) -> str:
"""Return capture group `group` of the first match of `pattern` in `text`."""
"""First-match capture; raise on no match (spec D-011)."""
if text is None:
return default
raise TemplateError(f"regex_extract: cannot match against None for /{pattern}/")
haystack = text if isinstance(text, str) else str(text)
if _HAS_RE2:
@@ -39,17 +47,37 @@ def regex_extract(
else:
if len(haystack) > _FALLBACK_MAX_INPUT:
haystack = haystack[:_FALLBACK_MAX_INPUT]
compiled_py = re.compile(pattern)
match = compiled_py.search(haystack)
match = re.compile(pattern).search(haystack)
if match is None:
return default
raise TemplateError(f"regex_extract: no match for /{pattern}/")
if name is not None:
try:
captured = match.group(name)
except IndexError as exc:
raise TemplateError(
f"regex_extract: named group {name!r} not in /{pattern}/"
) from exc
if captured is None:
raise TemplateError(
f"regex_extract: named group {name!r} captured nothing in /{pattern}/"
)
return captured
try:
captured = match.group(group)
except (IndexError, _IndexErrors):
return default
return captured if captured is not None else default
except IndexError:
if group == 1:
return match.group(0)
raise TemplateError(
f"regex_extract: group {group} out of range for /{pattern}/"
) from None
# `re2.error` is `_re2.error`; `re.error` differs. Tuple them for safe catch.
_IndexErrors = (re.error,)
if captured is None:
if group == 1:
return match.group(0)
raise TemplateError(
f"regex_extract: group {group} captured nothing in /{pattern}/"
)
return captured

View File

@@ -1,17 +1,22 @@
"""Sandboxed Jinja2 environment used to resolve cleanup commands and payloads.
Spec H26 / D-005: two output accessors are exposed.
Spec H26 / D-005 / D-012: two output accessors are exposed to templates.
- `{{ params.<key> }}` — straight from the merged TTP/scenario parameters.
- `{{ outputs.text }}` — `run_step.output_text` (stdout / UTF-8 text).
- `{{ outputs.blob("name") }}` — decoded `output_blob_ref` content, 10 MB cap,
UTF-8 with latin-1 fallback, silent empty string on non-decodable data.
- `{{ outputs.blob() }}` — decoded `output_blob_ref` content. Per D-012 the
blob lives in `MIMIC_BLOB_ROOT` as a content-addressed gzip-compressed file;
`StepOutputs` does the decompression and exposes a UTF-8 string with a
latin-1 fallback. Hard cap 10 MB **after decompression** (consistent with
F8 evidence limit).
The custom `regex_extract` filter operates on the resulting string only.
"""
from __future__ import annotations
import gzip
import logging
from collections.abc import Mapping
from dataclasses import dataclass
from pathlib import Path
@@ -23,6 +28,8 @@ from jinja2.sandbox import SandboxedEnvironment
from mimic.config import get_settings
from mimic.templating.filters import regex_extract
log = logging.getLogger(__name__)
class RenderError(RuntimeError):
"""Raised when a cleanup / payload template cannot be rendered safely."""
@@ -37,26 +44,42 @@ class StepOutputs:
blob_max_bytes: int = 10 * 1024 * 1024
def blob(self, _name: str = "default") -> str:
"""Read the binary output blob, decoded (UTF-8 → latin-1 fallback).
"""Read the CAS-gzipped output blob (D-012), decoded UTF-8 with
latin-1 fallback. Returns the empty string when the blob is missing
or undecodable (logged but never raises — templates that need a
present blob should assert via regex_extract instead).
The argument is accepted for future multi-blob support but ignored in
v1 — a step has at most one blob attachment.
"""
if self.blob_path is None:
return ""
try:
raw = self.blob_path.read_bytes()
except OSError:
raw = self._read_raw()
if raw is None:
return ""
if len(raw) > self.blob_max_bytes:
raw = raw[: self.blob_max_bytes]
try:
return raw.decode("utf-8")
except UnicodeDecodeError:
try:
return raw.decode("latin-1")
except UnicodeDecodeError: # pragma: no cover - latin-1 never fails
return ""
pass
try:
return raw.decode("latin-1")
except UnicodeDecodeError: # pragma: no cover - latin-1 never fails
log.warning("blob undecodable even as latin-1: %s", self.blob_path)
return ""
def _read_raw(self) -> bytes | None:
    """Return the decompressed blob bytes, or ``None`` if they cannot be read.

    Opens ``self.blob_path`` as a gzip stream and reads at most
    ``self.blob_max_bytes + 1`` decompressed bytes. The read is bounded so a
    highly compressed file is never fully inflated in memory; the one extra
    byte lets a caller tell an oversized blob from one exactly at the cap.

    Returns:
        The decompressed bytes (possibly empty), or ``None`` when
        ``blob_path`` is ``None`` or the file is missing, unreadable, not
        gzip, truncated, or corrupt. Every failure is logged at WARNING and
        never propagated.
    """
    # Function-scope import: zlib is only needed to name its error type.
    import zlib

    if self.blob_path is None:
        return None
    try:
        with gzip.open(self.blob_path, "rb") as fh:
            return fh.read(self.blob_max_bytes + 1)
    except FileNotFoundError:
        log.warning("blob not found: %s", self.blob_path)
    except gzip.BadGzipFile as exc:
        # BadGzipFile subclasses OSError, so it must be matched before the
        # generic OSError handler or this branch is unreachable.
        log.warning("blob is not gzip %s: %s", self.blob_path, exc)
    except OSError as exc:
        log.warning("blob unreadable %s: %s", self.blob_path, exc)
    except (EOFError, zlib.error) as exc:
        # A truncated stream raises EOFError and damaged deflate data raises
        # zlib.error; neither derives from OSError.
        log.warning("blob is truncated or corrupt %s: %s", self.blob_path, exc)
    return None
class CleanupRenderer: