fix(backend): align regex_extract + outputs.blob() with D-011/D-012
D-011 — `regex_extract(text, pattern, *, group=1, name=None)`:
- engine: google-re2 (linear-time, ReDoS-safe), with a stdlib `re` fallback capped at 1 MB of input.
- first match only.
- no match → raises Jinja2 `TemplateError` (no silent default — cleanup templates must fail loud when the source string drifts).
- default capture is group 1, with fallback to `group(0)` when the pattern has no groups; named groups via `name="<name>"`.

D-012 — `outputs.blob()`:
- reads the gzip-compressed CAS file from `MIMIC_BLOB_ROOT`.
- the 10 MB cap is applied **after** decompression.
- decodes UTF-8 with a latin-1 fallback; never raises (missing / corrupt / non-gzip blobs return an empty string, logged at WARNING).

Unit tests rewritten to cover both the new fail-loud regex contract and the gzip read path. 49 unit tests pass; ruff clean.
This commit is contained in:
@@ -1,14 +1,21 @@
|
|||||||
"""Custom Jinja2 filters.
|
"""Custom Jinja2 filters.
|
||||||
|
|
||||||
`regex_extract(text, pattern, group=1, default="")` uses google-re2 for
|
`regex_extract(text, pattern, *, group=1, name=None)` per D-011:
|
||||||
linear-time matching to neutralize ReDoS on adversarial C2 output. If the
|
- google-re2 engine (linear-time, no backrefs, ReDoS-safe). Falls back to the
|
||||||
library isn't installed the implementation falls back to `re` with a hard
|
stdlib `re` module when re2 is absent, with a 1 MB input cap.
|
||||||
length cap.
|
- First match only.
|
||||||
|
- No match → raises a Jinja2 `TemplateError` (no silent default — cleanup
|
||||||
|
templates must fail loud when the source string drifts).
|
||||||
|
- Default capture is group 1, falling back to the full match when the pattern
|
||||||
|
has no groups. Named groups via `name="<name>"`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from jinja2 import TemplateError
|
||||||
|
|
||||||
try: # pragma: no cover - presence depends on environment
|
try: # pragma: no cover - presence depends on environment
|
||||||
import re2 as _re2 # type: ignore[import-not-found]
|
import re2 as _re2 # type: ignore[import-not-found]
|
||||||
@@ -23,14 +30,15 @@ _FALLBACK_MAX_INPUT = 1 * 1024 * 1024 # 1 MB safety cap when re2 missing
|
|||||||
|
|
||||||
|
|
||||||
def regex_extract(
|
def regex_extract(
|
||||||
text: object,
|
text: Any,
|
||||||
pattern: str,
|
pattern: str,
|
||||||
|
*,
|
||||||
group: int = 1,
|
group: int = 1,
|
||||||
default: str = "",
|
name: str | None = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Return capture group `group` of the first match of `pattern` in `text`."""
|
"""First-match capture; raise on no match (spec D-011)."""
|
||||||
if text is None:
|
if text is None:
|
||||||
return default
|
raise TemplateError(f"regex_extract: cannot match against None for /{pattern}/")
|
||||||
haystack = text if isinstance(text, str) else str(text)
|
haystack = text if isinstance(text, str) else str(text)
|
||||||
|
|
||||||
if _HAS_RE2:
|
if _HAS_RE2:
|
||||||
@@ -39,17 +47,37 @@ def regex_extract(
|
|||||||
else:
|
else:
|
||||||
if len(haystack) > _FALLBACK_MAX_INPUT:
|
if len(haystack) > _FALLBACK_MAX_INPUT:
|
||||||
haystack = haystack[:_FALLBACK_MAX_INPUT]
|
haystack = haystack[:_FALLBACK_MAX_INPUT]
|
||||||
compiled_py = re.compile(pattern)
|
match = re.compile(pattern).search(haystack)
|
||||||
match = compiled_py.search(haystack)
|
|
||||||
|
|
||||||
if match is None:
|
if match is None:
|
||||||
return default
|
raise TemplateError(f"regex_extract: no match for /{pattern}/")
|
||||||
|
|
||||||
|
if name is not None:
|
||||||
|
try:
|
||||||
|
captured = match.group(name)
|
||||||
|
except IndexError as exc:
|
||||||
|
raise TemplateError(
|
||||||
|
f"regex_extract: named group {name!r} not in /{pattern}/"
|
||||||
|
) from exc
|
||||||
|
if captured is None:
|
||||||
|
raise TemplateError(
|
||||||
|
f"regex_extract: named group {name!r} captured nothing in /{pattern}/"
|
||||||
|
)
|
||||||
|
return captured
|
||||||
|
|
||||||
try:
|
try:
|
||||||
captured = match.group(group)
|
captured = match.group(group)
|
||||||
except (IndexError, _IndexErrors):
|
except IndexError:
|
||||||
return default
|
if group == 1:
|
||||||
return captured if captured is not None else default
|
return match.group(0)
|
||||||
|
raise TemplateError(
|
||||||
|
f"regex_extract: group {group} out of range for /{pattern}/"
|
||||||
|
) from None
|
||||||
|
|
||||||
|
if captured is None:
|
||||||
# `re2.error` is `_re2.error`; `re.error` differs. Tuple them for safe catch.
|
if group == 1:
|
||||||
_IndexErrors = (re.error,)
|
return match.group(0)
|
||||||
|
raise TemplateError(
|
||||||
|
f"regex_extract: group {group} captured nothing in /{pattern}/"
|
||||||
|
)
|
||||||
|
return captured
|
||||||
|
|||||||
@@ -1,17 +1,22 @@
|
|||||||
"""Sandboxed Jinja2 environment used to resolve cleanup commands and payloads.
|
"""Sandboxed Jinja2 environment used to resolve cleanup commands and payloads.
|
||||||
|
|
||||||
Spec H26 / D-005: two output accessors are exposed.
|
Spec H26 / D-005 / D-012: two output accessors are exposed to templates.
|
||||||
|
|
||||||
- `{{ params.<key> }}` — straight from the merged TTP/scenario parameters.
|
- `{{ params.<key> }}` — straight from the merged TTP/scenario parameters.
|
||||||
- `{{ outputs.text }}` — `run_step.output_text` (stdout / UTF-8 text).
|
- `{{ outputs.text }}` — `run_step.output_text` (stdout / UTF-8 text).
|
||||||
- `{{ outputs.blob("name") }}` — decoded `output_blob_ref` content, 10 MB cap,
|
- `{{ outputs.blob() }}` — decoded `output_blob_ref` content. Per D-012 the
|
||||||
UTF-8 with latin-1 fallback, silent empty string on non-decodable data.
|
blob lives in `MIMIC_BLOB_ROOT` as a content-addressed gzip-compressed file;
|
||||||
|
`StepOutputs` does the decompression and exposes a UTF-8 string with a
|
||||||
|
latin-1 fallback. Hard cap 10 MB **after decompression** (consistent with
|
||||||
|
F8 evidence limit).
|
||||||
|
|
||||||
The custom `regex_extract` filter operates on the resulting string only.
|
The custom `regex_extract` filter operates on the resulting string only.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
import logging
|
||||||
from collections.abc import Mapping
|
from collections.abc import Mapping
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -23,6 +28,8 @@ from jinja2.sandbox import SandboxedEnvironment
|
|||||||
from mimic.config import get_settings
|
from mimic.config import get_settings
|
||||||
from mimic.templating.filters import regex_extract
|
from mimic.templating.filters import regex_extract
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class RenderError(RuntimeError):
|
class RenderError(RuntimeError):
|
||||||
"""Raised when a cleanup / payload template cannot be rendered safely."""
|
"""Raised when a cleanup / payload template cannot be rendered safely."""
|
||||||
@@ -37,26 +44,42 @@ class StepOutputs:
|
|||||||
blob_max_bytes: int = 10 * 1024 * 1024
|
blob_max_bytes: int = 10 * 1024 * 1024
|
||||||
|
|
||||||
def blob(self, _name: str = "default") -> str:
|
def blob(self, _name: str = "default") -> str:
|
||||||
"""Read the binary output blob, decoded (UTF-8 → latin-1 fallback).
|
"""Read the CAS-gzipped output blob (D-012), decoded UTF-8 with
|
||||||
|
latin-1 fallback. Returns the empty string when the blob is missing
|
||||||
|
or undecodable (logged but never raises — templates that need a
|
||||||
|
present blob should assert via regex_extract instead).
|
||||||
|
|
||||||
The argument is accepted for future multi-blob support but ignored in
|
The argument is accepted for future multi-blob support but ignored in
|
||||||
v1 — a step has at most one blob attachment.
|
v1 — a step has at most one blob attachment.
|
||||||
"""
|
"""
|
||||||
if self.blob_path is None:
|
raw = self._read_raw()
|
||||||
return ""
|
if raw is None:
|
||||||
try:
|
|
||||||
raw = self.blob_path.read_bytes()
|
|
||||||
except OSError:
|
|
||||||
return ""
|
return ""
|
||||||
if len(raw) > self.blob_max_bytes:
|
if len(raw) > self.blob_max_bytes:
|
||||||
raw = raw[: self.blob_max_bytes]
|
raw = raw[: self.blob_max_bytes]
|
||||||
try:
|
try:
|
||||||
return raw.decode("utf-8")
|
return raw.decode("utf-8")
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
try:
|
pass
|
||||||
return raw.decode("latin-1")
|
try:
|
||||||
except UnicodeDecodeError: # pragma: no cover - latin-1 never fails
|
return raw.decode("latin-1")
|
||||||
return ""
|
except UnicodeDecodeError: # pragma: no cover - latin-1 never fails
|
||||||
|
log.warning("blob undecodable even as latin-1: %s", self.blob_path)
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _read_raw(self) -> bytes | None:
|
||||||
|
if self.blob_path is None:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
with gzip.open(self.blob_path, "rb") as fh:
|
||||||
|
return fh.read(self.blob_max_bytes + 1)
|
||||||
|
except FileNotFoundError:
|
||||||
|
log.warning("blob not found: %s", self.blob_path)
|
||||||
|
except OSError as exc:
|
||||||
|
log.warning("blob unreadable %s: %s", self.blob_path, exc)
|
||||||
|
except gzip.BadGzipFile as exc:
|
||||||
|
log.warning("blob is not gzip %s: %s", self.blob_path, exc)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
class CleanupRenderer:
|
class CleanupRenderer:
|
||||||
|
|||||||
@@ -2,7 +2,10 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from jinja2 import TemplateError
|
||||||
|
|
||||||
from mimic.templating.filters import regex_extract
|
from mimic.templating.filters import regex_extract
|
||||||
from mimic.templating.sandbox import (
|
from mimic.templating.sandbox import (
|
||||||
@@ -17,14 +20,23 @@ class TestRegexExtract:
|
|||||||
def test_returns_capture_group(self) -> None:
|
def test_returns_capture_group(self) -> None:
|
||||||
assert regex_extract("hello world", r"hello (\w+)") == "world"
|
assert regex_extract("hello world", r"hello (\w+)") == "world"
|
||||||
|
|
||||||
def test_default_when_no_match(self) -> None:
|
def test_no_match_raises(self) -> None:
|
||||||
assert regex_extract("hello", r"foo(\d+)", default="N/A") == "N/A"
|
with pytest.raises(TemplateError, match="no match"):
|
||||||
|
regex_extract("hello", r"foo(\d+)")
|
||||||
|
|
||||||
def test_none_input_returns_default(self) -> None:
|
def test_none_input_raises(self) -> None:
|
||||||
assert regex_extract(None, r"x", default="empty") == "empty"
|
with pytest.raises(TemplateError, match="None"):
|
||||||
|
regex_extract(None, r"x")
|
||||||
|
|
||||||
def test_supports_group_zero(self) -> None:
|
def test_no_groups_falls_back_to_full_match(self) -> None:
|
||||||
assert regex_extract("abc123", r"\w+\d+", group=0) == "abc123"
|
assert regex_extract("abc123", r"\w+\d+") == "abc123"
|
||||||
|
|
||||||
|
def test_named_group(self) -> None:
|
||||||
|
assert regex_extract("pid=4242", r"pid=(?P<n>\d+)", name="n") == "4242"
|
||||||
|
|
||||||
|
def test_missing_named_group_raises(self) -> None:
|
||||||
|
with pytest.raises(TemplateError):
|
||||||
|
regex_extract("pid=4242", r"pid=(\d+)", name="absent")
|
||||||
|
|
||||||
|
|
||||||
class TestCleanupRenderer:
|
class TestCleanupRenderer:
|
||||||
@@ -52,6 +64,13 @@ class TestCleanupRenderer:
|
|||||||
)
|
)
|
||||||
assert out == "4242"
|
assert out == "4242"
|
||||||
|
|
||||||
|
def test_regex_extract_no_match_propagates_as_render_error(self) -> None:
|
||||||
|
with pytest.raises(RenderError, match="no match"):
|
||||||
|
self.renderer.render(
|
||||||
|
r"{{ outputs.text | regex_extract('pid=(\\d+)') }}",
|
||||||
|
outputs=StepOutputs(text="nothing"),
|
||||||
|
)
|
||||||
|
|
||||||
def test_strict_undefined_raises(self) -> None:
|
def test_strict_undefined_raises(self) -> None:
|
||||||
with pytest.raises(RenderError):
|
with pytest.raises(RenderError):
|
||||||
self.renderer.render("{{ params.does_not_exist }}", params={})
|
self.renderer.render("{{ params.does_not_exist }}", params={})
|
||||||
@@ -73,8 +92,26 @@ class TestStepOutputsBlob:
|
|||||||
out = StepOutputs(text="x")
|
out = StepOutputs(text="x")
|
||||||
assert out.blob() == ""
|
assert out.blob() == ""
|
||||||
|
|
||||||
def test_blob_caps_size(self, tmp_path) -> None:
|
def test_blob_reads_gzipped_file(self, tmp_path) -> None:
|
||||||
blob = tmp_path / "evidence.bin"
|
blob = tmp_path / "blob.gz"
|
||||||
blob.write_bytes(b"A" * 1024)
|
with gzip.open(blob, "wb") as fh:
|
||||||
|
fh.write(b"hello")
|
||||||
|
out = StepOutputs(blob_path=blob)
|
||||||
|
assert out.blob() == "hello"
|
||||||
|
|
||||||
|
def test_blob_caps_size_after_decompression(self, tmp_path) -> None:
|
||||||
|
blob = tmp_path / "blob.gz"
|
||||||
|
with gzip.open(blob, "wb") as fh:
|
||||||
|
fh.write(b"A" * 1024)
|
||||||
out = StepOutputs(blob_path=blob, blob_max_bytes=10)
|
out = StepOutputs(blob_path=blob, blob_max_bytes=10)
|
||||||
assert out.blob() == "A" * 10
|
assert out.blob() == "A" * 10
|
||||||
|
|
||||||
|
def test_blob_missing_file_returns_empty(self, tmp_path) -> None:
|
||||||
|
out = StepOutputs(blob_path=tmp_path / "absent.gz")
|
||||||
|
assert out.blob() == ""
|
||||||
|
|
||||||
|
def test_blob_non_gzip_returns_empty(self, tmp_path) -> None:
|
||||||
|
blob = tmp_path / "blob.gz"
|
||||||
|
blob.write_bytes(b"not actually gzip")
|
||||||
|
out = StepOutputs(blob_path=blob)
|
||||||
|
assert out.blob() == ""
|
||||||
|
|||||||
Reference in New Issue
Block a user