fix(backend): align regex_extract + outputs.blob() with D-011/D-012
D-011 — `regex_extract(text, pattern, *, group=1, name=None)`:
- Engine: google-re2 (linear-time, ReDoS-safe), with an `re` fallback with a 1 MB cap.
- First match only.
- No match → raises Jinja2 `TemplateError` (no silent default — cleanup templates must fail loudly when the source string drifts).
- Default capture is group 1, with a fallback to group(0) when the pattern has no groups; named groups via `name="<name>"`.

D-012 — `outputs.blob()`:
- Reads the gzip-compressed CAS file from `MIMIC_BLOB_ROOT`.
- The 10 MB cap is applied **after** decompression.
- Decodes UTF-8 with a latin-1 fallback; never raises (missing / corrupt / non-gzip blobs return an empty string, logged at WARNING).

Unit tests rewritten to cover both the new fail-loud regex contract and the gzip read path. 49 unit tests pass; ruff clean.
This commit is contained in:
@@ -2,7 +2,10 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import gzip
|
||||
|
||||
import pytest
|
||||
from jinja2 import TemplateError
|
||||
|
||||
from mimic.templating.filters import regex_extract
|
||||
from mimic.templating.sandbox import (
|
||||
@@ -17,14 +20,23 @@ class TestRegexExtract:
|
||||
def test_returns_capture_group(self) -> None:
|
||||
assert regex_extract("hello world", r"hello (\w+)") == "world"
|
||||
|
||||
def test_default_when_no_match(self) -> None:
|
||||
assert regex_extract("hello", r"foo(\d+)", default="N/A") == "N/A"
|
||||
def test_no_match_raises(self) -> None:
|
||||
with pytest.raises(TemplateError, match="no match"):
|
||||
regex_extract("hello", r"foo(\d+)")
|
||||
|
||||
def test_none_input_returns_default(self) -> None:
|
||||
assert regex_extract(None, r"x", default="empty") == "empty"
|
||||
def test_none_input_raises(self) -> None:
|
||||
with pytest.raises(TemplateError, match="None"):
|
||||
regex_extract(None, r"x")
|
||||
|
||||
def test_supports_group_zero(self) -> None:
|
||||
assert regex_extract("abc123", r"\w+\d+", group=0) == "abc123"
|
||||
def test_no_groups_falls_back_to_full_match(self) -> None:
|
||||
assert regex_extract("abc123", r"\w+\d+") == "abc123"
|
||||
|
||||
def test_named_group(self) -> None:
|
||||
assert regex_extract("pid=4242", r"pid=(?P<n>\d+)", name="n") == "4242"
|
||||
|
||||
def test_missing_named_group_raises(self) -> None:
|
||||
with pytest.raises(TemplateError):
|
||||
regex_extract("pid=4242", r"pid=(\d+)", name="absent")
|
||||
|
||||
|
||||
class TestCleanupRenderer:
|
||||
@@ -52,6 +64,13 @@ class TestCleanupRenderer:
|
||||
)
|
||||
assert out == "4242"
|
||||
|
||||
def test_regex_extract_no_match_propagates_as_render_error(self) -> None:
|
||||
with pytest.raises(RenderError, match="no match"):
|
||||
self.renderer.render(
|
||||
r"{{ outputs.text | regex_extract('pid=(\\d+)') }}",
|
||||
outputs=StepOutputs(text="nothing"),
|
||||
)
|
||||
|
||||
def test_strict_undefined_raises(self) -> None:
|
||||
with pytest.raises(RenderError):
|
||||
self.renderer.render("{{ params.does_not_exist }}", params={})
|
||||
@@ -73,8 +92,26 @@ class TestStepOutputsBlob:
|
||||
out = StepOutputs(text="x")
|
||||
assert out.blob() == ""
|
||||
|
||||
def test_blob_caps_size(self, tmp_path) -> None:
|
||||
blob = tmp_path / "evidence.bin"
|
||||
blob.write_bytes(b"A" * 1024)
|
||||
def test_blob_reads_gzipped_file(self, tmp_path) -> None:
|
||||
blob = tmp_path / "blob.gz"
|
||||
with gzip.open(blob, "wb") as fh:
|
||||
fh.write(b"hello")
|
||||
out = StepOutputs(blob_path=blob)
|
||||
assert out.blob() == "hello"
|
||||
|
||||
def test_blob_caps_size_after_decompression(self, tmp_path) -> None:
|
||||
blob = tmp_path / "blob.gz"
|
||||
with gzip.open(blob, "wb") as fh:
|
||||
fh.write(b"A" * 1024)
|
||||
out = StepOutputs(blob_path=blob, blob_max_bytes=10)
|
||||
assert out.blob() == "A" * 10
|
||||
|
||||
def test_blob_missing_file_returns_empty(self, tmp_path) -> None:
|
||||
out = StepOutputs(blob_path=tmp_path / "absent.gz")
|
||||
assert out.blob() == ""
|
||||
|
||||
def test_blob_non_gzip_returns_empty(self, tmp_path) -> None:
|
||||
blob = tmp_path / "blob.gz"
|
||||
blob.write_bytes(b"not actually gzip")
|
||||
out = StepOutputs(blob_path=blob)
|
||||
assert out.blob() == ""
|
||||
|
||||
Reference in New Issue
Block a user