2026-05-22 05:24:25 +02:00
|
|
|
"""Content-addressed gzip blob store (D-012, MA2 streaming)."""
|
feat(backend): add content-addressed gzip blob store (D-012)
Two on-disk pools per D-012:
- `MIMIC_BLOB_ROOT` (default `/var/lib/mimic/blobs/`) holds C2 polling
output blobs, content-addressed gzip layout `<aa>/<bb>/<sha256>.gz`.
- `MIMIC_EVIDENCE_ROOT` (default `/var/lib/mimic/evidence/`) reserved for
user-uploaded evidence (flat per-engagement, no compression). Wired only
in config + .env.example here; F8 endpoint lands later.
`mimic.storage.blob`:
- `blob_path(root, sha256_hex)` validates the digest and returns the CAS
path. Raises ValueError on a malformed digest (length != 64 or non-hex).
- `store_blob(root, data)` hashes, gzip-compresses, atomically writes to
`<aa>/<bb>/<sha256>.gz` (0o750 dir perms, 0o640 file perms). Idempotent:
duplicate writes leave mtime untouched.
5 new unit tests cover happy path, deduplication, idempotency, malformed
digest, and the two-byte-pair directory layout.
2026-05-21 20:44:59 +02:00
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import gzip
|
|
|
|
|
import hashlib
|
2026-05-22 05:24:25 +02:00
|
|
|
import io
|
feat(backend): add content-addressed gzip blob store (D-012)
Two on-disk pools per D-012:
- `MIMIC_BLOB_ROOT` (default `/var/lib/mimic/blobs/`) holds C2 polling
output blobs, content-addressed gzip layout `<aa>/<bb>/<sha256>.gz`.
- `MIMIC_EVIDENCE_ROOT` (default `/var/lib/mimic/evidence/`) reserved for
user-uploaded evidence (flat per-engagement, no compression). Wired only
in config + .env.example here; F8 endpoint lands later.
`mimic.storage.blob`:
- `blob_path(root, sha256_hex)` validates the digest and returns the CAS
path. Raises ValueError on a malformed digest (length != 64 or non-hex).
- `store_blob(root, data)` hashes, gzip-compresses, atomically writes to
`<aa>/<bb>/<sha256>.gz` (0o750 dir perms, 0o640 file perms). Idempotent:
duplicate writes leave mtime untouched.
5 new unit tests cover happy path, deduplication, idempotency, malformed
digest, and the two-byte-pair directory layout.
2026-05-21 20:44:59 +02:00
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
2026-05-22 05:24:25 +02:00
|
|
|
from mimic.storage.blob import BlobTooLarge, blob_path, store_blob
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _stream(data: bytes) -> io.BytesIO:
    """Return a fresh in-memory binary stream holding *data*, positioned at offset 0."""
    buffer = io.BytesIO()
    buffer.write(data)
    # Rewind so the consumer reads the payload from the start.
    buffer.seek(0)
    return buffer
|
feat(backend): add content-addressed gzip blob store (D-012)
Two on-disk pools per D-012:
- `MIMIC_BLOB_ROOT` (default `/var/lib/mimic/blobs/`) holds C2 polling
output blobs, content-addressed gzip layout `<aa>/<bb>/<sha256>.gz`.
- `MIMIC_EVIDENCE_ROOT` (default `/var/lib/mimic/evidence/`) reserved for
user-uploaded evidence (flat per-engagement, no compression). Wired only
in config + .env.example here; F8 endpoint lands later.
`mimic.storage.blob`:
- `blob_path(root, sha256_hex)` validates the digest and returns the CAS
path. Raises ValueError on a malformed digest (length != 64 or non-hex).
- `store_blob(root, data)` hashes, gzip-compresses, atomically writes to
`<aa>/<bb>/<sha256>.gz` (0o750 dir perms, 0o640 file perms). Idempotent:
duplicate writes leave mtime untouched.
5 new unit tests cover happy path, deduplication, idempotency, malformed
digest, and the two-byte-pair directory layout.
2026-05-21 20:44:59 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_blob_path_uses_two_byte_pairs(tmp_path) -> None:
    """blob_path nests a blob under directories named after its first two hex pairs."""
    first_pair, second_pair = "ab", "cd"
    # 2 + 2 + 60 characters = a full 64-character hex digest.
    digest = first_pair + second_pair + "ef" * 30
    expected = tmp_path / first_pair / second_pair / f"{digest}.gz"

    assert blob_path(tmp_path, digest) == expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_blob_path_rejects_invalid_digest(tmp_path) -> None:
    """blob_path raises ValueError for anything that is not a 64-character hex digest."""
    # This input breaks both rules at once (wrong length and non-hex); it is also
    # the case that pins the wording of the error message.
    with pytest.raises(ValueError, match="invalid sha256"):
        blob_path(tmp_path, "not-a-digest")

    # Exercise each rejection rule on its own so that one check cannot silently
    # regress while the other still rejects the input.
    malformed_digests = (
        "",  # empty string
        "ab" * 31,  # valid hex, but 62 characters
        "ab" * 33,  # valid hex, but 66 characters
        "g" * 64,  # correct length, not hexadecimal
    )
    for malformed in malformed_digests:
        with pytest.raises(ValueError):
            blob_path(tmp_path, malformed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_store_blob_writes_gzip_and_returns_digest(tmp_path) -> None:
    """store_blob returns the payload's SHA-256 and a gzip file at the matching CAS path."""
    payload = b"hello world\n"
    expected = hashlib.sha256(payload).hexdigest()
    expected_path = tmp_path / expected[:2] / expected[2:4] / f"{expected}.gz"

    digest, path = store_blob(tmp_path, _stream(payload))

    assert digest == expected
    assert path == expected_path
    # The stored file must decompress back to exactly what was streamed in.
    assert gzip.decompress(path.read_bytes()) == payload
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_store_blob_is_idempotent(tmp_path) -> None:
    """Storing the same content twice yields the same blob and leaves its mtime unchanged."""
    payload = b"same content"

    first_digest, first_path = store_blob(tmp_path, _stream(payload))
    written_at = first_path.stat().st_mtime_ns

    second_digest, second_path = store_blob(tmp_path, _stream(payload))

    assert (first_digest, first_path) == (second_digest, second_path)
    # An unchanged nanosecond mtime shows the second call did not rewrite the file.
    assert second_path.stat().st_mtime_ns == written_at
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_store_blob_dedupes_distinct_payloads(tmp_path) -> None:
    """Two different payloads are stored at two different paths, and both files exist.

    Despite the name, this checks that distinct content does NOT collapse
    into a single blob.
    """
    _, alpha_path = store_blob(tmp_path, _stream(b"alpha"))
    _, beta_path = store_blob(tmp_path, _stream(b"beta"))

    assert alpha_path != beta_path
    for stored in (alpha_path, beta_path):
        assert stored.exists()
|
2026-05-22 05:24:25 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_store_blob_raises_when_stream_exceeds_cap(tmp_path) -> None:
    """A stream one byte over ``max_bytes`` raises BlobTooLarge and leaves no temp file."""
    cap = 1024
    too_big = b"A" * (cap + 1)

    with pytest.raises(BlobTooLarge):
        store_blob(tmp_path, _stream(too_big), max_bytes=cap)

    # No tmp file left behind. Search recursively: a top-level listing alone
    # would miss a temp file stranded inside a shard subdirectory.
    leftovers = sorted(tmp_path.rglob(".tmp-*"))
    assert leftovers == []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_store_blob_handles_large_stream_in_chunks(tmp_path) -> None:
    """A payload much larger than one 64 KB chunk is hashed and stored intact."""
    # 1.5 MB payload — exercises the multi-chunk path (chunks are 64 KB).
    chunk_size = 64 * 1024
    payload = b"X" * (chunk_size * 24)
    cap = 2 * 1024 * 1024  # comfortably above the payload size

    digest, path = store_blob(tmp_path, _stream(payload), max_bytes=cap)

    assert digest == hashlib.sha256(payload).hexdigest()
    with gzip.open(path, "rb") as stored:
        assert stored.read() == payload
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_store_blob_rejects_zero_or_negative_max(tmp_path) -> None:
    """store_blob raises ValueError mentioning ``max_bytes`` for a non-positive cap."""
    # Cover both halves of the contract this test is named for: zero and negative.
    for bad_cap in (0, -1):
        with pytest.raises(ValueError, match="max_bytes"):
            store_blob(tmp_path, _stream(b"x"), max_bytes=bad_cap)
|