feat(backend): add content-addressed gzip blob store (D-012)

Two on-disk pools per D-012:
- `MIMIC_BLOB_ROOT` (default `/var/lib/mimic/blobs/`) holds C2 polling
  output blobs, content-addressed gzip layout `<aa>/<bb>/<sha256>.gz`.
- `MIMIC_EVIDENCE_ROOT` (default `/var/lib/mimic/evidence/`) reserved for
  user-uploaded evidence (flat per-engagement, no compression). Wired only
  in config + .env.example here; F8 endpoint lands later.

`mimic.storage.blob`:
- `blob_path(root, sha256_hex)` validates the digest and returns the CAS
  path. Raises ValueError on a malformed digest (length != 64 or non-hex).
- `store_blob(root, data)` hashes, gzip-compresses, atomically writes to
  `<aa>/<bb>/<sha256>.gz` (0o750 dir perms, 0o640 file perms). Idempotent:
  duplicate writes leave mtime untouched.

Five new unit tests cover the happy path, deduplication, idempotency,
malformed-digest rejection, and the two-level `<aa>/<bb>` directory layout.
This commit is contained in:
knacky
2026-05-21 20:44:59 +02:00
parent 162b6988f8
commit 12d131c826
5 changed files with 123 additions and 0 deletions

View File

@@ -3,6 +3,7 @@
from __future__ import annotations
from functools import lru_cache
from pathlib import Path
from typing import Literal
from pydantic import Field, SecretStr, field_validator
@@ -45,6 +46,10 @@ class Settings(BaseSettings):
template_render_timeout_ms: int = 250
output_blob_max_bytes: int = 10 * 1024 * 1024
# D-012: two on-disk pools.
blob_root: Path = Path("/var/lib/mimic/blobs")
evidence_root: Path = Path("/var/lib/mimic/evidence")
@field_validator("cors_origins", mode="before")
@classmethod
def _split_cors(cls, value: object) -> object:

View File

@@ -0,0 +1,14 @@
"""Local file-system pools (D-012).
Two separate roots, configured via env:
- `MIMIC_BLOB_ROOT` (default `/var/lib/mimic/blobs/`) holds C2 polling output
blobs, content-addressed and gzip-compressed: `<aa>/<bb>/<sha256>.gz` where
`aa` and `bb` are the first and second bytes of the digest, i.e. hex
characters 0-1 and 2-3 of the lowercase hex string.
- `MIMIC_EVIDENCE_ROOT` (default `/var/lib/mimic/evidence/`) holds user-uploaded
evidence files, flat layout `<engagement_id>/<evidence_id>.<ext>`, no
compression.
"""
from mimic.storage.blob import blob_path, store_blob
# Public API of the storage package, re-exported from `mimic.storage.blob`.
__all__ = ["blob_path", "store_blob"]

View File

@@ -0,0 +1,51 @@
"""Content-addressed gzip-compressed blob store (D-012)."""
from __future__ import annotations

import gzip
import hashlib
import os
import stat
import tempfile
from pathlib import Path

# A SHA-256 digest is 32 bytes, i.e. 64 characters once hex-encoded.
_SHA256_HEX_LEN = 64
def _validate_digest(sha256_hex: str) -> str:
    """Return `sha256_hex` lower-cased, or raise if it is not a sha256 hex digest.

    A digest is accepted when it is exactly `_SHA256_HEX_LEN` characters long
    and, after lower-casing, consists only of the characters `0-9a-f`.

    Raises:
        ValueError: If the length is wrong or a non-hex character is present.
    """
    # Length is checked first so that `.lower()` is only ever called on
    # inputs that already have the expected size.
    if len(sha256_hex) == _SHA256_HEX_LEN:
        lowered = sha256_hex.lower()
        if all(ch in "0123456789abcdef" for ch in lowered):
            return lowered
    raise ValueError(f"invalid sha256 digest: {sha256_hex!r}")
def blob_path(root: Path | str, sha256_hex: str) -> Path:
    """Return the content-addressed location `<root>/<aa>/<bb>/<digest>.gz`.

    `aa` and `bb` are hex characters 0-1 and 2-3 of the lower-cased digest.
    The result is rooted at `root` as given; nothing is created on disk.

    Raises:
        ValueError: If `sha256_hex` is not a well-formed sha256 hex digest.
    """
    normalized = _validate_digest(sha256_hex)
    first_level, second_level = normalized[:2], normalized[2:4]
    return Path(root).joinpath(first_level, second_level, normalized + ".gz")
def store_blob(root: Path | str, data: bytes) -> tuple[str, Path]:
    """Write `data` (gzip-compressed) under its sha256 digest path.

    Idempotent: an existing path with the same digest is not overwritten.
    Both shard directories (`<aa>` and `<aa>/<bb>`) are set to `0750` and the
    blob file to `0640`, so only the owner and the owning group can read.

    The payload is first written to a uniquely named temporary file in the
    destination directory and then renamed into place, so readers never see
    a partially written blob and concurrent writers of the same content
    cannot clobber each other's temporary file.

    Args:
        root: Blob pool root; `<aa>/<bb>/` is created beneath it on demand.
        data: Raw, uncompressed payload.

    Returns:
        `(digest, path)`: the lowercase hex sha256 of `data` and the path of
        the stored `.gz` file.

    Raises:
        OSError: If a directory or file cannot be created, chmod-ed or renamed.
    """
    digest = hashlib.sha256(data).hexdigest()
    target = blob_path(root, digest)
    if target.exists():
        return digest, target

    shard_dir = target.parent
    shard_dir.mkdir(parents=True, exist_ok=True)
    # 0o750: owner full, group r-x, others none. `mkdir(parents=True)` creates
    # intermediate directories with the umask default, so both shard levels
    # are tightened explicitly. The pool root itself is left untouched.
    dir_mode = stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP
    for directory in (shard_dir.parent, shard_dir):
        os.chmod(directory, dir_mode)  # noqa: S103

    # A unique temp name in the *same* directory keeps the final rename on
    # one filesystem (atomic on POSIX) and avoids collisions between writers.
    fd, tmp_name = tempfile.mkstemp(
        dir=shard_dir, prefix=f"{digest}.gz.", suffix=".tmp"
    )
    tmp = Path(tmp_name)
    try:
        # Empty filename and mtime=0 keep the gzip header free of per-write
        # data, so identical content always yields identical bytes on disk.
        with os.fdopen(fd, "wb") as raw, gzip.GzipFile(
            filename="", mode="wb", fileobj=raw, mtime=0
        ) as gz:
            gz.write(data)
        # mkstemp creates the file 0o600; widen to 0o640 before publishing.
        os.chmod(tmp, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP)
        tmp.replace(target)
    except BaseException:
        # Never leave a stray temp file behind on failure or interruption.
        tmp.unlink(missing_ok=True)
        raise
    return digest, target