feat(backend): add content-addressed gzip blob store (D-012)
Two on-disk pools per D-012: - `MIMIC_BLOB_ROOT` (default `/var/lib/mimic/blobs/`) holds C2 polling output blobs, content-addressed gzip layout `<aa>/<bb>/<sha256>.gz`. - `MIMIC_EVIDENCE_ROOT` (default `/var/lib/mimic/evidence/`) reserved for user-uploaded evidence (flat per-engagement, no compression). Wired only in config + .env.example here; F8 endpoint lands later. `mimic.storage.blob`: - `blob_path(root, sha256_hex)` validates the digest and returns the CAS path. Raises ValueError on a malformed digest (length != 64 or non-hex). - `store_blob(root, data)` hashes, gzip-compresses, atomically writes to `<aa>/<bb>/<sha256>.gz` (0o750 dir perms, 0o640 file perms). Idempotent: duplicate writes leave mtime untouched. 5 new unit tests cover happy path, deduplication, idempotency, malformed digest, and the two-byte-pair directory layout.
This commit is contained in:
@@ -21,3 +21,7 @@ MIMIC_CORS_ORIGINS=http://localhost:5173
|
|||||||
# Logging
|
# Logging
|
||||||
MIMIC_LOG_LEVEL=DEBUG
|
MIMIC_LOG_LEVEL=DEBUG
|
||||||
MIMIC_LOG_JSON=false
|
MIMIC_LOG_JSON=false
|
||||||
|
|
||||||
|
# Storage pools (D-012)
|
||||||
|
MIMIC_BLOB_ROOT=/var/lib/mimic/blobs
|
||||||
|
MIMIC_EVIDENCE_ROOT=/var/lib/mimic/evidence
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
|
from pathlib import Path
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
from pydantic import Field, SecretStr, field_validator
|
from pydantic import Field, SecretStr, field_validator
|
||||||
@@ -45,6 +46,10 @@ class Settings(BaseSettings):
|
|||||||
template_render_timeout_ms: int = 250
|
template_render_timeout_ms: int = 250
|
||||||
output_blob_max_bytes: int = 10 * 1024 * 1024
|
output_blob_max_bytes: int = 10 * 1024 * 1024
|
||||||
|
|
||||||
|
# D-012: two on-disk pools.
|
||||||
|
blob_root: Path = Path("/var/lib/mimic/blobs")
|
||||||
|
evidence_root: Path = Path("/var/lib/mimic/evidence")
|
||||||
|
|
||||||
@field_validator("cors_origins", mode="before")
|
@field_validator("cors_origins", mode="before")
|
||||||
@classmethod
|
@classmethod
|
||||||
def _split_cors(cls, value: object) -> object:
|
def _split_cors(cls, value: object) -> object:
|
||||||
|
|||||||
14
backend/src/mimic/storage/__init__.py
Normal file
14
backend/src/mimic/storage/__init__.py
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
"""Local file-system pools (D-012).
|
||||||
|
|
||||||
|
Two separate roots, configured via env:
|
||||||
|
- `MIMIC_BLOB_ROOT` (default `/var/lib/mimic/blobs/`) holds C2 polling output
|
||||||
|
blobs, content-addressed and gzip-compressed: `<aa>/<bb>/<sha256>.gz` where
|
||||||
|
`aa` and `bb` are the first two hex-character pairs (i.e. the first two bytes) of the digest.
|
||||||
|
- `MIMIC_EVIDENCE_ROOT` (default `/var/lib/mimic/evidence/`) holds user-uploaded
|
||||||
|
evidence files, flat layout `<engagement_id>/<evidence_id>.<ext>`, no
|
||||||
|
compression.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from mimic.storage.blob import blob_path, store_blob
|
||||||
|
|
||||||
|
__all__ = ["blob_path", "store_blob"]
|
||||||
51
backend/src/mimic/storage/blob.py
Normal file
51
backend/src/mimic/storage/blob.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
"""Content-addressed gzip-compressed blob store (D-012)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
import hashlib
|
||||||
|
import os
|
||||||
|
import stat
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
_SHA256_HEX_LEN = 64
# Lower-case hex alphabet, built once so validation is a single set comparison.
_HEX_DIGITS = frozenset("0123456789abcdef")


def _validate_digest(sha256_hex: str) -> str:
    """Return `sha256_hex` lower-cased, rejecting anything that is not a sha256 hex digest.

    A valid digest is a `str` of exactly 64 hexadecimal characters (either
    case). The lower-cased form is returned so upper- and lower-case spellings
    of the same digest map to the same on-disk path.

    Raises:
        ValueError: if the value is not a `str`, has the wrong length, or
            contains a non-hex character.
    """
    # Non-str values (None, bytes, ...) would otherwise escape as TypeError or
    # AttributeError from len()/.lower(); report them as malformed digests too.
    if not isinstance(sha256_hex, str):
        raise ValueError(f"invalid sha256 digest: {sha256_hex!r}")
    normalized = sha256_hex.lower()
    if len(sha256_hex) != _SHA256_HEX_LEN or not _HEX_DIGITS.issuperset(normalized):
        raise ValueError(f"invalid sha256 digest: {sha256_hex!r}")
    return normalized


def blob_path(root: Path | str, sha256_hex: str) -> Path:
    """Return the path of the gzip-compressed blob, `<root>/<aa>/<bb>/<digest>.gz`.

    `aa` and `bb` are the first and second pairs of hex characters of the
    lower-cased digest. The result is absolute only if `root` is; nothing is
    created on disk and the path is not resolved.

    Raises:
        ValueError: if `sha256_hex` is not a well-formed sha256 hex digest.
    """
    digest = _validate_digest(sha256_hex)
    return Path(root) / digest[0:2] / digest[2:4] / f"{digest}.gz"
|
||||||
|
|
||||||
|
|
||||||
|
def store_blob(root: Path | str, data: bytes) -> tuple[str, Path]:
    """Write `data` (gzip-compressed) under its sha256 digest path.

    The blob lands at `<root>/<aa>/<bb>/<sha256>.gz`. Returns the hex digest
    and the final path.

    Idempotent: an existing path with the same digest is not overwritten.
    Both shard directories (`<aa>` and `<aa>/<bb>`) are set to `0750` and the
    blob file to `0640`, so only the owner and the `mimic` group can read.

    The payload is written to a uniquely named temporary file in the target
    directory and then renamed into place, so readers never see a partial blob
    and concurrent writers of the same payload cannot clobber each other's
    temporary file. The temporary file is removed if anything fails.
    """
    import tempfile

    digest = hashlib.sha256(data).hexdigest()
    target = blob_path(root, digest)
    if target.exists():
        return digest, target

    # 0o750: owner full, group r-x, others none. The blob root is owned by the
    # `mimic` system user; only the application and any explicit group member
    # (audit / backup) get read access.
    dir_mode = stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP
    # 0o640: owner rw, group r, others none.
    file_mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP

    # mkdir(parents=True) creates intermediate directories with umask-default
    # permissions, so each shard level is created and chmod'ed explicitly.
    for shard_dir in (target.parent.parent, target.parent):
        shard_dir.mkdir(parents=True, exist_ok=True)
        os.chmod(shard_dir, dir_mode)  # noqa: S103

    # A unique temp name in the same directory keeps the final rename atomic
    # and avoids collisions between concurrent writers of the same digest.
    fd, tmp_name = tempfile.mkstemp(
        dir=target.parent, prefix=f"{digest}.", suffix=".gz.tmp"
    )
    tmp = Path(tmp_name)
    try:
        # filename="" and mtime=0 keep the gzip header free of the temp name
        # and the wall-clock time, so identical content yields identical bytes.
        with os.fdopen(fd, "wb") as raw, gzip.GzipFile(
            filename="", mode="wb", fileobj=raw, mtime=0
        ) as gz:
            gz.write(data)
        os.chmod(tmp, file_mode)
        tmp.replace(target)
    except BaseException:
        # Do not leave a stray temp file behind on any failure.
        tmp.unlink(missing_ok=True)
        raise
    return digest, target
|
||||||
49
backend/tests/unit/test_storage_blob.py
Normal file
49
backend/tests/unit/test_storage_blob.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
"""Content-addressed gzip blob store (D-012)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from mimic.storage.blob import blob_path, store_blob
|
||||||
|
|
||||||
|
|
||||||
|
def test_blob_path_uses_two_byte_pairs(tmp_path) -> None:
    """The first two hex pairs of the digest become the two shard directories."""
    first, second = "ab", "cd"
    digest = first + second + "ef" * 30
    expected = tmp_path / first / second / f"{digest}.gz"
    assert blob_path(tmp_path, digest) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_blob_path_rejects_invalid_digest(tmp_path) -> None:
    """Malformed digests raise ValueError for both bad length and bad characters."""
    malformed = (
        "not-a-digest",  # wrong length and non-hex characters
        "",  # empty
        "a" * 63,  # one character short
        "a" * 65,  # one character long
        "g" * 64,  # correct length, non-hex characters
    )
    for candidate in malformed:
        with pytest.raises(ValueError, match="invalid sha256"):
            blob_path(tmp_path, candidate)
|
||||||
|
|
||||||
|
|
||||||
|
def test_store_blob_writes_gzip_and_returns_digest(tmp_path) -> None:
    """Storing a payload returns its sha256 and a gzip file that round-trips."""
    payload = b"hello world\n"
    want_digest = hashlib.sha256(payload).hexdigest()
    want_path = tmp_path / want_digest[0:2] / want_digest[2:4] / f"{want_digest}.gz"

    got_digest, got_path = store_blob(tmp_path, payload)

    assert got_digest == want_digest
    assert got_path == want_path
    with gzip.open(got_path, "rb") as stream:
        restored = stream.read()
    assert restored == payload
|
||||||
|
|
||||||
|
|
||||||
|
def test_store_blob_is_idempotent(tmp_path) -> None:
    """A second store of the same payload returns the same result and leaves mtime alone."""
    payload = b"same content"

    first = store_blob(tmp_path, payload)
    stamp = first[1].stat().st_mtime_ns
    second = store_blob(tmp_path, payload)

    # Tuple equality covers both the digest and the path.
    assert first == second
    assert second[1].stat().st_mtime_ns == stamp
|
||||||
|
|
||||||
|
|
||||||
|
def test_store_blob_dedupes_distinct_payloads(tmp_path) -> None:
    """Different payloads are stored at different paths, and both files exist.

    Despite the name, this checks that distinct content does not collide;
    same-content deduplication is covered by the idempotency test.
    """
    paths = [store_blob(tmp_path, body)[1] for body in (b"alpha", b"beta")]
    assert paths[0] != paths[1]
    for stored in paths:
        assert stored.exists()
|
||||||
Reference in New Issue
Block a user