diff --git a/backend/.env.example b/backend/.env.example index c169cb2..739dff2 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -21,3 +21,7 @@ MIMIC_CORS_ORIGINS=http://localhost:5173 # Logging MIMIC_LOG_LEVEL=DEBUG MIMIC_LOG_JSON=false + +# Storage pools (D-012) +MIMIC_BLOB_ROOT=/var/lib/mimic/blobs +MIMIC_EVIDENCE_ROOT=/var/lib/mimic/evidence diff --git a/backend/src/mimic/config.py b/backend/src/mimic/config.py index 6f47f07..052db46 100644 --- a/backend/src/mimic/config.py +++ b/backend/src/mimic/config.py @@ -3,6 +3,7 @@ from __future__ import annotations from functools import lru_cache +from pathlib import Path from typing import Literal from pydantic import Field, SecretStr, field_validator @@ -45,6 +46,10 @@ class Settings(BaseSettings): template_render_timeout_ms: int = 250 output_blob_max_bytes: int = 10 * 1024 * 1024 + # D-012: two on-disk pools. + blob_root: Path = Path("/var/lib/mimic/blobs") + evidence_root: Path = Path("/var/lib/mimic/evidence") + @field_validator("cors_origins", mode="before") @classmethod def _split_cors(cls, value: object) -> object: diff --git a/backend/src/mimic/storage/__init__.py b/backend/src/mimic/storage/__init__.py new file mode 100644 index 0000000..79d2b6c --- /dev/null +++ b/backend/src/mimic/storage/__init__.py @@ -0,0 +1,14 @@ +"""Local file-system pools (D-012). + +Two separate roots, configured via env: +- `MIMIC_BLOB_ROOT` (default `/var/lib/mimic/blobs/`) holds C2 polling output + blobs, content-addressed and gzip-compressed: `<aa>/<bb>/<sha256>.gz` where + `aa` and `bb` are the first two hex-character pairs of the digest. +- `MIMIC_EVIDENCE_ROOT` (default `/var/lib/mimic/evidence/`) holds user-uploaded + evidence files, flat layout `/.`, no + compression. 
# Content-addressed, gzip-compressed blob store (D-012).
#
# Blobs live at ``<root>/<aa>/<bb>/<digest>.gz`` where ``digest`` is the
# lowercase SHA-256 hex digest of the *uncompressed* payload and ``aa`` / ``bb``
# are its first two hex-character pairs.

import contextlib
import gzip
import hashlib
import os
import stat
import tempfile
from pathlib import Path

_SHA256_HEX_LEN = 64
_HEX_DIGITS = frozenset("0123456789abcdef")

# 0o750: owner full, group r-x, others none.
_DIR_MODE = stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP
# 0o640: owner rw, group r, others none.
_FILE_MODE = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP


def _validate_digest(sha256_hex: str) -> str:
    """Return `sha256_hex` lowercased, or raise if it is not a SHA-256 hex digest.

    Raises:
        ValueError: if the value is not a `str` of exactly 64 hexadecimal
            characters (upper- or lowercase).
    """
    if (
        not isinstance(sha256_hex, str)
        or len(sha256_hex) != _SHA256_HEX_LEN
        or not _HEX_DIGITS.issuperset(sha256_hex.lower())
    ):
        raise ValueError(f"invalid sha256 digest: {sha256_hex!r}")
    return sha256_hex.lower()


# Annotations are quoted so the signatures evaluate lazily, exactly as they do
# under `from __future__ import annotations`.
def blob_path(root: "Path | str", sha256_hex: str) -> Path:
    """Return the path of the gzip-compressed blob `<root>/<aa>/<bb>/<digest>.gz`.

    `aa` and `bb` are the first two hex-character pairs of the lowercased
    digest. The result is joined onto `root` as given, so it is absolute only
    when `root` is. Nothing is created or checked on disk.

    Raises:
        ValueError: if `sha256_hex` is not a valid SHA-256 hex digest.
    """
    digest = _validate_digest(sha256_hex)
    return Path(root) / digest[0:2] / digest[2:4] / f"{digest}.gz"


def _ensure_fanout_dirs(leaf: Path) -> None:
    """Create `<root>/<aa>/<bb>` and restrict both fan-out levels to 0750.

    `mkdir` honours the process umask, so the mode is applied explicitly with
    `chmod`. The root itself is left alone: it is provisioned by the operator.
    """
    leaf.mkdir(parents=True, exist_ok=True)
    for directory in (leaf.parent, leaf):
        os.chmod(directory, _DIR_MODE)  # noqa: S103


def store_blob(root: "Path | str", data: bytes) -> "tuple[str, Path]":
    """Write `data` (gzip-compressed) under its sha256 digest path.

    Idempotent: an existing path with the same digest is not overwritten.
    The `<aa>` and `<aa>/<bb>` directories are set to `0750` and the blob file
    to `0640`, so only the owner and the owning group can read.

    The blob is written to a uniquely named temporary file in the destination
    directory and then atomically renamed into place, so concurrent writers of
    the same payload never share a temp file and readers never observe a
    partially written blob. The temp file is removed if anything fails.

    The gzip header is written with a zero mtime and no file name, so the
    stored bytes depend only on `data`.

    Returns:
        `(digest, path)`: the lowercase SHA-256 hex digest of `data` and the
        path of the stored `.gz` file.
    """
    digest = hashlib.sha256(data).hexdigest()
    target = blob_path(root, digest)
    if target.exists():
        return digest, target

    _ensure_fanout_dirs(target.parent)

    # Same directory as the target so the final rename stays on one filesystem.
    fd, tmp_name = tempfile.mkstemp(
        dir=target.parent, prefix=f".{digest}.", suffix=".tmp"
    )
    try:
        with os.fdopen(fd, "wb") as raw:
            with gzip.GzipFile(filename="", mode="wb", fileobj=raw, mtime=0) as fh:
                fh.write(data)
        os.chmod(tmp_name, _FILE_MODE)
        os.replace(tmp_name, target)
    except BaseException:
        # Do not leave a stray temp file behind, whatever interrupted the write.
        with contextlib.suppress(OSError):
            os.unlink(tmp_name)
        raise
    return digest, target