"""Blue-side evidence storage service (M7). Files live under `${EVIDENCE_DIR}///`. The path is content-addressed: re-uploading byte-identical content into the same test reuses the existing file on disk and inserts a fresh row (so we keep history of who uploaded what without duplicating bytes). The upload pipeline streams to a tmpfile inside the same per-test directory (`atomic move` semantics on POSIX), computing the SHA256 chunk-by-chunk and aborting when the byte count crosses `MAX_BYTES`. We refuse files whose extension is not in the whitelist; MIME is also validated but with a more permissive fallback (browsers and `file(1)` disagree on `.evtx`). Soft delete only flips `deleted_at`. The bytes are kept on disk so a future admin `/admin/purge` (M12) can remove them physically. Until then, the path is still queryable but the API hides it from non-admins. """ from __future__ import annotations import hashlib import logging import os import re import tempfile import uuid from datetime import datetime, timezone from pathlib import Path from typing import BinaryIO from app.core.config import settings from app.db.session import session_scope from app.models.auth import User from app.models.evidence import EvidenceFile from app.models.mission import MissionScenario, MissionTest from app.services.mission_tests import ( EvidenceView, _ensure_mission_visible, _load_test, _to_evidence_view, _touch, ) log = logging.getLogger("metamorph.evidence") # --------------------------------------------------------------------------- # # Validation rules # --------------------------------------------------------------------------- # MAX_BYTES: int = 25 * 1024 * 1024 # 25 MB per spec §M7 # Filename extensions accepted at the upload boundary. Lowercased; the upload # handler downcases the original filename's tail before comparing. 
ALLOWED_EXTS: frozenset[str] = frozenset(
    {
        ".png",
        ".jpg",
        ".jpeg",
        ".pdf",
        ".txt",
        ".log",
        ".json",
        ".csv",
        ".evtx",
        ".zip",
    }
)

# Accept a permissive MIME set so common browser/OS combos clear validation.
# `.evtx` is canonically `application/octet-stream`; some Windows clients send
# `application/x-msexcel` for csv; etc. We trust the extension first and use
# the MIME as a secondary signal.
# NOTE(review): `application/x-msexcel` is mentioned above but is NOT in the
# set below, so such uploads are currently rejected — confirm whether that is
# intended before widening the whitelist.
ALLOWED_MIMES: frozenset[str] = frozenset(
    {
        "image/png",
        "image/jpeg",
        "image/jpg",
        "application/pdf",
        "text/plain",
        "text/csv",
        "application/csv",
        "application/json",
        "application/octet-stream",
        "application/zip",
        "application/x-zip-compressed",
    }
)

# MIME assumed when the client sends none (empty/None `mime`).
_DEFAULT_MIME = "application/octet-stream"

# Exact shape of a lowercase hex SHA-256 digest (what `hashlib` emits).
_SHA256_RE = re.compile(r"[0-9a-f]{64}")


# --------------------------------------------------------------------------- #
# Exceptions
# --------------------------------------------------------------------------- #


class EvidenceNotFound(Exception):
    """Evidence row missing, soft-deleted, or not visible to the viewer."""


class EvidenceValidationError(Exception):
    """Extension/MIME/size invalid at the upload boundary.

    Attributes:
        code: short machine-readable reason. This module raises
            ``missing_extension``, ``unsupported_extension``,
            ``unsupported_mime`` and ``too_large``.
    """

    def __init__(self, code: str, message: str) -> None:
        super().__init__(message)
        self.code = code


class EvidenceStorageError(Exception):
    """Disk I/O failure during upload — bytes left on disk are best-effort cleaned."""


# --------------------------------------------------------------------------- #
# Helpers
# --------------------------------------------------------------------------- #


def _evidence_dir() -> Path:
    """Return `settings.EVIDENCE_DIR` as a resolved absolute path."""
    return Path(settings.EVIDENCE_DIR).resolve()


def _test_dir(mission_id: uuid.UUID, test_id: uuid.UUID) -> Path:
    """Return the per-test storage directory `<root>/<mission_id>/<test_id>`.

    Only computes the path; nothing is created on disk here.

    Raises:
        EvidenceStorageError: if the evidence root is a filesystem root.
    """
    root = _evidence_dir()
    # Refuse to lay down per-mission directories at filesystem roots — an
    # operator who set EVIDENCE_DIR=/ would otherwise write into / itself.
    if root in (Path("/"), Path(root.anchor)):
        raise EvidenceStorageError("EVIDENCE_DIR cannot be a filesystem root")
    return root / str(mission_id) / str(test_id)


def _sniff_ext(filename: str) -> str:
    """Lowercased extension including the leading dot, or '' if none.

    Directory components (both `/` and `\\` separators) are stripped first.
    A name that merely ends with a dot (``"report."``) has no extension and
    yields ``''`` rather than a bare ``"."``.
    """
    name = filename.rsplit("/", 1)[-1].rsplit("\\", 1)[-1]
    _, dot, tail = name.rpartition(".")
    if not dot or not tail:
        return ""
    return "." + tail.lower()


def _normalise_mime(mime: str) -> str:
    """Lowercase `mime`, drop any `;parameter` suffix and surrounding blanks.

    An empty/None value falls back to `application/octet-stream`.
    """
    return (mime or _DEFAULT_MIME).lower().split(";", 1)[0].strip()


def _validate_meta(filename: str, mime: str) -> str:
    """Validate the upload's filename extension and MIME type.

    Returns:
        The lowercased extension (with leading dot) to use for the stored file.

    Raises:
        EvidenceValidationError: with code ``missing_extension``,
            ``unsupported_extension`` or ``unsupported_mime``.
    """
    ext = _sniff_ext(filename)
    if not ext:
        raise EvidenceValidationError(
            "missing_extension", "filename must have an extension"
        )
    if ext not in ALLOWED_EXTS:
        raise EvidenceValidationError(
            "unsupported_extension", f"extension {ext!r} is not allowed"
        )
    normalised_mime = _normalise_mime(mime)
    if normalised_mime not in ALLOWED_MIMES:
        raise EvidenceValidationError(
            "unsupported_mime", f"mime {normalised_mime!r} is not allowed"
        )
    return ext


def _discard_quietly(path: Path) -> None:
    """Best-effort removal of `path`; a failing unlink is deliberately ignored."""
    try:
        path.unlink(missing_ok=True)
    except OSError:
        pass


def _stream_to_tmpfile(
    src: BinaryIO, target_dir: Path
) -> tuple[Path, str, int]:
    """Stream the upload into a tmpfile under `target_dir`, capping size.

    `target_dir` is created if missing. Data is read in 64 KiB chunks and
    hashed as it is written.

    Returns (tmp_path, sha256_hex, total_bytes). Raises
    `EvidenceValidationError("too_large", …)` once the cumulative count goes
    above `MAX_BYTES`. The tmpfile is *always* removed on error — including
    interpreter-level interruptions (`KeyboardInterrupt`, `SystemExit`,
    cancellation), which is why the handler below catches `BaseException`.
    """
    target_dir.mkdir(parents=True, exist_ok=True)
    fd, tmp_name = tempfile.mkstemp(prefix=".upload-", dir=str(target_dir))
    tmp_path = Path(tmp_name)
    hasher = hashlib.sha256()
    total = 0
    try:
        with os.fdopen(fd, "wb") as fh:
            while chunk := src.read(64 * 1024):
                total += len(chunk)
                if total > MAX_BYTES:
                    raise EvidenceValidationError(
                        "too_large",
                        f"file exceeds the {MAX_BYTES} byte limit",
                    )
                hasher.update(chunk)
                fh.write(chunk)
        return tmp_path, hasher.hexdigest(), total
    except BaseException:
        # Always re-raised: we only make sure no `.upload-*` file is left.
        _discard_quietly(tmp_path)
        raise


# --------------------------------------------------------------------------- #
# Public API
# --------------------------------------------------------------------------- #


def add_evidence(
    mission_id: uuid.UUID,
    test_id: uuid.UUID,
    *,
    file_stream: BinaryIO,
    original_filename: str,
    mime: str,
    viewer_id: uuid.UUID,
    viewer_is_admin: bool,
) -> EvidenceView:
    """Persist the upload and return a view of the new evidence row.

    Pre-conditions:
    - The caller already verified that the viewer holds `mission.write_blue_fields`.
    - Mission + test visibility is enforced here (404, not 403).

    Disk layout:
        ${EVIDENCE_DIR}/<mission_id>/<test_id>/<sha256><ext>

    Raises:
        EvidenceValidationError: bad extension, bad MIME, or upload too large.
        EvidenceStorageError: EVIDENCE_DIR is a filesystem root, the digest is
            malformed, or moving the tmpfile into place failed.
        Whatever `_ensure_mission_visible` / `_load_test` raise when the
        mission or test is not visible to the viewer.
    """
    ext = _validate_meta(original_filename, mime)
    target_dir = _test_dir(mission_id, test_id)

    # Visibility/existence check BEFORE we touch disk.
    with session_scope() as s:
        _ensure_mission_visible(s, mission_id, viewer_id, viewer_is_admin)
        _load_test(s, mission_id, test_id)  # raises MissionTestNotFound on miss

    tmp_path, sha256, size_bytes = _stream_to_tmpfile(file_stream, target_dir)

    # Defence in depth — the hash comes from hashlib but if any caller ever
    # passes pre-computed bytes we want to fail loudly rather than write to a
    # path like `..something.evtx`.
    if not _SHA256_RE.fullmatch(sha256):
        _discard_quietly(tmp_path)
        raise EvidenceStorageError("computed sha256 is malformed")

    final_path = target_dir / f"{sha256}{ext}"
    try:
        if final_path.exists():
            # Same bytes already on disk — drop the tmp and reuse the canonical path.
            tmp_path.unlink(missing_ok=True)
        else:
            # `os.replace` is the atomic rename primitive on POSIX (and the
            # documented atomic rename on Windows when src/dst live on the
            # same volume). We stage the tmpfile in `target_dir` so it
            # always shares a filesystem with the destination.
            os.replace(str(tmp_path), str(final_path))
    except OSError as e:
        _discard_quietly(tmp_path)
        log.warning(
            "metamorph.evidence.storage_failed",
            extra={
                "mission_id": str(mission_id),
                "test_id": str(test_id),
                "error": str(e),
            },
        )
        raise EvidenceStorageError(str(e)) from e

    # NOTE(review): if anything below raises, a file that was just moved into
    # place stays on disk without a row; nothing in this module removes it.
    with session_scope() as s:
        # Re-load + double-check visibility (defence in depth: the membership
        # set could have changed between the pre-check and now).
        _ensure_mission_visible(s, mission_id, viewer_id, viewer_is_admin)
        test = _load_test(s, mission_id, test_id)
        ev = EvidenceFile(
            mission_test_id=test.id,
            sha256=sha256,
            mime=_normalise_mime(mime),
            size_bytes=size_bytes,
            storage_path=str(final_path),
            original_filename=original_filename[:255],
            uploaded_by_user_id=viewer_id,
            uploaded_at=datetime.now(tz=timezone.utc),
        )
        s.add(ev)
        _touch(test, viewer_id)
        s.flush()
        s.refresh(ev)
        uploader = s.get(User, viewer_id)
        log.info(
            "metamorph.evidence.added",
            extra={
                "evidence_id": str(ev.id),
                "mission_id": str(mission_id),
                "test_id": str(test_id),
                "sha256": sha256,
                "size_bytes": size_bytes,
                "mime": ev.mime,
            },
        )
        return _to_evidence_view(ev, uploader)


def _resolve_evidence_chain(
    s, evidence_id: uuid.UUID
) -> tuple[EvidenceFile, MissionTest, MissionScenario] | None:
    """Walk evidence → test → scenario, returning None if any link is missing or deleted."""
    ev = s.get(EvidenceFile, evidence_id)
    if ev is None or ev.deleted_at is not None:
        return None
    test = s.get(MissionTest, ev.mission_test_id)
    if test is None or test.deleted_at is not None:
        return None
    scenario = s.get(MissionScenario, test.scenario_id)
    if scenario is None or scenario.deleted_at is not None:
        return None
    return ev, test, scenario


def _load_visible_evidence(
    s,
    evidence_id: uuid.UUID,
    viewer_id: uuid.UUID,
    viewer_is_admin: bool,
) -> tuple[EvidenceFile, MissionTest, MissionScenario]:
    """Resolve the evidence chain and enforce mission visibility for the viewer.

    Raises:
        EvidenceNotFound: the chain is broken/soft-deleted, or the visibility
            check raised.
    """
    chain = _resolve_evidence_chain(s, evidence_id)
    if chain is None:
        raise EvidenceNotFound()
    scenario = chain[2]
    try:
        _ensure_mission_visible(s, scenario.mission_id, viewer_id, viewer_is_admin)
    except Exception as e:
        # NOTE(review): every exception from the visibility check — not only a
        # "not visible" outcome — is reported as not-found. Narrowing this
        # needs the concrete exception types of `_ensure_mission_visible`,
        # which are not visible from this module.
        raise EvidenceNotFound() from e
    return chain


def get_evidence(
    evidence_id: uuid.UUID,
    *,
    viewer_id: uuid.UUID,
    viewer_is_admin: bool,
) -> EvidenceView:
    """Read a single evidence record. Membership-aware (404 on miss/forbidden).

    Raises:
        EvidenceNotFound: row missing, soft-deleted, or not visible.
    """
    with session_scope() as s:
        ev, _, _ = _load_visible_evidence(s, evidence_id, viewer_id, viewer_is_admin)
        uploader = s.get(User, ev.uploaded_by_user_id) if ev.uploaded_by_user_id else None
        return _to_evidence_view(ev, uploader)


def get_evidence_for_download(
    evidence_id: uuid.UUID,
    *,
    viewer_id: uuid.UUID,
    viewer_is_admin: bool,
) -> tuple[EvidenceView, Path]:
    """Return view + on-disk path. Raises EvidenceNotFound if the bytes are gone.

    "Gone" means the stored path is not a regular file: a missing path and a
    path that points at a directory are both treated as not-found.
    """
    with session_scope() as s:
        ev, _, _ = _load_visible_evidence(s, evidence_id, viewer_id, viewer_is_admin)
        uploader = s.get(User, ev.uploaded_by_user_id) if ev.uploaded_by_user_id else None
        view = _to_evidence_view(ev, uploader)
        # Read the column while the session is still open.
        path = Path(ev.storage_path)

    if not path.is_file():
        log.warning(
            "metamorph.evidence.bytes_missing",
            extra={"evidence_id": str(evidence_id), "path": str(path)},
        )
        raise EvidenceNotFound()
    return view, path


def soft_delete_evidence(
    evidence_id: uuid.UUID,
    *,
    viewer_id: uuid.UUID,
    viewer_is_admin: bool,
) -> None:
    """Mark an evidence row deleted. Disk bytes are kept until admin purge (M12).

    Raises:
        EvidenceNotFound: row missing, already soft-deleted, or not visible.
    """
    with session_scope() as s:
        ev, test, scenario = _load_visible_evidence(
            s, evidence_id, viewer_id, viewer_is_admin
        )
        ev.deleted_at = datetime.now(tz=timezone.utc)
        _touch(test, viewer_id)
        s.flush()
        log.info(
            "metamorph.evidence.soft_deleted",
            extra={
                "evidence_id": str(evidence_id),
                "mission_id": str(scenario.mission_id),
            },
        )


__all__ = [
    "MAX_BYTES",
    "ALLOWED_EXTS",
    "ALLOWED_MIMES",
    "EvidenceNotFound",
    "EvidenceValidationError",
    "EvidenceStorageError",
    "add_evidence",
    "get_evidence",
    "get_evidence_for_download",
    "soft_delete_evidence",
]