fix(m4): code-review pass — SSRF allowlist + advisory lock + typed contract

Six post-code-review fixes, applied before opening the PR per project
workflow (spec-review + code-review both gate the merge):

1. SSRF allowlist on `/mitre/sync`. Host must be in MITRE_ALLOWED_HOSTS
   (defaults to `raw.githubusercontent.com`, env-overridable). Closes "admin
   holding `mitre.sync` pivots api container at 169.254.169.254 / internal
   mirrors" via a typo'd URL. New `MitreSourceForbidden` → 400
   `source_forbidden`; checked at the top of `_download()` so it kicks in
   before any I/O.

2. `pg_advisory_xact_lock(hashtext('mitre.seed'))` at the top of the seed
   transaction. Two concurrent `/mitre/sync` requests now serialise across
   the DELETE+INSERT of `mitre_technique_tactics`; previously they could
   both wipe the M2M and one would fail the unique constraint on re-insert.

3. Typed SyncResult contract. Pydantic `SyncResultOut` on the Flask side
   `model_validate`s the dict before returning — single source of truth
   for the response shape, mirrored by a `MitreSyncResult` TS interface
   (next commit). The `as Record<string, unknown>` + `as { duration_ms }`
   cast in MitrePage is gone.

4. N+1 in dotted sub-technique fallback removed. Built
   `{external_id → technique_id}` once at function entry. Currently a
   no-op against MITRE official (0 orphans), but a latent footgun for
   partial / older bundles.

5. `SETTING_VERSION` cleared explicitly when `source != MITRE_DEFAULT_URL`.
   Previously it kept the stale pin label, so `/mitre/status` lied after
   a custom-URL re-sync.

6. `/mitre/sync` 500s no longer echo `str(e)` to the client — URLError /
   psycopg / Pydantic text now lives in the JSON log only. Public response
   stays `{"error": "internal_error"}`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Knacky
2026-05-12 19:19:11 +02:00
parent 7a69f10f3e
commit 63b48addc0
2 changed files with 82 additions and 21 deletions

View File

@@ -9,6 +9,7 @@ from __future__ import annotations
import logging
from flask import Blueprint, jsonify, request
from pydantic import BaseModel
from sqlalchemy import func, or_, select
from app.core.auth_decorators import require_auth, require_perm
@@ -16,6 +17,21 @@ from app.db.session import session_scope
from app.models.mitre import MitreSubtechnique, MitreTactic, MitreTechnique
from app.services import mitre_seed as mitre_seed_svc
class SyncResultOut(BaseModel):
"""Response schema for `POST /mitre/sync`. Mirrors `SeedResult.as_dict()`."""
tactics_upserted: int
techniques_upserted: int
subtechniques_upserted: int
subtechniques_skipped_orphan: int
technique_tactic_links: int
version: str | None
source: str
started_at: str
finished_at: str
duration_ms: int
bp = Blueprint("mitre", __name__, url_prefix="/mitre")
log = logging.getLogger("metamorph.api.mitre")
@@ -248,7 +264,8 @@ def sync():
Custom `source` URLs MUST be paired with either `expected_sha256` (integrity
guarantee) or `allow_unverified: true` (explicit opt-out) — the seed service
will raise otherwise.
will raise otherwise. The host is allowlisted (defaults to
raw.githubusercontent.com, overridable via the MITRE_ALLOWED_HOSTS env).
"""
payload = request.get_json(silent=True) or {}
source = payload.get("source") # optional URL override
@@ -261,12 +278,19 @@ def sync():
or (mitre_seed_svc.MITRE_DEFAULT_SHA256 if source is None else None),
allow_unverified=allow_unverified,
)
except mitre_seed_svc.MitreSourceForbidden as e:
return jsonify({"error": "source_forbidden", "message": str(e)}), 400
except mitre_seed_svc.MitreChecksumMismatch as e:
return jsonify({"error": "checksum_mismatch", "message": str(e)}), 502
except mitre_seed_svc.MitreSeedError as e:
return jsonify({"error": "seed_failed", "message": str(e)}), 502
except Exception as e: # noqa: BLE001
except Exception: # noqa: BLE001
# Do NOT leak the internal error string to the client (URLError stack,
# DB driver text). The stack lands in our JSON logs.
log.exception("metamorph.api.mitre.sync_failed")
return jsonify({"error": "internal_error", "message": str(e)}), 500
log.warning("metamorph.api.mitre.sync_done", extra=result.as_dict())
return jsonify(result.as_dict())
return jsonify({"error": "internal_error"}), 500
# Validate via the Pydantic Out model so the response contract is
# explicit (single source of truth shared with the TS interface).
payload_out = SyncResultOut.model_validate(result.as_dict()).model_dump()
log.info("metamorph.api.mitre.sync_done", extra=payload_out)
return jsonify(payload_out)

View File

@@ -29,7 +29,7 @@ from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable
from sqlalchemy import delete, select
from sqlalchemy import delete, select, text as sql_text
from app.db.session import session_scope
from app.models.mitre import (
@@ -59,6 +59,18 @@ MITRE_DEFAULT_SHA256 = "df520ea0775a57db7bff760145b02fed89290802913e056b7ed5970b
MITRE_BUNDLE_CACHE_PATH = Path(os.environ.get("MITRE_CACHE_DIR", "/data/mitre"))
MITRE_DOWNLOAD_TIMEOUT_SECONDS = 120
# Hosts authorised as a source for a MITRE sync. An admin holding `mitre.sync`
# could otherwise pivot the in-container HTTP fetch to internal services
# (169.254.169.254, db, internal mirrors). Override via the `MITRE_ALLOWED_HOSTS`
# env (comma-separated) when running against a private mirror.
MITRE_ALLOWED_HOSTS: frozenset[str] = frozenset(
h.strip()
for h in os.environ.get(
"MITRE_ALLOWED_HOSTS", "raw.githubusercontent.com"
).split(",")
if h.strip()
)
# Settings keys used to expose the seed metadata to the operator UI/CLI.
SETTING_LAST_SYNC = "mitre_last_sync"
SETTING_VERSION = "mitre_version"
@@ -76,6 +88,10 @@ class MitreChecksumMismatch(MitreSeedError):
pass
class MitreSourceForbidden(MitreSeedError):
"""The provided source URL points to a host outside the allowlist."""
@dataclass
class ParsedBundle:
tactics: list[dict] = field(default_factory=list)
@@ -123,6 +139,18 @@ def _is_url(source: str) -> bool:
return parsed.scheme in ("http", "https")
def _ensure_host_allowed(url: str) -> None:
"""Raise MitreSourceForbidden if the URL targets a non-allowlisted host."""
parsed = urllib.parse.urlparse(url)
if parsed.scheme not in ("http", "https"):
raise MitreSourceForbidden(f"unsupported URL scheme: {parsed.scheme!r}")
host = (parsed.hostname or "").lower()
if host not in MITRE_ALLOWED_HOSTS:
raise MitreSourceForbidden(
f"host {host!r} not in MITRE_ALLOWED_HOSTS={sorted(MITRE_ALLOWED_HOSTS)}"
)
def _sha256_of(path: Path) -> str:
h = hashlib.sha256()
with path.open("rb") as f:
@@ -132,6 +160,7 @@ def _sha256_of(path: Path) -> str:
def _download(url: str, dest: Path, *, expected_sha256: str | None = None) -> Path:
_ensure_host_allowed(url)
dest.parent.mkdir(parents=True, exist_ok=True)
tmp = dest.with_suffix(dest.suffix + ".part")
log.info("metamorph.mitre.download.start", extra={"url": url, "dest": str(dest)})
@@ -331,8 +360,18 @@ def _upsert_subtechniques(
subtechniques: Iterable[dict],
stix_to_tech_id: dict,
) -> tuple[int, int]:
"""Returns (n_upserted, n_skipped_orphans)."""
"""Returns (n_upserted, n_skipped_orphans).
`n_upserted` is the count of rows whose state was applied (INSERT or
UPDATE) — matches Postgres upsert semantics.
"""
existing = {sb.external_id: sb for sb in s.scalars(select(MitreSubtechnique)).all()}
# Pre-index techniques by external_id so the dotted-id fallback doesn't
# issue N+1 SELECTs (was a latent footgun for partial-bundle re-syncs).
parent_by_external: dict[str, object] = {
t.external_id: t.id
for t in s.scalars(select(MitreTechnique)).all()
}
n_upserted = 0
n_skipped = 0
for sb in subtechniques:
@@ -342,17 +381,7 @@ def _upsert_subtechniques(
# Fall back to the dotted external_id convention (T1003.001 → T1003).
m = re.match(r"^(T\d+)\.\d+$", sb["external_id"])
if m:
parent_ext = m.group(1)
# We don't have a parent-by-external-id map here; query.
parent_row = next(
iter(
s.scalars(
select(MitreTechnique).where(MitreTechnique.external_id == parent_ext)
).all()
),
None,
)
parent_id = parent_row.id if parent_row else None
parent_id = parent_by_external.get(m.group(1))
if parent_id is None:
log.warning(
"metamorph.mitre.orphan_subtechnique",
@@ -433,6 +462,13 @@ def seed_mitre(
)
with session_scope() as s:
# Serialize concurrent /mitre/sync calls. The lock is transaction-scoped
# (released automatically at COMMIT/ROLLBACK), so a second sync arriving
# while the first is mid-DELETE+INSERT of `mitre_technique_tactics`
# blocks until the first commits. Avoids the unique-constraint race the
# code-reviewer flagged. hashtext() is stable across sessions.
s.execute(sql_text("SELECT pg_advisory_xact_lock(hashtext('mitre.seed'))"))
short_to_tactic_id, n_tactics = _upsert_tactics(s, parsed.tactics)
stix_to_tech_id, n_techs, n_links = _upsert_techniques(
s, parsed.techniques, short_to_tactic_id
@@ -441,9 +477,10 @@ def seed_mitre(
finished_at = datetime.now(tz=timezone.utc)
_upsert_setting(s, SETTING_LAST_SYNC, finished_at.isoformat())
# If the URL is the pinned one, we know the version; otherwise leave None.
# `version` reflects the known pin only when seeded from MITRE_DEFAULT_URL;
# otherwise we explicitly clear it so /mitre/status doesn't lie about a
# stale version after a custom-URL re-sync.
version = MITRE_VERSION if source_label == MITRE_DEFAULT_URL else None
if version:
_upsert_setting(s, SETTING_VERSION, version)
_upsert_setting(s, SETTING_SOURCE_URL, source_label)