# Metamorph/backend/app/api/diag.py

"""Operational diagnostics. No auth in v1 (M0/M1 only expose non-sensitive
counts and the current Alembic revision).

The `/diag/reset` endpoint is a **test-suite hook** — it is refused unless
`APP_ENV` is `dev` or `test`, and is the bedrock of the e2e suite (clean DB +
freshly minted install token).
"""

from __future__ import annotations

import logging
import shutil
from pathlib import Path

from flask import Blueprint, abort, jsonify
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError

from app.core.config import settings
from app.core.install_token import regenerate_install_token
from app.db.session import get_engine
from app.services.detection_levels import seed_detection_levels

# Flask blueprint for the diagnostics routes; every route registered on it is
# served under the `/diag` URL prefix.
bp = Blueprint("diag", __name__, url_prefix="/diag")
# Module logger. Events are emitted under the "metamorph.diag" logger name and
# use dotted event identifiers as the log message.
log = logging.getLogger("metamorph.diag")


@bp.get("/db")
def db_diag():
    """Report database reachability, the Alembic revision and the table count.

    Returns:
        On success, a JSON body with ``reachable: true``, ``alembic_revision``
        (``None`` when ``alembic_version`` holds no row) and ``table_count``
        (number of base tables in the ``public`` schema).
        If any SQLAlchemy error occurs while querying, a JSON body with
        ``reachable: false`` and a 503 status; the error is logged as a warning.
    """
    revision_query = text("SELECT version_num FROM alembic_version")
    table_count_query = text(
        "SELECT count(*) FROM information_schema.tables "
        "WHERE table_schema='public' AND table_type='BASE TABLE'"
    )

    try:
        with get_engine().connect() as connection:
            # `.scalar()` tolerates an empty alembic_version table (-> None);
            # the count query always yields exactly one row.
            current_revision = connection.execute(revision_query).scalar()
            public_tables = connection.execute(table_count_query).scalar_one()
    except SQLAlchemyError as exc:
        log.warning("metamorph.diag.db_unreachable", extra={"error": str(exc)})
        failure = {"reachable": False, "error": "database_unreachable"}
        return jsonify(failure), 503

    report = {
        "reachable": True,
        "alembic_revision": current_revision,
        "table_count": int(public_tables),
    }
    return jsonify(report)


def _wipe_evidence_dir(evidence_root: Path) -> None:
    """Best-effort removal of every entry directly under *evidence_root*.

    The root directory itself is kept. Nothing is raised: a missing root is
    silently ignored, and any other filesystem error (root is not a directory,
    permission denied, a child that cannot be removed) is logged as a warning
    so that a cleanup hiccup never fails the reset request.
    """
    try:
        # Materialise the listing up front so that listing errors are caught
        # here and we do not mutate the directory while iterating it.
        children = list(evidence_root.iterdir())
    except FileNotFoundError:
        # Nothing has been uploaded yet — nothing to clean.
        return
    except OSError as e:
        log.warning(
            "metamorph.diag.evidence_cleanup_failed",
            extra={"path": str(evidence_root), "error": str(e)},
        )
        return

    for child in children:
        # Symlinks are unlinked, never followed — a hostile or accidental
        # symlink inside the evidence dir must NOT cause rmtree to recurse
        # into an unrelated tree.
        try:
            if child.is_symlink() or not child.is_dir():
                child.unlink(missing_ok=True)
            else:
                shutil.rmtree(child)
        except OSError as e:
            log.warning(
                "metamorph.diag.evidence_cleanup_failed",
                extra={"path": str(child), "error": str(e)},
            )


@bp.post("/reset")
def reset_test_state():
    """TEST-SUITE HOOK: wipe application data and mint a fresh install token.

    Truncates the auth/RBAC/settings, mission, template and MITRE tables,
    re-runs the detection-level seed, regenerates the install token and resets
    the rate limiter. When `APP_ENV=test` the contents of the evidence
    directory are removed as well.

    Refuses with 403 unless `APP_ENV` is `dev` or `test`. Used by the
    Playwright suite to start each auth scenario from a deterministic state.

    Returns:
        ``{"reset": true, "install_token": ...}`` on success, or
        ``{"reset": false, "error": "database_error"}`` with a 500 status when
        the truncate transaction fails.
    """
    # NOTE: this endpoint is the test-suite reset hook. Allowed in `dev` too so
    # the e2e suite can run against a normal `make up` stack, but in dev it is
    # destructive — equivalent to `make clean` for the auth tables. Production
    # (APP_ENV=prod/staging) is locked out.
    if settings.APP_ENV not in ("dev", "test"):
        abort(403, description="diag/reset is only available in dev/test")
    if settings.APP_ENV == "dev":
        log.warning("metamorph.diag.reset_in_dev_environment")

    try:
        # `begin()` runs all truncates in one transaction: either every table
        # group is wiped or none is.
        with get_engine().begin() as conn:
            # Auth + RBAC + settings reset.
            conn.execute(
                text(
                    "TRUNCATE users, refresh_tokens, invitations, invitation_groups, "
                    "user_groups, settings, groups RESTART IDENTITY CASCADE"
                )
            )
            # Mission catalogue reset (M6). Truncated before the template tables
            # below because `mission_scenarios.source_scenario_template_id` and
            # `mission_tests.source_test_template_id` are ON DELETE SET NULL — a
            # cascade-truncate of templates would attempt to null those columns
            # and stall on the constraint check. Wiping the mission tables first
            # avoids that round-trip; cascades from `missions` then take care of
            # members, scenarios, tests, mitre_tags, categories.
            conn.execute(
                text(
                    "TRUNCATE mission_test_mitre_tags, mission_tests, "
                    "mission_scenarios, mission_categories, mission_members, "
                    "missions RESTART IDENTITY CASCADE"
                )
            )
            # Template catalogue reset (M5). The MITRE truncate below cascades to
            # the polymorphic tag join, but the template rows themselves must be
            # wiped first because `scenario_template_tests.test_template_id` is
            # ON DELETE RESTRICT.
            conn.execute(
                text(
                    "TRUNCATE scenario_template_tests, scenario_templates, "
                    "test_template_mitre_tags, test_templates "
                    "RESTART IDENTITY CASCADE"
                )
            )
            # MITRE reference reset — kept in sync with `settings` so a freshly
            # reset stack has `GET /mitre/status` and `GET /mitre/tactics` agree
            # ("no data, no last_sync"). The e2e suite re-syncs via /mitre/sync
            # when it needs catalogue data.
            conn.execute(
                text(
                    "TRUNCATE mitre_technique_tactics, mitre_subtechniques, "
                    "mitre_techniques, mitre_tactics RESTART IDENTITY CASCADE"
                )
            )
            # Detection levels (M7) are reference data seeded at boot — they
            # are deliberately NOT truncated here; the seed is re-run after the
            # transaction instead.
    except SQLAlchemyError as e:
        log.error("metamorph.diag.reset_failed", extra={"error": str(e)})
        return jsonify({"reset": False, "error": "database_error"}), 500

    # M7: wipe the evidence directory so an e2e suite that uploads bytes does
    # not accumulate files across runs. Only in `test`; in `dev` we keep the
    # files (operator likely wants to inspect what they uploaded by hand).
    if settings.APP_ENV == "test":
        _wipe_evidence_dir(Path(settings.EVIDENCE_DIR))

    # Detection levels were preserved during the wipe; re-run the seed to
    # cover the off-chance an operator has deleted some rows manually.
    seed_detection_levels()

    token = regenerate_install_token()

    # Clear the in-memory rate-limit counters so the e2e suite that follows can
    # log in repeatedly without hitting `/auth/login`/`/auth/refresh` limits.
    # The limiter uses `memory://` in dev (cf. `app/core/rate_limit.py`).
    try:
        from app.core.rate_limit import limiter  # noqa: PLC0415 — avoid import cycle

        if limiter.enabled:
            limiter.reset()
    except Exception as e:  # noqa: BLE001
        # Deliberately broad: a limiter problem must not fail an otherwise
        # successful reset.
        log.warning("metamorph.diag.rate_limit_reset_failed", extra={"error": str(e)})

    log.warning("metamorph.diag.reset_completed")
    return jsonify({"reset": True, "install_token": token})