"""Unified fixtures loader for the AI-endpoints smoke test.

Single source of truth for transforming the CSVs under data/ into
ready-to-POST API payloads. The 11 indexer extraction fields live in
data/indexing/TestMVP2_extraction_fields.csv (one row per field, with
the `examples` column JSON-encoded so it preserves the list shape).
"""
from __future__ import annotations

import json
from dataclasses import dataclass, replace
from pathlib import Path

import pandas as pd


@dataclass(frozen=True)
class Fixtures:
    """Immutable bundle of request-ready data for the AI endpoints.

    Attributes:
        papers: One dict per citation with keys ``id``, ``title``,
            ``abstract`` and ``citation``.
        criteria: One dict per screening criterion with keys ``name``,
            ``type`` and ``value``.
        questions: Screening questions as plain strings.
        project_name: Name of the project.
        project_description: Free-text description of the project.
        indexer_records: One dict per indexing record with keys ``ID``,
            ``Title`` and ``Abstract``.
        indexer_fields: One dict per extraction field with keys ``name``,
            ``description``, ``data_type_primary``, ``examples``,
            ``examples_mode`` and ``depth``.
        country_values: Country strings used for /v1/indexer/group-tags.
    """

    papers: list[dict]
    criteria: list[dict]
    questions: list[str]
    project_name: str
    project_description: str
    indexer_records: list[dict]
    indexer_fields: list[dict]
    country_values: list[str]


def _read_csv(path: Path) -> pd.DataFrame:
    """Load one fixture CSV into a DataFrame.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark
    does not end up inside the first column name.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if path.exists():
        return pd.read_csv(path, encoding="utf-8-sig")
    raise FileNotFoundError(f"Required fixture CSV not found: {path}")


def _load_indexer_fields(path: Path) -> list[dict]:
    """Load the indexer extraction-field definitions from a CSV.

    Each CSV row becomes one dict with the keys ``name``, ``description``,
    ``data_type_primary``, ``examples``, ``examples_mode`` and ``depth``.
    The ``examples`` cell is decoded from JSON; every other cell is
    rendered as text.

    A blank cell is read by pandas as NaN. Blank text cells are returned as
    ``""`` (not the string ``"nan"``) and a blank ``examples`` cell is
    returned as ``[]`` (``json.loads`` would otherwise fail with a
    ``TypeError`` on the float NaN).

    Args:
        path: Location of the extraction-fields CSV.

    Returns:
        One dict per CSV row, in file order.

    Raises:
        FileNotFoundError: If *path* does not exist.
        KeyError: If one of the expected columns is absent.
        json.JSONDecodeError: If a non-blank ``examples`` cell is not
            valid JSON.
    """

    def as_text(value: object) -> str:
        # NaN stringifies to "nan", so missing cells need an explicit check.
        return "" if pd.isna(value) else str(value)

    df = _read_csv(path)
    fields: list[dict] = []
    for _, row in df.iterrows():
        raw_examples = row["examples"]
        fields.append({
            "name": as_text(row["name"]),
            "description": as_text(row["description"]),
            "data_type_primary": as_text(row["data_type_primary"]),
            "examples": [] if pd.isna(raw_examples) else json.loads(raw_examples),
            "examples_mode": as_text(row["examples_mode"]),
            "depth": as_text(row["depth"]),
        })
    return fields


def _cell_text(value: object, default: str = "") -> str:
    """Render one CSV cell as text, substituting *default* for a missing cell.

    pandas represents a blank cell as NaN, which is truthy and stringifies
    to ``"nan"``; neither ``value or ""`` nor a bare ``str(value)`` catches
    it, so the missing-value check has to be explicit.
    """
    if value is None or pd.isna(value):
        return default
    return str(value)


def load_fixtures(data_root: Path | str = "data") -> Fixtures:
    """Read the fixture CSVs under *data_root* and shape them into payload data.

    Files read:
        ``screening/TestMVP_citations.csv``, ``screening/TestMVP_criteria.csv``,
        ``screening/TestMVP_questions.csv``,
        ``screening/TestMVP_project_description.csv``,
        ``indexing/TestMVP2_Indexing.csv`` and
        ``indexing/TestMVP2_extraction_fields.csv``.

    Blank text cells are emitted as ``""`` rather than the string ``"nan"``.
    A blank ``project_name`` falls back to ``"TestMVP"``, and a project CSV
    with no data rows yields the same defaults instead of an ``IndexError``.

    Args:
        data_root: Directory containing the ``screening`` and ``indexing``
            sub-directories.

    Returns:
        A populated ``Fixtures`` instance.

    Raises:
        FileNotFoundError: If any of the CSV files is missing.
        KeyError: If a required column (``ID``, ``criteria_name``,
            ``criteria_type``, ``criteria_value``, ``id``, ``country``) is
            absent.
    """
    root = Path(data_root)
    screening = root / "screening"
    indexing = root / "indexing"

    citations = _read_csv(screening / "TestMVP_citations.csv")
    crit = _read_csv(screening / "TestMVP_criteria.csv")
    qs = _read_csv(screening / "TestMVP_questions.csv")
    proj = _read_csv(screening / "TestMVP_project_description.csv")
    idx = _read_csv(indexing / "TestMVP2_Indexing.csv")
    indexer_fields = _load_indexer_fields(indexing / "TestMVP2_extraction_fields.csv")

    # Optional columns go through row.get(), which yields None when the
    # column is absent; _cell_text maps both None and NaN to "".
    papers = [
        {
            "id": str(row["ID"]),
            "title": _cell_text(row.get("Title")),
            "abstract": _cell_text(row.get("Abstract")),
            "citation": _cell_text(row.get("Full_Citation")),
        }
        for _, row in citations.iterrows()
    ]

    criteria = [
        {
            "name": _cell_text(row["criteria_name"]),
            "type": _cell_text(row["criteria_type"]),
            "value": _cell_text(row["criteria_value"]),
        }
        for _, row in crit.iterrows()
    ]

    # Rows with a blank question are skipped entirely.
    questions = [
        str(row["questions"])
        for _, row in qs.iterrows()
        if pd.notna(row.get("questions"))
    ]

    if proj.empty:
        # Header-only CSV: use the same defaults as for a missing column.
        project_name = "TestMVP"
        project_description = ""
    else:
        proj_row = proj.iloc[0]
        project_name = _cell_text(proj_row.get("project_name"), "TestMVP")
        project_description = _cell_text(proj_row.get("project_description"))

    indexer_records = [
        {
            "ID": str(row["id"]),
            "Title": _cell_text(row.get("title")),
            "Abstract": _cell_text(row.get("abstract")),
        }
        for _, row in idx.iterrows()
    ]

    # Distinct, non-blank country strings in first-seen order.
    countries = (
        idx["country"].dropna().astype(str).str.strip()
        .loc[lambda s: s != ""]
        .unique()
        .tolist()
    )

    return Fixtures(
        papers=papers,
        criteria=criteria,
        questions=questions,
        project_name=project_name,
        project_description=project_description,
        indexer_records=indexer_records,
        indexer_fields=indexer_fields,
        country_values=countries,
    )


def sample(fx: Fixtures, n_papers: int, n_records: int | None = None) -> Fixtures:
    """Build a smaller copy of *fx* for the live pass.

    Args:
        fx: Fixtures to downsample; it is not modified.
        n_papers: Slice bound applied to ``fx.papers``.
        n_records: Slice bound applied to ``fx.indexer_records``; when
            ``None`` the value of *n_papers* is used.

    Returns:
        A copy of *fx* whose ``papers`` and ``indexer_records`` are the
        leading slices; every other field is carried over unchanged.
    """
    if n_records is None:
        n_records = n_papers
    trimmed = {
        "papers": fx.papers[:n_papers],
        "indexer_records": fx.indexer_records[:n_records],
    }
    return replace(fx, **trimmed)