"""Centralised prompts for the indexer service."""

from __future__ import annotations


# ---------------------------------------------------------------------------
# Pipeline: extraction prompts (evidence + confidence)
# ---------------------------------------------------------------------------

# System prompt for the extraction pipeline. It tells the model to return, for
# every schema field, a value together with a confidence score and verbatim
# evidence quotes, and it ends with one worked JSON example of that shape.
# The backslash right after the opening quotes suppresses the initial newline.
# NOTE(review): the confidence bands below share their 0.7 and 0.9 endpoints,
# so the prompt text does not define which band a boundary score belongs to.
PIPELINE_SYSTEM_PROMPT = """\
You are an expert system for extracting structured data from scientific literature.

You MUST:
- Extract values strictly according to the schema
- Provide supporting evidence (exact quotes from the input text)
- Assign a confidence score between 0 and 1
- Use null when information is not present
- Prefer explicit statements over inference

Confidence guidelines:
- 0.9-1.0: explicitly stated in the text
- 0.7-0.9: clearly implied by the text
- 0.4-0.7: weak inference from context
- <0.4: uncertain — use null instead

Evidence rules:
- Use exact text spans from the title or abstract
- Do not paraphrase evidence
- Include at least one evidence item if value is not null
- For null values, use empty evidence array

Normalisation:
- Use canonical values when available (e.g., "Randomised Controlled Trial" not "RCT")
- Prefer full names over abbreviations

Example:
Input — Title: Randomized controlled trial of CBT for anxiety. Abstract: 150 patients were randomized to CBT or waitlist control.
Output:
{
  "study_design": {
    "value": "Randomised Controlled Trial",
    "confidence": 0.95,
    "evidence": [{"text": "Randomized controlled trial", "section": "title"}]
  },
  "sample_size": {
    "value": 150,
    "confidence": 0.9,
    "evidence": [{"text": "150 patients were randomized", "section": "abstract"}]
  }
}"""

# User-turn message paired with the pipeline system prompt: asks for output
# that follows the provided schema with value, confidence and evidence per field.
PIPELINE_USER_PROMPT = (
    "Return structured data using the provided schema. "
    "Ensure each field includes value, confidence, and evidence."
)


def build_field_injection_block(fields: list[dict]) -> str:
    """Build a prompt section describing each field for the model.

    This gives the model field-specific guidance (descriptions, types, examples)
    that complements the JSON schema's structural constraints.

    Args:
        fields: Field definitions as dicts. Keys read here (all optional):
            ``name``, ``description``, ``data_type_primary``, ``examples`` and
            ``examples_mode`` (``"guide"`` or ``"enum"``; defaults to
            ``"guide"``). Example values may be of any type; they are rendered
            with ``str()``.

    Returns:
        A newline-joined block that starts with ``"Fields to extract:"`` and a
        blank line, followed by one bullet line per field. Returns ``""`` when
        ``fields`` is empty.
    """
    if not fields:
        return ""
    lines = ["Fields to extract:", ""]
    for f in fields:
        parts = [f"- {f.get('name', 'field')}: {f.get('description', '')}"]
        # ``or`` also covers an explicit ``None`` value, which would otherwise
        # fail on ``.startswith`` below.
        primary = f.get("data_type_primary") or "string"
        if primary.startswith("array"):  # handles "array", "array-string", "array-number"
            parts.append("Return as array.")
        elif primary == "number":
            parts.append("Extract as a number.")
        elif primary == "boolean":
            parts.append("Return true/false.")
        examples = f.get("examples") or []
        if examples:
            # Number/boolean fields can carry non-string examples; str.join
            # only accepts strings, so convert each item first.
            rendered = ", ".join(str(example) for example in examples)
            examples_mode = f.get("examples_mode", "guide")
            if examples_mode == "guide":
                parts.append(f"Examples: {rendered}.")
            elif examples_mode == "enum":
                parts.append(f"Must be one of: {rendered}. Prefer explicit mention.")
        parts.append("Only extract if explicitly stated in the text.")
        lines.append(" ".join(parts))
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# Field refinement
# ---------------------------------------------------------------------------

# System prompt for reviewing user-proposed extraction fields. It asks for a
# JSON array of suggestions (add / modify / remove / merge), each carrying the
# affected field definition and a rationale; an empty array means the fields
# need no changes. The response must be raw JSON without markdown fences.
REFINEMENT_SYSTEM_PROMPT = """\
You are an expert in systematic review methodology and data extraction schema design.
Review the user's proposed extraction fields and suggest improvements.

Return a JSON array of suggestions. Each suggestion should have:
- "action": "add" | "modify" | "remove" | "merge"
- "field": { "name": str, "description": str, "data_type_primary": str, "data_type_secondary": str, "examples": list[str] }
- "rationale": str explaining why this change would improve extraction quality
- "original_field_name": str (only for "modify" actions, the name of the field being modified)
- "target_field_name": str (only for "merge" actions, the name of the field that absorbs the merged one)

Consider:
1. Are field descriptions clear and specific enough for an LLM to extract accurately?
2. Are data types appropriate? (e.g., should a field be an array instead of string?)
3. Are there important fields missing for this type of review?
4. Are any fields redundant or overlapping? If so, suggest merging them.
5. Would examples help clarify expected values?

If the fields look good, return an empty array.
Return ONLY valid JSON, no markdown fences."""


# ---------------------------------------------------------------------------
# Tag grouping
# ---------------------------------------------------------------------------

# System prompt for clustering the values extracted for a single field into
# named groups. It asks for a JSON object with a "groups" array (name, values,
# rationale per group) in which every input value appears in exactly one group,
# with an "Other" group as the fallback for poor fits.
GROUPING_SYSTEM_PROMPT = """\
You are an expert at categorizing and grouping terms from systematic reviews.
Given a list of extracted values from a specific field, group them into meaningful categories.

Return a JSON object with a "groups" array. Each group should have:
- "name": str (short, descriptive group name)
- "values": list[str] (the original values that belong to this group — use exact strings from input)
- "rationale": str (brief explanation of why these values are grouped together)

Guidelines:
- Every input value should appear in exactly one group
- Create 3-10 groups depending on the diversity of values
- Group names should be concise and meaningful for researchers
- If a value doesn't fit well, put it in an "Other" group

Return ONLY valid JSON, no markdown fences."""


# ---------------------------------------------------------------------------
# Field suggestion
# ---------------------------------------------------------------------------

# System prompt for proposing extraction fields from project context
# (description, research questions, optional PICO elements and sample records).
# It asks for a JSON object with "fields" (5-12 entries, each with a type,
# examples and an extraction-difficulty rating) and an optional "warnings" list
# flagging fields that may be unreliable to extract.
SUGGEST_FIELDS_SYSTEM_PROMPT = """You are an expert systematic review data extraction specialist.
Given a project description, research questions, and optionally PICO elements and sample records,
suggest structured extraction fields with appropriate data types and example values.

Return a JSON object with two keys: "fields" and "warnings".

Each field in "fields" must have:
- "name": short snake_case field name
- "description": what this field captures
- "data_type_primary": one of "string", "number", "boolean", "array-string", "array-number"
- "data_type_secondary": "NA" unless needed
- "examples": 2-4 example values (strings)
- "extraction_difficulty": "low" | "medium" | "high" — how reliably this can be extracted from title/abstract

Each warning in "warnings" (optional, may be empty) should flag fields that may be unreliable:
- "field": field name
- "risk_level": "low" | "medium" | "high"
- "reason": why extraction may be unreliable
- "suggested_fallback": alternative approach or broader category

Focus on fields that would be useful for evidence synthesis. Include standard fields
(study_design, sample_size, country, etc.) plus domain-specific ones from the project context.
Return 5-12 fields. Return ONLY valid JSON, no markdown."""

# System prompt for writing descriptions of already-named extraction fields.
# It asks for a JSON object whose "fields" array has one entry per input field,
# keeping each name exactly as given and adding a description, a primary data
# type and example values; no input field may be skipped.
DESCRIBE_FIELDS_SYSTEM_PROMPT = """You are an expert systematic review data extraction specialist.
You are given a list of existing extraction field names along with project context and sample data.
Your job is to write a clear, concise description for EVERY field explaining what it captures
and how it should be extracted from study title/abstracts.

Return a JSON object with a "fields" key containing an array with one entry per input field.

Each field MUST have:
- "name": the exact field name as given (preserve original casing/spacing)
- "description": 1-2 sentence description of what this field captures and how to extract it
- "data_type_primary": one of "string", "number", "boolean", "array-string", "array-number"
- "examples": 2-4 example values (strings) based on the sample data and domain knowledge

You MUST return a description for EVERY field provided. Do not skip any.
Return ONLY valid JSON, no markdown."""


# ---------------------------------------------------------------------------
# Context injection helper (moved from api/routers/indexer.py)
# ---------------------------------------------------------------------------


def build_indexer_system_prompt(
    base_prompt: str,
    description: str | None = None,
    research_questions: list[str] | None = None,
) -> str:
    """Append project context to the base indexer system prompt.

    Args:
        base_prompt: Prompt text that the context is appended to.
        description: Review topic; skipped when ``None`` or empty.
        research_questions: Questions rendered on one line, separated by
            ``"; "``; skipped when ``None`` or empty.

    Returns:
        ``base_prompt`` unchanged when no context is supplied; otherwise
        ``base_prompt``, a blank line, and one line per context item.
    """
    context_lines: list[str] = []
    if description:
        context_lines.append(
            f"You are extracting data for a systematic review about: {description}"
        )
    if research_questions:
        context_lines.append("Research questions: " + "; ".join(research_questions))

    # Nothing to inject: hand back the base prompt untouched.
    if not context_lines:
        return base_prompt
    return "\n\n".join((base_prompt, "\n".join(context_lines)))