#!/usr/bin/env bash
# Live smoke test for the news extraction LLM pipeline.
#
# Runs one extraction call against a real provider and checks that the
# model recovers a known core set of named entities from a fabricated
# news cluster.
#
# Environment (optionally loaded from a sibling .env file):
#   PYTHON_BIN             interpreter to use (default: ./.venv/bin/python)
#   NEWS_EXTRACT_PROVIDER  "openai" or "groq"; auto-detected from API keys
#   NEWS_EXTRACT_MODEL     model id; provider-specific default applied
#   OPENAI_API_KEY / GROQ_API_KEY  credentials for the chosen provider
#   ENTITY_BLACKLIST       optional, passed through (defaults to empty)
#
# Exit codes: 0 pass, 1 python missing, 2 missing entities,
#             3 unexpected extra entities, 4 missing API key,
#             5 unknown provider.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PYTHON_BIN="${PYTHON_BIN:-$ROOT_DIR/.venv/bin/python}"

# Load local overrides; set -a exports every assignment made by .env.
if [[ -f "$ROOT_DIR/.env" ]]; then
  set -a
  # shellcheck disable=SC1090
  source "$ROOT_DIR/.env"
  set +a
fi

if [[ ! -x "$PYTHON_BIN" ]]; then
  echo "ERROR: python not found at $PYTHON_BIN" >&2
  exit 1
fi

# Auto-select a provider when none is configured: prefer whichever API key
# is present, defaulting to openai when neither is set (the credential
# check below will then report the missing key).
if [[ -z "${NEWS_EXTRACT_PROVIDER:-}" ]]; then
  if [[ -n "${OPENAI_API_KEY:-}" ]]; then
    export NEWS_EXTRACT_PROVIDER="openai"
  elif [[ -n "${GROQ_API_KEY:-}" ]]; then
    export NEWS_EXTRACT_PROVIDER="groq"
  else
    export NEWS_EXTRACT_PROVIDER="openai"
  fi
fi

# Apply the provider-specific default model and verify credentials in one
# case statement. FIX: the original used two parallel case statements,
# neither with a *) arm, so an unknown provider value fell through with
# NEWS_EXTRACT_MODEL unset and no credential check at all.
case "${NEWS_EXTRACT_PROVIDER}" in
  openai)
    export NEWS_EXTRACT_MODEL="${NEWS_EXTRACT_MODEL:-gpt-5-nano-2025-08-07}"
    if [[ -z "${OPENAI_API_KEY:-}" ]]; then
      echo "ERROR: OPENAI_API_KEY is not set, so the live OpenAI extraction test cannot run." >&2
      exit 4
    fi
    ;;
  groq)
    export NEWS_EXTRACT_MODEL="${NEWS_EXTRACT_MODEL:-llama4-16e}"
    if [[ -z "${GROQ_API_KEY:-}" ]]; then
      echo "ERROR: GROQ_API_KEY is not set, so the live Groq extraction test cannot run." >&2
      exit 4
    fi
    ;;
  *)
    echo "ERROR: unknown NEWS_EXTRACT_PROVIDER '${NEWS_EXTRACT_PROVIDER}' (expected 'openai' or 'groq')." >&2
    exit 5
    ;;
esac

export ENTITY_BLACKLIST="${ENTITY_BLACKLIST:-}"

# Quoted heredoc delimiter: the Python body is passed verbatim (no shell
# expansion); it reads its configuration via os.getenv instead.
"$PYTHON_BIN" - <<'PY'
import asyncio
import json
import os
import sys

from news_mcp.llm import call_extraction

# A fabricated single-article cluster whose text mentions every entity in
# the acceptance set below.
cluster = {
    "headline": "Reuters says Bitcoin, Ethereum, the Fed, and the ECB reacted as Trump and the EU discussed Iran and Israel",
    "summary": (
        "In a fictional test report, Reuters described Bitcoin, Ethereum, "
        "the Federal Reserve, and the European Central Bank. Trump, the EU, "
        "Iran, and Israel were all mentioned in the same narrative."
    ),
    "articles": [
        {
            "title": "Reuters says Bitcoin, Ethereum, the Fed, and the ECB reacted as Trump and the EU discussed Iran and Israel",
            "url": "https://example.com/test",
            "source": "TestSource",
            "timestamp": "Tue, 31 Mar 2026 12:00:00 GMT",
            "summary": "A fabricated test story involving several named entities.",
        }
    ],
}

# Quantifiable acceptance set: the model may canonicalize some entities,
# but it must recover the core set below.
expected_any = {
    "Reuters",
    "Bitcoin",
    "Ethereum",
    "Fed",
    "ECB",
    "Trump",
    "EU",
    "Iran",
    "Israel",
}

# Lowercased model output -> canonical short form used in expected_any.
canonical_map = {
    "federal reserve": "Fed",
    "federalreserve": "Fed",
    "european central bank": "ECB",
    "ecb": "ECB",
    "european union": "EU",
    "eu": "EU",
    "donald trump": "Trump",
    "trump": "Trump",
}


async def main() -> int:
    """Run one live extraction and compare entities against expected_any.

    Returns 0 on pass, 2 when core entities are missing, 3 when
    unexpected extra entities appear.
    """
    out = await call_extraction(cluster)
    entities = out.get("entities", [])

    # Canonicalize case-insensitively; unknown entities keep their
    # original (stripped) spelling.
    normalized = set()
    for ent in entities:
        key = str(ent).strip().lower()
        normalized.add(canonical_map.get(key, str(ent).strip()))

    missing = sorted(expected_any - normalized)
    extra = sorted(normalized - expected_any)

    # Full machine-readable report on stdout for debugging/CI logs.
    print(json.dumps({
        "provider": os.getenv("NEWS_EXTRACT_PROVIDER"),
        "model": os.getenv("NEWS_EXTRACT_MODEL"),
        "output": out,
        "normalized_entities": sorted(normalized),
        "missing": missing,
        "extra": extra,
    }, ensure_ascii=False, indent=2))

    if missing:
        print(f"FAIL: missing entities: {missing}", file=sys.stderr)
        return 2

    # Extra entities are tolerated only if they are generic / helpful.
    allowed_extras = {"Macro", "Crypto"}
    bad_extra = [e for e in extra if e not in allowed_extras]
    if bad_extra:
        print(f"FAIL: unexpected extra entities: {bad_extra}", file=sys.stderr)
        return 3

    print("PASS: live extraction smoke test matched expected core entities")
    return 0


raise SystemExit(asyncio.run(main()))
PY