Ver Fonte

Release v0.1.0

Lukas Goldschmidt há 1 mês atrás
pai
commit
64bf700047

+ 1 - 0
.env.example

@@ -28,3 +28,4 @@ NEWS_REFRESH_INTERVAL_SECONDS=900
 NEWS_BACKGROUND_REFRESH_ENABLED=true
 NEWS_BACKGROUND_REFRESH_ON_START=true
 NEWS_PROMPTS_DIR=
+NEWS_ENTITY_ALIASES_FILE=

+ 34 - 0
OUTLOOK.md

@@ -388,3 +388,37 @@ But only if you:
 * cluster events
 * compress information
 
+---
+
+# ✅ Completed since this outlook was written
+
+* provider-agnostic LLM extraction/summarization layer added
+* prompts moved into separate files for easier updates
+* entity blacklist implemented and made case-insensitive
+* live extraction smoke test added
+* docs updated with the new env vars and workflow
+
+---
+
+# 🔭 Next high-level steps
+
+1. **Normalization layer**
+
+   * canonicalize acronyms and entity variants before storage / querying
+   * keep the blacklist as a separate post-processing rule
+
+2. **Wildcard blacklist support**
+
+   * allow patterns for entities / topics / keywords
+   * keep matching case-insensitive
+
+3. **Emerging signal quality**
+
+   * tune what counts as an emerging topic/entity
+   * reduce noise from repeated source names and generic terms
+
+4. **Entity/time tracking and replay (future capability)**
+
+   * track how important entities evolve over time
+   * allow replay of when entities first appeared, how topics shifted, and how sentiment changed
+   * useful later for narrative reconstruction and trend timelines

+ 1 - 0
README.md

@@ -50,6 +50,7 @@ Key variables:
 - `GROQ_API_KEY`, `OPENAI_API_KEY`
 - `ENTITY_BLACKLIST` (comma-separated, case-insensitive entity match; supports `*` wildcard patterns)
 - `NEWS_PROMPTS_DIR` (override prompt directory)
+- `NEWS_ENTITY_ALIASES_FILE` (override entity alias JSON file)
 - `NEWS_FEED_URL` (single feed fallback)
 - `NEWS_FEED_URLS` (comma-separated feed URLs; overrides `NEWS_FEED_URL`)
 - `NEWS_REFRESH_INTERVAL_SECONDS` (default 900)

+ 16 - 0
config/entity_aliases.json

@@ -0,0 +1,16 @@
+{
+  "btc": "Bitcoin",
+  "bitcoin": "Bitcoin",
+  "eth": "Ethereum",
+  "ethereum": "Ethereum",
+  "fed": "Federal Reserve",
+  "federal reserve": "Federal Reserve",
+  "ecb": "European Central Bank",
+  "european central bank": "European Central Bank",
+  "eu": "European Union",
+  "european union": "European Union",
+  "trump": "Donald Trump",
+  "donald trump": "Donald Trump",
+  "merz": "Friedrich Merz",
+  "friedrich merz": "Friedrich Merz"
+}

+ 1 - 0
news_mcp/config.py

@@ -10,6 +10,7 @@ DATA_DIR = Path(os.getenv("NEWS_MCP_DATA_DIR", Path(__file__).resolve().parent /
 DATA_DIR.mkdir(parents=True, exist_ok=True)
 DB_PATH = Path(os.getenv("NEWS_MCP_DB_PATH", str(DATA_DIR / "news.sqlite")))
 PROMPTS_DIR = Path(os.getenv("NEWS_PROMPTS_DIR", str(_HERE / "prompts")))
+ENTITY_ALIASES_FILE = Path(os.getenv("NEWS_ENTITY_ALIASES_FILE", str(_HERE / "config" / "entity_aliases.json")))
 
 NEWS_FEED_URL = os.getenv("NEWS_FEED_URL", os.getenv("NEWS_RSS_FEED_URL", "https://breakingthenews.net/news-feed.xml"))
 NEWS_FEED_URLS = os.getenv("NEWS_FEED_URLS", os.getenv("NEWS_RSS_FEED_URLS", "")).strip()

+ 0 - 194
news_mcp/enrichment/groq_enrich.py

@@ -1,194 +0,0 @@
-from __future__ import annotations
-
-import json
-import logging
-from typing import Any, Dict, List
-
-import httpx
-
-from news_mcp.config import GROQ_API_KEY, GROQ_MODEL, GROQ_DEBUG
-
-
-logger = logging.getLogger(__name__)
-
-
-_SYSTEM = "You are a news signal extraction engine. Return STRICT JSON only."
-
-
-def _build_prompt(articles: List[Dict[str, Any]], headline: str, summary: str | None) -> str:
-    # Keep prompt compact: clusters already deduped.
-    sample = articles[:6]
-    return json.dumps(
-        {
-            "cluster": {
-                "headline": headline,
-                "summary": summary or "",
-                "articles": [
-                    {
-                        "title": a.get("title"),
-                        "url": a.get("url"),
-                        "source": a.get("source"),
-                        "timestamp": a.get("timestamp"),
-                        "summary": a.get("summary", ""),
-                    }
-                    for a in sample
-                ],
-            }
-        },
-        ensure_ascii=False,
-    )
-
-
-async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
-    if not GROQ_API_KEY:
-        # No enrichment configured.
-        return cluster
-
-    headline = cluster.get("headline", "")
-    summary = cluster.get("summary", "")
-    articles = cluster.get("articles", [])
-
-    user_payload = _build_prompt(articles=articles, headline=headline, summary=summary)
-
-    prompt = (
-        f"Input cluster JSON:\n{user_payload}\n\n"
-        "You MUST extract a news signal from the headline AND summary. Do not leave entities empty when the text mentions obvious names.\n"
-        "Task:\n"
-        "1) infer the best top-level topic\n"
-        "2) extract concise entities from the cluster\n"
-        "3) assign sentiment from the wording/context\n"
-        "4) provide short keywords that justify the classification\n\n"
-        "Entity rules (strict):\n"
-        "- Use short strings (1-5 words).\n"
-        "- Include all obvious named entities mentioned in headline or summary: people, countries, regions, organizations, ministries, presidents, leaders, wars/conflicts if named.\n"
-        "- Also include finance/crypto entities when present: BTC, ETH, Bitcoin, Ethereum, ETF, SEC, ECB, Fed, euro, inflation, rates.\n"
-        "- If the cluster mentions Iran, UAE, Egypt, Germany, Europe, Trump, Merz, Sisi, those should appear in entities.\n"
-        "- Do NOT return empty entities if any such names/places appear.\n\n"
-        "Sentiment rules:\n"
-        "- positive: clearly encouraging, improving, or supportive tone\n"
-        "- negative: clearly alarming, worsening, severe, conflict, loss, risk, warning tone\n"
-        "- neutral: factual, balanced, or mixed\n"
-        "- sentimentScore must be a number from -1.0 to 1.0 and should reflect the sentiment label.\n\n"
-        "Return STRICT JSON with EXACT keys only:\n"
-        "{ topic, entities, sentiment, sentimentScore, keywords }\n"
-        "where topic is one of [crypto, macro, regulation, ai, other].\n"
-    )
-
-    if GROQ_DEBUG:
-        msg = f"[GROQ PROMPT] {prompt}"
-        logger.warning(msg)
-        print(msg, flush=True)
-
-    req = {
-        "model": GROQ_MODEL,
-        "messages": [
-            {"role": "system", "content": _SYSTEM},
-            {"role": "user", "content": prompt},
-        ],
-        "temperature": 0.2,
-        "response_format": {"type": "json_object"},
-    }
-
-    async with httpx.AsyncClient(timeout=30.0) as client:
-        resp = await client.post(
-            "https://api.groq.com/openai/v1/chat/completions",
-            headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
-            json=req,
-        )
-        resp.raise_for_status()
-        data = resp.json()
-
-    content = data["choices"][0]["message"]["content"]
-
-    if GROQ_DEBUG:
-        msg = f"[GROQ RAW RESPONSE] {content}"
-        logger.warning(msg)
-        print(msg, flush=True)
-
-    parsed = json.loads(content)
-
-    # Normalize output types into our cluster shape.
-    topic = parsed.get("topic") or cluster.get("topic")
-    entities = parsed.get("entities") or []
-    sentiment = parsed.get("sentiment") or "neutral"
-    sentiment_score = parsed.get("sentimentScore")
-    keywords = parsed.get("keywords") or []
-
-    out = dict(cluster)
-    if topic:
-        out["topic"] = topic
-    out["entities"] = entities
-    out["sentiment"] = sentiment
-    if sentiment_score is not None:
-        out["sentimentScore"] = float(sentiment_score)
-    out["keywords"] = keywords
-    return out
-
-
-async def summarize_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
-    """Produce a compact agent-facing summary.
-
-    Returns:
-      {
-        "headline": str,
-        "mergedSummary": str,
-        "keyFacts": [str,...],
-        "sources": [str,...]
-      }
-    """
-    if not GROQ_API_KEY:
-        return {
-            "headline": cluster.get("headline"),
-            "mergedSummary": cluster.get("summary"),
-            "keyFacts": [],
-            "sources": cluster.get("sources", []),
-        }
-
-    headline = cluster.get("headline", "")
-    summary = cluster.get("summary", "")
-    articles = cluster.get("articles", [])
-
-    sample = articles[:5]
-    req = {
-        "model": GROQ_MODEL,
-        "messages": [
-            {
-                "role": "system",
-                "content": "You are a summarization engine for news clusters. Return strict JSON only.",
-            },
-            {
-                "role": "user",
-                "content": json.dumps(
-                    {
-                        "headline": headline,
-                        "summary": summary,
-                        "articles": [
-                            {
-                                "title": a.get("title"),
-                                "url": a.get("url"),
-                                "source": a.get("source"),
-                                "timestamp": a.get("timestamp"),
-                            }
-                            for a in sample
-                        ],
-                    },
-                    ensure_ascii=False,
-                )
-                + "\n\nReturn keys: headline, mergedSummary, keyFacts (5-8 strings), sources. mergedSummary should be 2-4 sentences.",
-            },
-        ],
-        "temperature": 0.2,
-        "response_format": {"type": "json_object"},
-    }
-
-    async with httpx.AsyncClient(timeout=45.0) as client:
-        resp = await client.post(
-            "https://api.groq.com/openai/v1/chat/completions",
-            headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
-            json=req,
-        )
-        resp.raise_for_status()
-        data = resp.json()
-    content = data["choices"][0]["message"]["content"]
-    parsed = json.loads(content)
-    return parsed

+ 17 - 6
news_mcp/enrichment/llm_enrich.py

@@ -1,17 +1,25 @@
 from __future__ import annotations
 
+from fnmatch import fnmatchcase
 from typing import Any, Dict
 
 from news_mcp.config import NEWS_ENTITY_BLACKLIST
+from news_mcp.entity_normalize import normalize_entities
 from news_mcp.llm import call_extraction, call_summary
 
 
+def _matches_blacklist(value: str, blacklist=None) -> bool:
+    patterns = [x.strip().lower() for x in (blacklist if blacklist is not None else NEWS_ENTITY_BLACKLIST) if x and x.strip()]
+    key = str(value).strip().lower()
+    if not key:
+        return True
+    return any(fnmatchcase(key, pattern) for pattern in patterns)
+
+
 def _filter_entities(entities, blacklist=None):
-    banned = set(x.strip().lower() for x in (blacklist if blacklist is not None else NEWS_ENTITY_BLACKLIST))
     out = []
     for ent in entities or []:
-        key = str(ent).strip().lower()
-        if not key or key in banned:
+        if _matches_blacklist(ent, blacklist=blacklist):
             continue
         out.append(ent)
     return out
@@ -20,12 +28,15 @@ def _filter_entities(entities, blacklist=None):
 async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
     parsed = await call_extraction(cluster)
     out = dict(cluster)
+    topic = parsed.get("topic", cluster.get("topic"))
+    if topic and _matches_blacklist(topic):
+        topic = "other"
     out.update({
-        "topic": parsed.get("topic", cluster.get("topic")),
-        "entities": _filter_entities(parsed.get("entities", [])),
+        "topic": topic,
+        "entities": normalize_entities(_filter_entities(parsed.get("entities", []))),
         "sentiment": parsed.get("sentiment", "neutral"),
         "sentimentScore": parsed.get("sentimentScore"),
-        "keywords": parsed.get("keywords", []),
+        "keywords": normalize_entities(_filter_entities(parsed.get("keywords", []))),
     })
     return out
 

+ 54 - 0
news_mcp/entity_normalize.py

@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import json
+from functools import lru_cache
+from pathlib import Path
+from typing import Iterable
+
+from news_mcp.config import ENTITY_ALIASES_FILE
+
+# Canonical alias map loaded from the JSON file at ENTITY_ALIASES_FILE.
+# Keep the file conservative and grow it only when a shorthand is clearly useful.
+@lru_cache(maxsize=1)
+def _alias_map() -> dict[str, str]:
+    path = Path(ENTITY_ALIASES_FILE)
+    if not path.exists():
+        return {}
+    try:
+        raw = json.loads(path.read_text(encoding="utf-8"))
+    except Exception:
+        return {}
+    out: dict[str, str] = {}
+    if isinstance(raw, dict):
+        for k, v in raw.items():
+            if k and v:
+                out[str(k).strip().lower()] = str(v).strip()
+    return out
+
+
+def _lookup_alias(key: str) -> str | None:
+    return _alias_map().get(key)
+
+
+def normalize_entity(value: str) -> str:
+    key = str(value).strip().lower()
+    if not key:
+        return ""
+    return _lookup_alias(key) or str(value).strip()
+
+
+def normalize_query(value: str) -> str:
+    return normalize_entity(value)
+
+
+def normalize_entities(values: Iterable[str]) -> list[str]:
+    out: list[str] = []
+    seen: set[str] = set()
+    for value in values or []:
+        norm = normalize_entity(value)
+        key = norm.lower()
+        if not norm or key in seen:
+            continue
+        seen.add(key)
+        out.append(norm)
+    return out

+ 11 - 15
news_mcp/mcp_server_fastmcp.py

@@ -10,6 +10,7 @@ from news_mcp.jobs.poller import refresh_clusters
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
 from news_mcp.enrichment.llm_enrich import summarize_cluster_groq
 from news_mcp.llm import active_llm_config
+from news_mcp.entity_normalize import normalize_query
 from collections import Counter
 import logging
 
@@ -25,7 +26,7 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5):
     limit = max(1, min(int(limit), 20))
     # In v1, `topic` is a coarse category. If the caller passes an entity name
     # (e.g. "trump"/"iran"), gracefully fall back to `other`.
-    topic_norm = str(topic).strip().lower()
+    topic_norm = normalize_query(topic).lower()
     allowed = {t.lower() for t in DEFAULT_TOPICS}
     if topic_norm not in allowed:
         topic_norm = "other"
@@ -61,7 +62,7 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5):
 @mcp.tool(description="What's happening with X? Filter latest clusters by extracted entity substring (case-insensitive).")
 async def get_events_for_entity(entity: str, limit: int = 10):
     limit = max(1, min(int(limit), 30))
-    query = str(entity).strip().lower()
+    query = normalize_query(entity).strip().lower()
     if not query:
         return []
 
@@ -189,7 +190,7 @@ async def detect_emerging_topics(limit: int = 10):
 async def get_news_sentiment(entity: str, timeframe: str = "24h"):
     store = SQLiteClusterStore(DB_PATH)
 
-    ent = str(entity).strip().lower()
+    ent = normalize_query(entity).strip().lower()
     if not ent:
         return {
             "entity": entity,
@@ -222,7 +223,6 @@ async def get_news_sentiment(entity: str, timeframe: str = "24h"):
         }
 
     scores = []
-    labels = []
     for c in matched:
         s = c.get("sentimentScore")
         if s is not None:
@@ -230,21 +230,17 @@ async def get_news_sentiment(entity: str, timeframe: str = "24h"):
                 scores.append(float(s))
             except Exception:
                 pass
-        lbl = c.get("sentiment")
-        if lbl:
-            labels.append(str(lbl).lower())
 
     avg_score = sum(scores) / len(scores) if scores else 0.0
 
-    # Majority vote on sentiment label, fall back to sign of avg score.
-    if labels:
-        majority = Counter(labels).most_common(1)[0][0]
-        if majority in {"positive", "negative", "neutral"}:
-            sentiment = majority
-        else:
-            sentiment = "positive" if avg_score > 0 else "negative" if avg_score < 0 else "neutral"
+    # Keep the label aligned with the numeric score.
+    # Small magnitudes are treated as neutral to avoid noisy label flips.
+    if avg_score >= 0.15:
+        sentiment = "positive"
+    elif avg_score <= -0.15:
+        sentiment = "negative"
     else:
-        sentiment = "positive" if avg_score > 0 else "negative" if avg_score < 0 else "neutral"
+        sentiment = "neutral"
 
     return {
         "entity": entity,

+ 18 - 1
test_news_mcp.py

@@ -6,7 +6,8 @@ from pathlib import Path
 from news_mcp.dedup.cluster import dedup_and_cluster_articles
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
 from news_mcp.enrichment.importance import compute_importance
-from news_mcp.enrichment.llm_enrich import _filter_entities
+from news_mcp.enrichment.llm_enrich import _filter_entities, _matches_blacklist
+from news_mcp.entity_normalize import normalize_query, normalize_entities
 from news_mcp.llm import build_extraction_prompt, call_llm, load_prompt
 
 
@@ -82,6 +83,22 @@ def test_blacklist_filters_entities_case_insensitively():
     assert filtered == ["Reuters", "CoinDesk"]
 
 
+def test_blacklist_supports_wildcards():
+    assert _matches_blacklist("Bloomberg Economics", blacklist=["bloomberg*"])
+    assert _matches_blacklist("bloomberg", blacklist=["*berg"])
+    assert not _matches_blacklist("Reuters", blacklist=["bloomberg*"])
+
+
+def test_query_normalization_keeps_common_shorthand_working():
+    assert normalize_query("btc") == "Bitcoin"
+    assert normalize_query("Trump") == "Donald Trump"
+    assert normalize_query("nvidia") == "nvidia"
+
+
+def test_entity_normalization_deduplicates_aliases():
+    assert normalize_entities(["btc", "Bitcoin", "BTC", "Ethereum"]) == ["Bitcoin", "Ethereum"]
+
+
 def test_load_prompt_reads_prompt_files():
     text = load_prompt("extract_entities.prompt")
     assert "Return STRICT JSON" in text