1 месяц назад · 64bf700047
--- a/.env.example
+++ b/.env.example
@@ -28,3 +28,4 @@ NEWS_REFRESH_INTERVAL_SECONDS=900
 
															 NEWS_BACKGROUND_REFRESH_ENABLED=true
														
 
															 NEWS_BACKGROUND_REFRESH_ON_START=true
														
 
															 NEWS_PROMPTS_DIR=
														
 
															+NEWS_ENTITY_ALIASES_FILE=
														
--- a/OUTLOOK.md
+++ b/OUTLOOK.md
@@ -388,3 +388,37 @@ But only if you:
 
															 * cluster events
														
 
															 * compress information
														
 
															+---
														
 
															+
														
 
															+# ✅ Completed since this outlook was written
														
 
															+
														
 
															+* provider-agnostic LLM extraction/summarization layer added
														
 
															+* prompts moved into separate files for easier updates
														
 
															+* entity blacklist implemented and made case-insensitive
														
 
															+* live extraction smoke test added
														
 
															+* docs updated with the new env vars and workflow
														
 
															+
														
 
															+---
														
 
															+
														
 
															+# 🔭 Next high-level steps
														
 
															+
														
 
															+1. **Normalization layer**
														
 
															+
														
 
															+   * canonicalize acronyms and entity variants before storage / querying
														
 
															+   * keep the blacklist as a separate post-processing rule
														
 
															+
														
 
															+2. **Wildcard blacklist support**
														
 
															+
														
 
															+   * allow patterns for entities / topics / keywords
														
 
															+   * keep matching case-insensitive
														
 
															+
														
 
															+3. **Emerging signal quality**
														
 
															+
														
 
															+   * tune what counts as an emerging topic/entity
														
 
															+   * reduce noise from repeated source names and generic terms
														
 
															+
														
 
															+4. **Entity/time tracking and replay (future capability)**
														
 
															+
														
 
															+   * track how important entities evolve over time
														
 
															+   * allow replay of when entities first appeared, how topics shifted, and how sentiment changed
														
 
															+   * useful later for narrative reconstruction and trend timelines
														
--- a/README.md
+++ b/README.md
@@ -50,6 +50,7 @@ Key variables:
 
															 - `GROQ_API_KEY`, `OPENAI_API_KEY`
														
 
															 - `ENTITY_BLACKLIST` (comma-separated, case-insensitive exact entity match)
														
 
															 - `NEWS_PROMPTS_DIR` (override prompt directory)
														
 
															+- `NEWS_ENTITY_ALIASES_FILE` (override entity alias JSON file)
														
 
															 - `NEWS_FEED_URL` (single feed fallback)
														
 
															 - `NEWS_FEED_URLS` (comma-separated feed URLs; overrides `NEWS_FEED_URL`)
														
 
															 - `NEWS_REFRESH_INTERVAL_SECONDS` (default 900)
														
--- a/config/entity_aliases.json
+++ b/config/entity_aliases.json
@@ -0,0 +1,16 @@
 
															+{
														
 
															+  "btc": "Bitcoin",
														
 
															+  "bitcoin": "Bitcoin",
														
 
															+  "eth": "Ethereum",
														
 
															+  "ethereum": "Ethereum",
														
 
															+  "fed": "Federal Reserve",
														
 
															+  "federal reserve": "Federal Reserve",
														
 
															+  "ecb": "European Central Bank",
														
 
															+  "european central bank": "European Central Bank",
														
 
															+  "eu": "European Union",
														
 
															+  "european union": "European Union",
														
 
															+  "trump": "Donald Trump",
														
 
															+  "donald trump": "Donald Trump",
														
 
															+  "merz": "Friedrich Merz",
														
 
															+  "friedrich merz": "Friedrich Merz"
														
 
															+}
														
--- a/news_mcp/config.py
+++ b/news_mcp/config.py
@@ -10,6 +10,7 @@ DATA_DIR = Path(os.getenv("NEWS_MCP_DATA_DIR", Path(__file__).resolve().parent /
 
															 DATA_DIR.mkdir(parents=True, exist_ok=True)
														
 
															 DB_PATH = Path(os.getenv("NEWS_MCP_DB_PATH", str(DATA_DIR / "news.sqlite")))
														
 
															 PROMPTS_DIR = Path(os.getenv("NEWS_PROMPTS_DIR", str(_HERE / "prompts")))
														
 
															+ENTITY_ALIASES_FILE = Path(os.getenv("NEWS_ENTITY_ALIASES_FILE", str(_HERE / "config" / "entity_aliases.json")))
														
 
															 NEWS_FEED_URL = os.getenv("NEWS_FEED_URL", os.getenv("NEWS_RSS_FEED_URL", "https://breakingthenews.net/news-feed.xml"))
														
 
															 NEWS_FEED_URLS = os.getenv("NEWS_FEED_URLS", os.getenv("NEWS_RSS_FEED_URLS", "")).strip()
														
--- a/news_mcp/enrichment/groq_enrich.py
+++ b/news_mcp/enrichment/groq_enrich.py
@@ -1,194 +0,0 @@
 
															-from __future__ import annotations
														
 
															-
														
 
															-import json
														
 
															-import logging
														
 
															-from typing import Any, Dict, List
														
 
															-
														
 
															-import httpx
														
 
															-
														
 
															-from news_mcp.config import GROQ_API_KEY, GROQ_MODEL, GROQ_DEBUG
														
 
															-
														
 
															-
														
 
															-logger = logging.getLogger(__name__)
														
 
															-
														
 
															-
														
 
															-_SYSTEM = "You are a news signal extraction engine. Return STRICT JSON only."
														
 
															-
														
 
															-
														
 
															-def _build_prompt(articles: List[Dict[str, Any]], headline: str, summary: str | None) -> str:
														
 
															-    # Keep prompt compact: clusters already deduped.
														
 
															-    sample = articles[:6]
														
 
															-    return json.dumps(
														
 
															-        {
														
 
															-            "cluster": {
														
 
															-                "headline": headline,
														
 
															-                "summary": summary or "",
														
 
															-                "articles": [
														
 
															-                    {
														
 
															-                        "title": a.get("title"),
														
 
															-                        "url": a.get("url"),
														
 
															-                        "source": a.get("source"),
														
 
															-                        "timestamp": a.get("timestamp"),
														
 
															-                        "summary": a.get("summary", ""),
														
 
															-                    }
														
 
															-                    for a in sample
														
 
															-                ],
														
 
															-            }
														
 
															-        },
														
 
															-        ensure_ascii=False,
														
 
															-    )
														
 
															-
														
 
															-
														
 
															-async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
														
 
															-    if not GROQ_API_KEY:
														
 
															-        # No enrichment configured.
														
 
															-        return cluster
														
 
															-
														
 
															-    headline = cluster.get("headline", "")
														
 
															-    summary = cluster.get("summary", "")
														
 
															-    articles = cluster.get("articles", [])
														
 
															-
														
 
															-    user_payload = _build_prompt(articles=articles, headline=headline, summary=summary)
														
 
															-
														
 
															-    prompt = (
														
 
															-        f"Input cluster JSON:\n{user_payload}\n\n"
														
 
															-        "You MUST extract a news signal from the headline AND summary. Do not leave entities empty when the text mentions obvious names.\n"
														
 
															-        "Task:\n"
														
 
															-        "1) infer the best top-level topic\n"
														
 
															-        "2) extract concise entities from the cluster\n"
														
 
															-        "3) assign sentiment from the wording/context\n"
														
 
															-        "4) provide short keywords that justify the classification\n\n"
														
 
															-        "Entity rules (strict):\n"
														
 
															-        "- Use short strings (1-5 words).\n"
														
 
															-        "- Include all obvious named entities mentioned in headline or summary: people, countries, regions, organizations, ministries, presidents, leaders, wars/conflicts if named.\n"
														
 
															-        "- Also include finance/crypto entities when present: BTC, ETH, Bitcoin, Ethereum, ETF, SEC, ECB, Fed, euro, inflation, rates.\n"
														
 
															-        "- If the cluster mentions Iran, UAE, Egypt, Germany, Europe, Trump, Merz, Sisi, those should appear in entities.\n"
														
 
															-        "- Do NOT return empty entities if any such names/places appear.\n\n"
														
 
															-        "Sentiment rules:\n"
														
 
															-        "- positive: clearly encouraging, improving, or supportive tone\n"
														
 
															-        "- negative: clearly alarming, worsening, severe, conflict, loss, risk, warning tone\n"
														
 
															-        "- neutral: factual, balanced, or mixed\n"
														
 
															-        "- sentimentScore must be a number from -1.0 to 1.0 and should reflect the sentiment label.\n\n"
														
 
															-        "Return STRICT JSON with EXACT keys only:\n"
														
 
															-        "{ topic, entities, sentiment, sentimentScore, keywords }\n"
														
 
															-        "where topic is one of [crypto, macro, regulation, ai, other].\n"
														
 
															-    )
														
 
															-
														
 
															-    if GROQ_DEBUG:
														
 
															-        msg = f"[GROQ PROMPT] {prompt}"
														
 
															-        logger.warning(msg)
														
 
															-        print(msg, flush=True)
														
 
															-
														
 
															-    req = {
														
 
															-        "model": GROQ_MODEL,
														
 
															-        "messages": [
														
 
															-            {"role": "system", "content": _SYSTEM},
														
 
															-            {"role": "user", "content": prompt},
														
 
															-        ],
														
 
															-        "temperature": 0.2,
														
 
															-        "response_format": {"type": "json_object"},
														
 
															-    }
														
 
															-
														
 
															-    async with httpx.AsyncClient(timeout=30.0) as client:
														
 
															-        resp = await client.post(
														
 
															-            "https://api.groq.com/openai/v1/chat/completions",
														
 
															-            headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
														
 
															-            json=req,
														
 
															-        )
														
 
															-        resp.raise_for_status()
														
 
															-        data = resp.json()
														
 
															-
														
 
															-    content = data["choices"][0]["message"]["content"]
														
 
															-
														
 
															-    if GROQ_DEBUG:
														
 
															-        msg = f"[GROQ RAW RESPONSE] {content}"
														
 
															-        logger.warning(msg)
														
 
															-        print(msg, flush=True)
														
 
															-
														
 
															-    parsed = json.loads(content)
														
 
															-
														
 
															-    # Normalize output types into our cluster shape.
														
 
															-    topic = parsed.get("topic") or cluster.get("topic")
														
 
															-    entities = parsed.get("entities") or []
														
 
															-    sentiment = parsed.get("sentiment") or "neutral"
														
 
															-    sentiment_score = parsed.get("sentimentScore")
														
 
															-    keywords = parsed.get("keywords") or []
														
 
															-
														
 
															-    out = dict(cluster)
														
 
															-    if topic:
														
 
															-        out["topic"] = topic
														
 
															-    out["entities"] = entities
														
 
															-    out["sentiment"] = sentiment
														
 
															-    if sentiment_score is not None:
														
 
															-        out["sentimentScore"] = float(sentiment_score)
														
 
															-    out["keywords"] = keywords
														
 
															-    return out
														
 
															-
														
 
															-
														
 
															-async def summarize_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
														
 
															-    """Produce a compact agent-facing summary.
														
 
															-
														
 
															-    Returns:
														
 
															-      {
														
 
															-        "headline": str,
														
 
															-        "mergedSummary": str,
														
 
															-        "keyFacts": [str,...],
														
 
															-        "sources": [str,...]
														
 
															-      }
														
 
															-    """
														
 
															-    if not GROQ_API_KEY:
														
 
															-        return {
														
 
															-            "headline": cluster.get("headline"),
														
 
															-            "mergedSummary": cluster.get("summary"),
														
 
															-            "keyFacts": [],
														
 
															-            "sources": cluster.get("sources", []),
														
 
															-        }
														
 
															-
														
 
															-    headline = cluster.get("headline", "")
														
 
															-    summary = cluster.get("summary", "")
														
 
															-    articles = cluster.get("articles", [])
														
 
															-
														
 
															-    sample = articles[:5]
														
 
															-    req = {
														
 
															-        "model": GROQ_MODEL,
														
 
															-        "messages": [
														
 
															-            {
														
 
															-                "role": "system",
														
 
															-                "content": "You are a summarization engine for news clusters. Return strict JSON only.",
														
 
															-            },
														
 
															-            {
														
 
															-                "role": "user",
														
 
															-                "content": json.dumps(
														
 
															-                    {
														
 
															-                        "headline": headline,
														
 
															-                        "summary": summary,
														
 
															-                        "articles": [
														
 
															-                            {
														
 
															-                                "title": a.get("title"),
														
 
															-                                "url": a.get("url"),
														
 
															-                                "source": a.get("source"),
														
 
															-                                "timestamp": a.get("timestamp"),
														
 
															-                            }
														
 
															-                            for a in sample
														
 
															-                        ],
														
 
															-                    },
														
 
															-                    ensure_ascii=False,
														
 
															-                )
														
 
															-                + "\n\nReturn keys: headline, mergedSummary, keyFacts (5-8 strings), sources. mergedSummary should be 2-4 sentences.",
														
 
															-            },
														
 
															-        ],
														
 
															-        "temperature": 0.2,
														
 
															-        "response_format": {"type": "json_object"},
														
 
															-    }
														
 
															-
														
 
															-    async with httpx.AsyncClient(timeout=45.0) as client:
														
 
															-        resp = await client.post(
														
 
															-            "https://api.groq.com/openai/v1/chat/completions",
														
 
															-            headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
														
 
															-            json=req,
														
 
															-        )
														
 
															-        resp.raise_for_status()
														
 
															-        data = resp.json()
														
 
															-    content = data["choices"][0]["message"]["content"]
														
 
															-    parsed = json.loads(content)
														
 
															-    return parsed
														
--- a/news_mcp/enrichment/llm_enrich.py
+++ b/news_mcp/enrichment/llm_enrich.py
@@ -1,17 +1,25 @@
 
															 from __future__ import annotations
														
 
															+from fnmatch import fnmatchcase
														
 
															 from typing import Any, Dict
														
 
															 from news_mcp.config import NEWS_ENTITY_BLACKLIST
														
 
															+from news_mcp.entity_normalize import normalize_entities
														
 
															 from news_mcp.llm import call_extraction, call_summary
														
 
															+def _matches_blacklist(value: str, blacklist=None) -> bool:
														
 
															+    patterns = [x.strip().lower() for x in (blacklist if blacklist is not None else NEWS_ENTITY_BLACKLIST) if x and x.strip()]
														
 
															+    key = str(value).strip().lower()
														
 
															+    if not key:
														
 
															+        return True
														
 
															+    return any(fnmatchcase(key, pattern) for pattern in patterns)
														
 
															+
														
 
															+
														
 
															 def _filter_entities(entities, blacklist=None):
														
 
															-    banned = set(x.strip().lower() for x in (blacklist if blacklist is not None else NEWS_ENTITY_BLACKLIST))
														
 
															     out = []
														
 
															     for ent in entities or []:
														
 
															-        key = str(ent).strip().lower()
														
 
															-        if not key or key in banned:
														
 
															+        if _matches_blacklist(ent, blacklist=blacklist):
														
 
															             continue
														
 
															         out.append(ent)
														
 
															     return out
														
@@ -20,12 +28,15 @@ def _filter_entities(entities, blacklist=None):
 
															 async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
														
 
															     parsed = await call_extraction(cluster)
														
 
															     out = dict(cluster)
														
 
															+    topic = parsed.get("topic", cluster.get("topic"))
														
 
															+    if topic and _matches_blacklist(topic):
														
 
															+        topic = "other"
														
 
															     out.update({
														
 
															-        "topic": parsed.get("topic", cluster.get("topic")),
														
 
															-        "entities": _filter_entities(parsed.get("entities", [])),
														
 
															+        "topic": topic,
														
 
															+        "entities": normalize_entities(_filter_entities(parsed.get("entities", []))),
														
 
															         "sentiment": parsed.get("sentiment", "neutral"),
														
 
															         "sentimentScore": parsed.get("sentimentScore"),
														
 
															-        "keywords": parsed.get("keywords", []),
														
 
															+        "keywords": normalize_entities(_filter_entities(parsed.get("keywords", []))),
														
 
															     })
														
 
															     return out
														
--- a/news_mcp/entity_normalize.py
+++ b/news_mcp/entity_normalize.py
@@ -0,0 +1,54 @@
 
															+from __future__ import annotations
														
 
															+
														
 
															+import json
														
 
															+from functools import lru_cache
														
 
															+from pathlib import Path
														
 
															+from typing import Iterable
														
 
															+
														
 
															+from news_mcp.config import ENTITY_ALIASES_FILE
														
 
															+
														
 
															+# Small, explicit canonical alias map, loaded from ENTITY_ALIASES_FILE (JSON object: lowercased alias -> canonical name).
														
 
															+# Keep this conservative and grow it only when a shorthand is clearly useful.
														
 
															+@lru_cache(maxsize=1)
														
 
															+def _alias_map() -> dict[str, str]:
														
 
															+    path = Path(ENTITY_ALIASES_FILE)
														
 
															+    if not path.exists():
														
 
															+        return {}
														
 
															+    try:
														
 
															+        raw = json.loads(path.read_text(encoding="utf-8"))
														
 
															+    except Exception:
														
 
															+        return {}
														
 
															+    out: dict[str, str] = {}
														
 
															+    if isinstance(raw, dict):
														
 
															+        for k, v in raw.items():
														
 
															+            if k and v:
														
 
															+                out[str(k).strip().lower()] = str(v).strip()
														
 
															+    return out
														
 
															+
														
 
															+
														
 
															+def _lookup_alias(key: str) -> str | None:
														
 
															+    return _alias_map().get(key)
														
 
															+
														
 
															+
														
 
															+def normalize_entity(value: str) -> str:
														
 
															+    key = str(value).strip().lower()
														
 
															+    if not key:
														
 
															+        return ""
														
 
															+    return _lookup_alias(key) or str(value).strip()
														
 
															+
														
 
															+
														
 
															+def normalize_query(value: str) -> str:
														
 
															+    return normalize_entity(value)
														
 
															+
														
 
															+
														
 
															+def normalize_entities(values: Iterable[str]) -> list[str]:
														
 
															+    out: list[str] = []
														
 
															+    seen: set[str] = set()
														
 
															+    for value in values or []:
														
 
															+        norm = normalize_entity(value)
														
 
															+        key = norm.lower()
														
 
															+        if not norm or key in seen:
														
 
															+            continue
														
 
															+        seen.add(key)
														
 
															+        out.append(norm)
														
 
															+    return out
														
--- a/news_mcp/mcp_server_fastmcp.py
+++ b/news_mcp/mcp_server_fastmcp.py
@@ -10,6 +10,7 @@ from news_mcp.jobs.poller import refresh_clusters
 
															 from news_mcp.storage.sqlite_store import SQLiteClusterStore
														
 
															 from news_mcp.enrichment.llm_enrich import summarize_cluster_groq
														
 
															 from news_mcp.llm import active_llm_config
														
 
															+from news_mcp.entity_normalize import normalize_query
														
 
															 from collections import Counter
														
 
															 import logging
														
@@ -25,7 +26,7 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5):
 
															     limit = max(1, min(int(limit), 20))
														
 
															     # In v1, `topic` is a coarse category. If the caller passes an entity name
														
 
															     # (e.g. "trump"/"iran"), gracefully fall back to `other`.
														
 
															-    topic_norm = str(topic).strip().lower()
														
 
															+    topic_norm = normalize_query(topic).lower()
														
 
															     allowed = {t.lower() for t in DEFAULT_TOPICS}
														
 
															     if topic_norm not in allowed:
														
 
															         topic_norm = "other"
														
@@ -61,7 +62,7 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5):
 
															 @mcp.tool(description="What's happening with X? Filter latest clusters by extracted entity substring (case-insensitive).")
														
 
															 async def get_events_for_entity(entity: str, limit: int = 10):
														
 
															     limit = max(1, min(int(limit), 30))
														
 
															-    query = str(entity).strip().lower()
														
 
															+    query = normalize_query(entity).strip().lower()
														
 
															     if not query:
														
 
															         return []
														
@@ -189,7 +190,7 @@ async def detect_emerging_topics(limit: int = 10):
 
															 async def get_news_sentiment(entity: str, timeframe: str = "24h"):
														
 
															     store = SQLiteClusterStore(DB_PATH)
														
 
															-    ent = str(entity).strip().lower()
														
 
															+    ent = normalize_query(entity).strip().lower()
														
 
															     if not ent:
														
 
															         return {
														
 
															             "entity": entity,
														
@@ -222,7 +223,6 @@ async def get_news_sentiment(entity: str, timeframe: str = "24h"):
 
															         }
														
 
															     scores = []
														
 
															-    labels = []
														
 
															     for c in matched:
														
 
															         s = c.get("sentimentScore")
														
 
															         if s is not None:
														
@@ -230,21 +230,17 @@ async def get_news_sentiment(entity: str, timeframe: str = "24h"):
 
															                 scores.append(float(s))
														
 
															             except Exception:
														
 
															                 pass
														
 
															-        lbl = c.get("sentiment")
														
 
															-        if lbl:
														
 
															-            labels.append(str(lbl).lower())
														
 
															     avg_score = sum(scores) / len(scores) if scores else 0.0
														
 
															-    # Majority vote on sentiment label, fall back to sign of avg score.
														
 
															-    if labels:
														
 
															-        majority = Counter(labels).most_common(1)[0][0]
														
 
															-        if majority in {"positive", "negative", "neutral"}:
														
 
															-            sentiment = majority
														
 
															-        else:
														
 
															-            sentiment = "positive" if avg_score > 0 else "negative" if avg_score < 0 else "neutral"
														
 
															+    # Keep the label aligned with the numeric score.
														
 
															+    # Small magnitudes are treated as neutral to avoid noisy label flips.
														
 
															+    if avg_score >= 0.15:
														
 
															+        sentiment = "positive"
														
 
															+    elif avg_score <= -0.15:
														
 
															+        sentiment = "negative"
														
 
															     else:
														
 
															-        sentiment = "positive" if avg_score > 0 else "negative" if avg_score < 0 else "neutral"
														
 
															+        sentiment = "neutral"
														
 
															     return {
														
 
															         "entity": entity,
														
--- a/test_news_mcp.py
+++ b/test_news_mcp.py
@@ -6,7 +6,8 @@ from pathlib import Path
 
															 from news_mcp.dedup.cluster import dedup_and_cluster_articles
														
 
															 from news_mcp.storage.sqlite_store import SQLiteClusterStore
														
 
															 from news_mcp.enrichment.importance import compute_importance
														
 
															-from news_mcp.enrichment.llm_enrich import _filter_entities
														
 
															+from news_mcp.enrichment.llm_enrich import _filter_entities, _matches_blacklist
														
 
															+from news_mcp.entity_normalize import normalize_query, normalize_entities
														
 
															 from news_mcp.llm import build_extraction_prompt, call_llm, load_prompt
														
@@ -82,6 +83,22 @@ def test_blacklist_filters_entities_case_insensitively():
 
															     assert filtered == ["Reuters", "CoinDesk"]
														
 
															+def test_blacklist_supports_wildcards():
														
 
															+    assert _matches_blacklist("Bloomberg Economics", blacklist=["bloomberg*"])
														
 
															+    assert _matches_blacklist("bloomberg", blacklist=["*berg"])
														
 
															+    assert not _matches_blacklist("Reuters", blacklist=["bloomberg*"])
														
 
															+
														
 
															+
														
 
															+def test_query_normalization_keeps_common_shorthand_working():
														
 
															+    assert normalize_query("btc") == "Bitcoin"
														
 
															+    assert normalize_query("Trump") == "Donald Trump"
														
 
															+    assert normalize_query("nvidia") == "nvidia"
														
 
															+
														
 
															+
														
 
															+def test_entity_normalization_deduplicates_aliases():
														
 
															+    assert normalize_entities(["btc", "Bitcoin", "BTC", "Ethereum"]) == ["Bitcoin", "Ethereum"]
														
 
															+
														
 
															+
														
 
															 def test_load_prompt_reads_prompt_files():
														
 
															     text = load_prompt("extract_entities.prompt")
														
 
															     assert "Return STRICT JSON" in text