Ver Fonte

Release v0.1.0

Lukas Goldschmidt há 1 mês atrás
pai
commit
64bf700047

+ 1 - 0
.env.example

@@ -28,3 +28,4 @@ NEWS_REFRESH_INTERVAL_SECONDS=900
 NEWS_BACKGROUND_REFRESH_ENABLED=true
 NEWS_BACKGROUND_REFRESH_ON_START=true
 NEWS_PROMPTS_DIR=
+NEWS_ENTITY_ALIASES_FILE=

+ 34 - 0
OUTLOOK.md

@@ -388,3 +388,37 @@ But only if you:
 * cluster events
 * compress information
 
+---
+
+# ✅ Completed since this outlook was written
+
+* provider-agnostic LLM extraction/summarization layer added
+* prompts moved into separate files for easier updates
+* entity blacklist implemented and made case-insensitive
+* live extraction smoke test added
+* docs updated with the new env vars and workflow
+
+---
+
+# 🔭 Next high-level steps
+
+1. **Normalization layer**
+
+   * canonicalize acronyms and entity variants before storage / querying
+   * keep the blacklist as a separate post-processing rule
+
+2. **Wildcard blacklist support**
+
+   * allow patterns for entities / topics / keywords
+   * keep matching case-insensitive
+
+3. **Emerging signal quality**
+
+   * tune what counts as an emerging topic/entity
+   * reduce noise from repeated source names and generic terms
+
+4. **Entity/time tracking and replay (future capability)**
+
+   * track how important entities evolve over time
+   * allow replay of when entities first appeared, how topics shifted, and how sentiment changed
+   * useful later for narrative reconstruction and trend timelines

+ 1 - 0
README.md

@@ -50,6 +50,7 @@ Key variables:
 - `GROQ_API_KEY`, `OPENAI_API_KEY`
 - `ENTITY_BLACKLIST` (comma-separated, case-insensitive entity match; supports `*` wildcard patterns)
 - `NEWS_PROMPTS_DIR` (override prompt directory)
+- `NEWS_ENTITY_ALIASES_FILE` (override entity alias JSON file)
 - `NEWS_FEED_URL` (single feed fallback)
 - `NEWS_FEED_URLS` (comma-separated feed URLs; overrides `NEWS_FEED_URL`)
 - `NEWS_REFRESH_INTERVAL_SECONDS` (default 900)

+ 16 - 0
config/entity_aliases.json

@@ -0,0 +1,16 @@
+{
+  "btc": "Bitcoin",
+  "bitcoin": "Bitcoin",
+  "eth": "Ethereum",
+  "ethereum": "Ethereum",
+  "fed": "Federal Reserve",
+  "federal reserve": "Federal Reserve",
+  "ecb": "European Central Bank",
+  "european central bank": "European Central Bank",
+  "eu": "European Union",
+  "european union": "European Union",
+  "trump": "Donald Trump",
+  "donald trump": "Donald Trump",
+  "merz": "Friedrich Merz",
+  "friedrich merz": "Friedrich Merz"
+}

+ 1 - 0
news_mcp/config.py

@@ -10,6 +10,7 @@ DATA_DIR = Path(os.getenv("NEWS_MCP_DATA_DIR", Path(__file__).resolve().parent /
 DATA_DIR.mkdir(parents=True, exist_ok=True)
 DB_PATH = Path(os.getenv("NEWS_MCP_DB_PATH", str(DATA_DIR / "news.sqlite")))
 PROMPTS_DIR = Path(os.getenv("NEWS_PROMPTS_DIR", str(_HERE / "prompts")))
+ENTITY_ALIASES_FILE = Path(os.getenv("NEWS_ENTITY_ALIASES_FILE", str(_HERE / "config" / "entity_aliases.json")))
 
 NEWS_FEED_URL = os.getenv("NEWS_FEED_URL", os.getenv("NEWS_RSS_FEED_URL", "https://breakingthenews.net/news-feed.xml"))
 NEWS_FEED_URLS = os.getenv("NEWS_FEED_URLS", os.getenv("NEWS_RSS_FEED_URLS", "")).strip()

+ 0 - 194
news_mcp/enrichment/groq_enrich.py

@@ -1,194 +0,0 @@
-from __future__ import annotations
-
-import json
-import logging
-from typing import Any, Dict, List
-
-import httpx
-
-from news_mcp.config import GROQ_API_KEY, GROQ_MODEL, GROQ_DEBUG
-
-
-logger = logging.getLogger(__name__)
-
-
-_SYSTEM = "You are a news signal extraction engine. Return STRICT JSON only."
-
-
-def _build_prompt(articles: List[Dict[str, Any]], headline: str, summary: str | None) -> str:
-    # Keep prompt compact: clusters already deduped.
-    sample = articles[:6]
-    return json.dumps(
-        {
-            "cluster": {
-                "headline": headline,
-                "summary": summary or "",
-                "articles": [
-                    {
-                        "title": a.get("title"),
-                        "url": a.get("url"),
-                        "source": a.get("source"),
-                        "timestamp": a.get("timestamp"),
-                        "summary": a.get("summary", ""),
-                    }
-                    for a in sample
-                ],
-            }
-        },
-        ensure_ascii=False,
-    )
-
-
-async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
-    if not GROQ_API_KEY:
-        # No enrichment configured.
-        return cluster
-
-    headline = cluster.get("headline", "")
-    summary = cluster.get("summary", "")
-    articles = cluster.get("articles", [])
-
-    user_payload = _build_prompt(articles=articles, headline=headline, summary=summary)
-
-    prompt = (
-        f"Input cluster JSON:\n{user_payload}\n\n"
-        "You MUST extract a news signal from the headline AND summary. Do not leave entities empty when the text mentions obvious names.\n"
-        "Task:\n"
-        "1) infer the best top-level topic\n"
-        "2) extract concise entities from the cluster\n"
-        "3) assign sentiment from the wording/context\n"
-        "4) provide short keywords that justify the classification\n\n"
-        "Entity rules (strict):\n"
-        "- Use short strings (1-5 words).\n"
-        "- Include all obvious named entities mentioned in headline or summary: people, countries, regions, organizations, ministries, presidents, leaders, wars/conflicts if named.\n"
-        "- Also include finance/crypto entities when present: BTC, ETH, Bitcoin, Ethereum, ETF, SEC, ECB, Fed, euro, inflation, rates.\n"
-        "- If the cluster mentions Iran, UAE, Egypt, Germany, Europe, Trump, Merz, Sisi, those should appear in entities.\n"
-        "- Do NOT return empty entities if any such names/places appear.\n\n"
-        "Sentiment rules:\n"
-        "- positive: clearly encouraging, improving, or supportive tone\n"
-        "- negative: clearly alarming, worsening, severe, conflict, loss, risk, warning tone\n"
-        "- neutral: factual, balanced, or mixed\n"
-        "- sentimentScore must be a number from -1.0 to 1.0 and should reflect the sentiment label.\n\n"
-        "Return STRICT JSON with EXACT keys only:\n"
-        "{ topic, entities, sentiment, sentimentScore, keywords }\n"
-        "where topic is one of [crypto, macro, regulation, ai, other].\n"
-    )
-
-    if GROQ_DEBUG:
-        msg = f"[GROQ PROMPT] {prompt}"
-        logger.warning(msg)
-        print(msg, flush=True)
-
-    req = {
-        "model": GROQ_MODEL,
-        "messages": [
-            {"role": "system", "content": _SYSTEM},
-            {"role": "user", "content": prompt},
-        ],
-        "temperature": 0.2,
-        "response_format": {"type": "json_object"},
-    }
-
-    async with httpx.AsyncClient(timeout=30.0) as client:
-        resp = await client.post(
-            "https://api.groq.com/openai/v1/chat/completions",
-            headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
-            json=req,
-        )
-        resp.raise_for_status()
-        data = resp.json()
-
-    content = data["choices"][0]["message"]["content"]
-
-    if GROQ_DEBUG:
-        msg = f"[GROQ RAW RESPONSE] {content}"
-        logger.warning(msg)
-        print(msg, flush=True)
-
-    parsed = json.loads(content)
-
-    # Normalize output types into our cluster shape.
-    topic = parsed.get("topic") or cluster.get("topic")
-    entities = parsed.get("entities") or []
-    sentiment = parsed.get("sentiment") or "neutral"
-    sentiment_score = parsed.get("sentimentScore")
-    keywords = parsed.get("keywords") or []
-
-    out = dict(cluster)
-    if topic:
-        out["topic"] = topic
-    out["entities"] = entities
-    out["sentiment"] = sentiment
-    if sentiment_score is not None:
-        out["sentimentScore"] = float(sentiment_score)
-    out["keywords"] = keywords
-    return out
-
-
-async def summarize_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
-    """Produce a compact agent-facing summary.
-
-    Returns:
-      {
-        "headline": str,
-        "mergedSummary": str,
-        "keyFacts": [str,...],
-        "sources": [str,...]
-      }
-    """
-    if not GROQ_API_KEY:
-        return {
-            "headline": cluster.get("headline"),
-            "mergedSummary": cluster.get("summary"),
-            "keyFacts": [],
-            "sources": cluster.get("sources", []),
-        }
-
-    headline = cluster.get("headline", "")
-    summary = cluster.get("summary", "")
-    articles = cluster.get("articles", [])
-
-    sample = articles[:5]
-    req = {
-        "model": GROQ_MODEL,
-        "messages": [
-            {
-                "role": "system",
-                "content": "You are a summarization engine for news clusters. Return strict JSON only.",
-            },
-            {
-                "role": "user",
-                "content": json.dumps(
-                    {
-                        "headline": headline,
-                        "summary": summary,
-                        "articles": [
-                            {
-                                "title": a.get("title"),
-                                "url": a.get("url"),
-                                "source": a.get("source"),
-                                "timestamp": a.get("timestamp"),
-                            }
-                            for a in sample
-                        ],
-                    },
-                    ensure_ascii=False,
-                )
-                + "\n\nReturn keys: headline, mergedSummary, keyFacts (5-8 strings), sources. mergedSummary should be 2-4 sentences.",
-            },
-        ],
-        "temperature": 0.2,
-        "response_format": {"type": "json_object"},
-    }
-
-    async with httpx.AsyncClient(timeout=45.0) as client:
-        resp = await client.post(
-            "https://api.groq.com/openai/v1/chat/completions",
-            headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
-            json=req,
-        )
-        resp.raise_for_status()
-        data = resp.json()
-    content = data["choices"][0]["message"]["content"]
-    parsed = json.loads(content)
-    return parsed

+ 17 - 6
news_mcp/enrichment/llm_enrich.py

@@ -1,17 +1,25 @@
 from __future__ import annotations
 
+from fnmatch import fnmatchcase
 from typing import Any, Dict
 
 from news_mcp.config import NEWS_ENTITY_BLACKLIST
+from news_mcp.entity_normalize import normalize_entities
 from news_mcp.llm import call_extraction, call_summary
 
 
+def _matches_blacklist(value: str, blacklist=None) -> bool:
+    patterns = [x.strip().lower() for x in (blacklist if blacklist is not None else NEWS_ENTITY_BLACKLIST) if x and x.strip()]
+    key = str(value).strip().lower()
+    if not key:
+        return True
+    return any(fnmatchcase(key, pattern) for pattern in patterns)
+
+
 def _filter_entities(entities, blacklist=None):
-    banned = set(x.strip().lower() for x in (blacklist if blacklist is not None else NEWS_ENTITY_BLACKLIST))
     out = []
     for ent in entities or []:
-        key = str(ent).strip().lower()
-        if not key or key in banned:
+        if _matches_blacklist(ent, blacklist=blacklist):
             continue
         out.append(ent)
     return out
@@ -20,12 +28,15 @@ def _filter_entities(entities, blacklist=None):
 async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
     parsed = await call_extraction(cluster)
     out = dict(cluster)
+    topic = parsed.get("topic", cluster.get("topic"))
+    if topic and _matches_blacklist(topic):
+        topic = "other"
     out.update({
-        "topic": parsed.get("topic", cluster.get("topic")),
-        "entities": _filter_entities(parsed.get("entities", [])),
+        "topic": topic,
+        "entities": normalize_entities(_filter_entities(parsed.get("entities", []))),
         "sentiment": parsed.get("sentiment", "neutral"),
         "sentimentScore": parsed.get("sentimentScore"),
-        "keywords": parsed.get("keywords", []),
+        "keywords": normalize_entities(_filter_entities(parsed.get("keywords", []))),
     })
     return out
 

+ 54 - 0
news_mcp/entity_normalize.py

@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import json
+from functools import lru_cache
+from pathlib import Path
+from typing import Iterable
+
+from news_mcp.config import ENTITY_ALIASES_FILE
+
+# Canonical alias map loaded from the JSON file at ENTITY_ALIASES_FILE.
+# Keep the file conservative and grow it only when a shorthand is clearly useful.
+@lru_cache(maxsize=1)
+def _alias_map() -> dict[str, str]:
+    path = Path(ENTITY_ALIASES_FILE)
+    if not path.exists():
+        return {}
+    try:
+        raw = json.loads(path.read_text(encoding="utf-8"))
+    except Exception:
+        return {}
+    out: dict[str, str] = {}
+    if isinstance(raw, dict):
+        for k, v in raw.items():
+            if k and v:
+                out[str(k).strip().lower()] = str(v).strip()
+    return out
+
+
+def _lookup_alias(key: str) -> str | None:
+    return _alias_map().get(key)
+
+
+def normalize_entity(value: str) -> str:
+    key = str(value).strip().lower()
+    if not key:
+        return ""
+    return _lookup_alias(key) or str(value).strip()
+
+
+def normalize_query(value: str) -> str:
+    return normalize_entity(value)
+
+
+def normalize_entities(values: Iterable[str]) -> list[str]:
+    out: list[str] = []
+    seen: set[str] = set()
+    for value in values or []:
+        norm = normalize_entity(value)
+        key = norm.lower()
+        if not norm or key in seen:
+            continue
+        seen.add(key)
+        out.append(norm)
+    return out

+ 11 - 15
news_mcp/mcp_server_fastmcp.py

@@ -10,6 +10,7 @@ from news_mcp.jobs.poller import refresh_clusters
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
 from news_mcp.enrichment.llm_enrich import summarize_cluster_groq
 from news_mcp.llm import active_llm_config
+from news_mcp.entity_normalize import normalize_query
 from collections import Counter
 import logging
 
@@ -25,7 +26,7 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5):
     limit = max(1, min(int(limit), 20))
     # In v1, `topic` is a coarse category. If the caller passes an entity name
     # (e.g. "trump"/"iran"), gracefully fall back to `other`.
-    topic_norm = str(topic).strip().lower()
+    topic_norm = normalize_query(topic).lower()
     allowed = {t.lower() for t in DEFAULT_TOPICS}
     if topic_norm not in allowed:
         topic_norm = "other"
@@ -61,7 +62,7 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5):
 @mcp.tool(description="What's happening with X? Filter latest clusters by extracted entity substring (case-insensitive).")
 async def get_events_for_entity(entity: str, limit: int = 10):
     limit = max(1, min(int(limit), 30))
-    query = str(entity).strip().lower()
+    query = normalize_query(entity).strip().lower()
     if not query:
         return []
 
@@ -189,7 +190,7 @@ async def detect_emerging_topics(limit: int = 10):
 async def get_news_sentiment(entity: str, timeframe: str = "24h"):
     store = SQLiteClusterStore(DB_PATH)
 
-    ent = str(entity).strip().lower()
+    ent = normalize_query(entity).strip().lower()
     if not ent:
         return {
             "entity": entity,
@@ -222,7 +223,6 @@ async def get_news_sentiment(entity: str, timeframe: str = "24h"):
         }
 
     scores = []
-    labels = []
     for c in matched:
         s = c.get("sentimentScore")
         if s is not None:
@@ -230,21 +230,17 @@ async def get_news_sentiment(entity: str, timeframe: str = "24h"):
                 scores.append(float(s))
             except Exception:
                 pass
-        lbl = c.get("sentiment")
-        if lbl:
-            labels.append(str(lbl).lower())
 
     avg_score = sum(scores) / len(scores) if scores else 0.0
 
-    # Majority vote on sentiment label, fall back to sign of avg score.
-    if labels:
-        majority = Counter(labels).most_common(1)[0][0]
-        if majority in {"positive", "negative", "neutral"}:
-            sentiment = majority
-        else:
-            sentiment = "positive" if avg_score > 0 else "negative" if avg_score < 0 else "neutral"
+    # Keep the label aligned with the numeric score.
+    # Small magnitudes are treated as neutral to avoid noisy label flips.
+    if avg_score >= 0.15:
+        sentiment = "positive"
+    elif avg_score <= -0.15:
+        sentiment = "negative"
     else:
-        sentiment = "positive" if avg_score > 0 else "negative" if avg_score < 0 else "neutral"
+        sentiment = "neutral"
 
     return {
         "entity": entity,

+ 18 - 1
test_news_mcp.py

@@ -6,7 +6,8 @@ from pathlib import Path
 from news_mcp.dedup.cluster import dedup_and_cluster_articles
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
 from news_mcp.enrichment.importance import compute_importance
-from news_mcp.enrichment.llm_enrich import _filter_entities
+from news_mcp.enrichment.llm_enrich import _filter_entities, _matches_blacklist
+from news_mcp.entity_normalize import normalize_query, normalize_entities
 from news_mcp.llm import build_extraction_prompt, call_llm, load_prompt
 
 
@@ -82,6 +83,22 @@ def test_blacklist_filters_entities_case_insensitively():
     assert filtered == ["Reuters", "CoinDesk"]
 
 
+def test_blacklist_supports_wildcards():
+    assert _matches_blacklist("Bloomberg Economics", blacklist=["bloomberg*"])
+    assert _matches_blacklist("bloomberg", blacklist=["*berg"])
+    assert not _matches_blacklist("Reuters", blacklist=["bloomberg*"])
+
+
+def test_query_normalization_keeps_common_shorthand_working():
+    assert normalize_query("btc") == "Bitcoin"
+    assert normalize_query("Trump") == "Donald Trump"
+    assert normalize_query("nvidia") == "nvidia"
+
+
+def test_entity_normalization_deduplicates_aliases():
+    assert normalize_entities(["btc", "Bitcoin", "BTC", "Ethereum"]) == ["Bitcoin", "Ethereum"]
+
+
 def test_load_prompt_reads_prompt_files():
     text = load_prompt("extract_entities.prompt")
     assert "Return STRICT JSON" in text