1 miesiąc temu · a4096b9dfb
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 
															 # 📰 News MCP Server
														
 
															-FastMCP-based MCP server that turns RSS into **deduplicated, enriched news clusters**.
														
 
															+FastMCP-based MCP server that turns news feeds into **deduplicated, enriched clusters**.
														
 
															 ## Quick start
														
@@ -18,7 +18,7 @@ Health:
 
															 - `http://127.0.0.1:8506/health`
														
 
															 ## What this server provides
														
 
															-- Fetches RSS from `https://breakingthenews.net/news-feed.xml`
														
 
															+- Fetches from one or more configured news feeds (`NEWS_FEED_URL` / `NEWS_FEED_URLS`)
														
 
															 - Deduplicates articles into clusters (v1 fuzzy title similarity)
														
 
															 - Enriches clusters with Groq (topic/entities/sentiment/keywords)
														
 
															 - Caches clusters + Groq fields in SQLite
														
@@ -37,11 +37,70 @@ Health:
 
															 4) `detect_emerging_topics(limit)`
														
 
															 - derives “emerging” signals from recent cached clusters
														
 
															+5) `get_news_sentiment(entity, timeframe)`
														
 
															+- aggregates sentiment around an entity from cached enriched clusters
														
 
															+
														
 
															 ## Configuration
														
 
															 See `news-mcp/.env`.
														
 
															 Key variables:
														
 
															 - `GROQ_API_KEY`, `GROQ_MODEL`, `GROQ_DEBUG`
														
 
															+- `NEWS_FEED_URL` (single feed fallback)
														
 
															+- `NEWS_FEED_URLS` (comma-separated feed URLs; overrides `NEWS_FEED_URL`)
														
 
															 - `NEWS_REFRESH_INTERVAL_SECONDS` (default 900)
														
 
															 - `NEWS_BACKGROUND_REFRESH_ON_START` (default true)
														
 
															+- `NEWS_BACKGROUND_REFRESH_ENABLED` (default true)
														
 
															 - `NEWS_CLUSTERS_TTL_HOURS`
														
 
															+- `GROQ_ENRICH_OTHER_ONLY` (default false; set true for cost control)
														
 
															+
														
 
															+## mcporter examples (all news-mcp calls)
														
 
															+
														
 
															+Point `CONFIG` at your mcporter config file (the path below is only an example; adjust it for your machine):
														
 
															+
														
 
															+```bash
														
 
															+CONFIG=/home/lucky/.openclaw/workspace/config/mcporter.json
														
 
															+```
														
 
															+
														
 
															+Inspect server + tools:
														
 
															+
														
 
															+```bash
														
 
															+mcporter --config "$CONFIG" list news --schema
														
 
															+```
														
 
															+
														
 
															+### 1) Latest events
														
 
															+
														
 
															+```bash
														
 
															+mcporter --config "$CONFIG" call news.get_latest_events topic=crypto limit=10
														
 
															+mcporter --config "$CONFIG" call news.get_latest_events topic=macro limit=5
														
 
															+```
														
 
															+
														
 
															+### 2) Events for an entity
														
 
															+
														
 
															+```bash
														
 
															+mcporter --config "$CONFIG" call news.get_events_for_entity entity=Bitcoin limit=10
														
 
															+mcporter --config "$CONFIG" call news.get_events_for_entity entity=ETH limit=10
														
 
															+mcporter --config "$CONFIG" call news.get_events_for_entity entity=ETF limit=10
														
 
															+```
														
 
															+
														
 
															+### 3) Event summary (by cluster_id)
														
 
															+
														
 
															+```bash
														
 
															+# First fetch an event id
														
 
															+mcporter --config "$CONFIG" call news.get_latest_events topic=crypto limit=1
														
 
															+
														
 
															+# Then summarize it
														
 
															+mcporter --config "$CONFIG" call news.get_event_summary event_id="<cluster_id>"
														
 
															+```
														
 
															+
														
 
															+### 4) Emerging topics
														
 
															+
														
 
															+```bash
														
 
															+mcporter --config "$CONFIG" call news.detect_emerging_topics limit=10
														
 
															+```
														
 
															+
														
 
															+### 5) Sentiment for an entity
														
 
															+
														
 
															+```bash
														
 
															+mcporter --config "$CONFIG" call news.get_news_sentiment entity=Bitcoin timeframe=24h
														
 
															+mcporter --config "$CONFIG" call news.get_news_sentiment entity=Ethereum timeframe=72h
														
 
															+```
														
--- a/killserver.sh
+++ b/killserver.sh
@@ -2,6 +2,20 @@
 
															 set -euo pipefail
														
 
															 PIDFILE=${PIDFILE:-server.pid}
														
 
															+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
														
 
															+
														
 
															+is_news_server() {
														
 
															+  local cmd="$1"
														
 
															+  case "$cmd" in
														
 
															+    *"news_mcp.mcp_server_fastmcp:app"*|*"run.sh"*|*"uvicorn"*)
														
 
															+      return 0
														
 
															+      ;;
														
 
															+    *)
														
 
															+      return 1
														
 
															+      ;;
														
 
															+  esac
														
 
															+}
														
 
															+
														
 
															 stop_pid() {
														
 
															   local pid="$1"
														
 
															   if [ -n "$pid" ] && ps -p "$pid" > /dev/null 2>&1; then
														
@@ -18,6 +32,20 @@ if [ -f "$PIDFILE" ]; then
 
															 fi
														
 
															 # Sweep up stale server processes started from this project.
														
 
															-for pid in $(pgrep -f 'news_mcp\.mcp_server_fastmcp:app|uvicorn .*news_mcp\.mcp_server_fastmcp:app' || true); do
														
 
															-  stop_pid "$pid"
														
 
															-done
														
 
															+while IFS= read -r line; do
														
 
															+  pid="${line%% *}"
														
 
															+  cmd="${line#* }"
														
 
															+  if is_news_server "$cmd"; then
														
 
															+    stop_pid "$pid"
														
 
															+  fi
														
 
															+done < <(pgrep -af 'news_mcp\.mcp_server_fastmcp:app|uvicorn|run\.sh' || true)
														
 
															+
														
 
															+# Extra safety: if we were started from the project dir, kill any lingering
														
 
															+# process that still has the app module in its command line.
														
 
															+while IFS= read -r line; do
														
 
															+  pid="${line%% *}"
														
 
															+  cmd="${line#* }"
														
 
															+  if [[ "$cmd" == *"$SCRIPT_DIR"* ]] && [[ "$cmd" == *"news_mcp.mcp_server_fastmcp:app"* ]]; then
														
 
															+    stop_pid "$pid"
														
 
															+  fi
														
 
															+done < <(pgrep -af 'news_mcp\.mcp_server_fastmcp:app' || true)
														
--- a/news_mcp/config.py
+++ b/news_mcp/config.py
@@ -12,10 +12,15 @@ DATA_DIR.mkdir(parents=True, exist_ok=True)
 
															 DB_PATH = Path(os.getenv("NEWS_MCP_DB_PATH", str(DATA_DIR / "news.sqlite")))
														
 
															-RSS_FEED_URL = os.getenv("NEWS_RSS_FEED_URL", "https://breakingthenews.net/news-feed.xml")
														
 
															-# Optional multi-feed mode: comma-separated RSS URLs.
														
 
															-# If set (non-empty), this overrides RSS_FEED_URL.
														
 
															-RSS_FEED_URLS = os.getenv("NEWS_RSS_FEED_URLS", "").strip()
														
 
															+# Primary feed URL; falls back to the legacy NEWS_RSS_FEED_URL env var for backward compatibility.
														
 
															+NEWS_FEED_URL = os.getenv("NEWS_FEED_URL", os.getenv("NEWS_RSS_FEED_URL", "https://breakingthenews.net/news-feed.xml"))
														
 
															+# Optional multi-feed mode: comma-separated feed URLs.
														
 
															+# If set (non-empty), this overrides NEWS_FEED_URL.
														
 
															+NEWS_FEED_URLS = os.getenv("NEWS_FEED_URLS", os.getenv("NEWS_RSS_FEED_URLS", "")).strip()
														
 
															+
														
 
															+# Legacy names kept for compatibility.
														
 
															+RSS_FEED_URL = NEWS_FEED_URL
														
 
															+RSS_FEED_URLS = NEWS_FEED_URLS
														
 
															 # Clusters TTL (hours)
														
 
															 CLUSTERS_TTL_HOURS = float(os.getenv("NEWS_CLUSTERS_TTL_HOURS", "24"))
														
@@ -27,8 +32,9 @@ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 
															 GROQ_MODEL = os.getenv("GROQ_MODEL", "llama4-16e")
														
 
															 GROQ_DEBUG = os.getenv("GROQ_DEBUG", "false").lower() == "true"
														
 
															-# Cost control: only enrich clusters whose heuristic topic is "other" by default.
														
 
															-GROQ_ENRICH_OTHER_ONLY = os.getenv("GROQ_ENRICH_OTHER_ONLY", "true").lower() == "true"
														
 
															+# Groq enrichment is the default for all incoming news.
														
 
															+# Set GROQ_ENRICH_OTHER_ONLY=true only if you want to restrict it for cost control.
														
 
															+GROQ_ENRICH_OTHER_ONLY = os.getenv("GROQ_ENRICH_OTHER_ONLY", "false").lower() == "true"
														
 
															 # Limit enriched clusters per refresh call.
														
 
															 GROQ_MAX_CLUSTERS_PER_REFRESH = int(os.getenv("GROQ_MAX_CLUSTERS_PER_REFRESH", "20"))
														
--- a/news_mcp/dedup/cluster.py
+++ b/news_mcp/dedup/cluster.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 
															 from typing import Any, Dict, List, Tuple
														
 
															-from news_mcp.sources.rss_breakingthenews import normalize_topic_from_title
														
 
															+from news_mcp.sources.news_feeds import normalize_topic_from_title
														
 
															 import re
														
 
															 from difflib import SequenceMatcher
														
--- a/news_mcp/jobs/poller.py
+++ b/news_mcp/jobs/poller.py
@@ -1,28 +1,32 @@
 
															 from __future__ import annotations
														
 
															+import logging
														
 
															 from typing import Any, Dict
														
 
															-from news_mcp.config import CLUSTERS_TTL_HOURS, DB_PATH, RSS_FEED_URL, RSS_FEED_URLS
														
 
															+from news_mcp.config import CLUSTERS_TTL_HOURS, DB_PATH, NEWS_FEED_URL, NEWS_FEED_URLS
														
 
															 from news_mcp.dedup.cluster import dedup_and_cluster_articles
														
 
															 from news_mcp.enrichment.enrich import enrich_cluster
														
 
															 from news_mcp.enrichment.groq_enrich import classify_cluster_groq
														
 
															-from news_mcp.sources.rss_breakingthenews import fetch_breakingthenews_articles
														
 
															+from news_mcp.sources.news_feeds import fetch_news_articles
														
 
															 from news_mcp.storage.sqlite_store import SQLiteClusterStore
														
 
															 from news_mcp.config import GROQ_ENRICH_OTHER_ONLY, GROQ_MAX_CLUSTERS_PER_REFRESH
														
 
															 async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
														
 
															+    logger = logging.getLogger("news_mcp.refresh")
														
 
															     store = SQLiteClusterStore(DB_PATH)
														
 
															-    articles = fetch_breakingthenews_articles(limit=limit)
														
 
															+    logger.info("refresh start topic=%s limit=%s", topic, limit)
														
 
															+    articles = fetch_news_articles(limit=limit)
														
 
															+    logger.info("refresh fetched articles=%s", len(articles))
														
 
															     # Skip expensive work if the feed content (titles/urls/timestamps) didn't change.
														
 
															     import hashlib
														
 
															-    rss_urls = [u.strip() for u in RSS_FEED_URLS.split(",") if u.strip()]
														
 
															+    rss_urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()]
														
 
															     if not rss_urls:
														
 
															-        rss_urls = [RSS_FEED_URL]
														
 
															-    feed_key = "breakingthenews:" + hashlib.sha1(",".join(rss_urls).encode("utf-8")).hexdigest()
														
 
															+        rss_urls = [NEWS_FEED_URL]
														
 
															+    feed_key = "newsfeeds:" + hashlib.sha1(",".join(rss_urls).encode("utf-8")).hexdigest()
														
 
															     material = "\n".join(
														
 
															         f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
														
 
															         for a in articles
														
@@ -30,9 +34,12 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
 
															     last_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
														
 
															     prev_hash = store.get_feed_hash(feed_key)
														
 
															     if prev_hash == last_hash:
														
 
															+        logger.info("refresh unchanged feed_key=%s topic=%s", feed_key, topic)
														
 
															         return
														
 
															+    logger.info("refresh changed feed_key=%s topic=%s", feed_key, topic)
														
 
															     store.set_feed_hash(feed_key, last_hash)
														
 
															     clustered_by_topic = dedup_and_cluster_articles(articles)
														
 
															+    logger.info("refresh clustered topics=%s", list(clustered_by_topic.keys()))
														
 
															     for t, clusters in clustered_by_topic.items():
														
 
															         if topic and t != topic:
														
@@ -62,5 +69,6 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
 
															             enriched.append(c2)
														
 
															         store.upsert_clusters(enriched, topic=t)
														
 
															+        logger.info("refresh stored topic=%s clusters=%s", t, len(enriched))
														
--- a/news_mcp/sources/news_feeds.py
+++ b/news_mcp/sources/news_feeds.py
@@ -0,0 +1,92 @@
 
															+from __future__ import annotations
														
 
															+
														
 
															+import hashlib
														
 
															+import logging
														
 
															+from typing import Any, Dict, List
														
 
															+
														
 
															+import feedparser
														
 
															+
														
 
															+from news_mcp.config import NEWS_FEED_URL, NEWS_FEED_URLS
														
 
															+
														
 
															+
														
 
															+logger = logging.getLogger(__name__)
														
 
															+
														
 
															+
														
 
															+def _canonical_url(url: str) -> str:
														
 
															+    # Minimal canonicalization for v1.
														
 
															+    return url.strip()
														
 
															+
														
 
															+
														
 
															+def _feed_urls() -> List[str]:
														
 
															+    urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()]
														
 
															+    if not urls:
														
 
															+        urls = [NEWS_FEED_URL]
														
 
															+    return urls
														
 
															+
														
 
															+
														
 
															+def fetch_news_articles(limit: int = 50) -> List[Dict[str, Any]]:
														
 
															+    feed_urls = _feed_urls()
														
 
															+    articles: List[Dict[str, Any]] = []
														
 
															+
														
 
															+    logger.info("news ingestion start feeds=%s limit=%s", len(feed_urls), limit)
														
 
															+
														
 
															+    # Evenly pull from feeds; keep total below `limit`.
														
 
															+    per_feed_limit = max(1, int(limit / max(1, len(feed_urls))))
														
 
															+
														
 
															+    for feed_url in feed_urls:
														
 
															+        feed = feedparser.parse(feed_url)
														
 
															+        feed_name = getattr(feed.feed, "title", None) or feed_url
														
 
															+        parsed_entries = len(getattr(feed, "entries", []) or [])
														
 
															+        logger.info("news feed parsed feed_url=%s feed_name=%s entries=%s", feed_url, feed_name, parsed_entries)
														
 
															+
														
 
															+        kept_before = len(articles)
														
 
															+        for entry in feed.entries[:per_feed_limit]:
														
 
															+            title = str(getattr(entry, "title", "")).strip()
														
 
															+            url = _canonical_url(str(getattr(entry, "link", "")).strip())
														
 
															+            timestamp = str(getattr(entry, "published", "")) or str(getattr(entry, "updated", ""))
														
 
															+            summary = str(getattr(entry, "summary", "")) or str(getattr(entry, "description", ""))
														
 
															+
														
 
															+            if not title or not url:
														
 
															+                continue
														
 
															+
														
 
															+            articles.append(
														
 
															+                {
														
 
															+                    "title": title,
														
 
															+                    "url": url,
														
 
															+                    "source": str(feed_name),
														
 
															+                    "feed_url": feed_url,
														
 
															+                    "timestamp": timestamp,
														
 
															+                    "summary": summary,
														
 
															+                }
														
 
															+            )
														
 
															+
														
 
															+            if len(articles) >= limit:
														
 
															+                logger.info("news ingestion limit reached feed_url=%s total_kept=%s", feed_url, len(articles))
														
 
															+                return articles
														
 
															+
														
 
															+        logger.info(
														
 
															+            "news feed completed feed_url=%s kept=%s",
														
 
															+            feed_url,
														
 
															+            len(articles) - kept_before,
														
 
															+        )
														
 
															+
														
 
															+    logger.info("news ingestion complete total_kept=%s", len(articles))
														
 
															+    return articles
														
 
															+
														
 
															+
														
 
															+def normalize_topic_from_title(title: str) -> str:
														
 
															+    t = title.lower()
														
 
															+    if any(k in t for k in ["btc", "bitcoin", "eth", "ethereum", "crypto"]):
														
 
															+        return "crypto"
														
 
															+    if any(k in t for k in ["rate", "rates", "inflation", "fed", "treasury", "euro"]):
														
 
															+        return "macro"
														
 
															+    if any(k in t for k in ["regulation", "sec", "ban", "law"]):
														
 
															+        return "regulation"
														
 
															+    if any(k in t for k in ["ai", "llm", "model", "openai", "anthropic"]):
														
 
															+        return "ai"
														
 
															+    return "other"
														
 
															+
														
 
															+
														
 
															+def cluster_id_for_title(topic: str, title: str) -> str:
														
 
															+    key = f"{topic}|{title.strip().lower()}"
														
 
															+    return hashlib.sha1(key.encode("utf-8")).hexdigest()
														
--- a/news_mcp/sources/rss_breakingthenews.py
+++ b/news_mcp/sources/rss_breakingthenews.py
@@ -1,69 +1,6 @@
 
															 from __future__ import annotations
														
 
															-import hashlib
														
 
															-from typing import Any, Dict, List
														
 
															+from news_mcp.sources.news_feeds import cluster_id_for_title, fetch_news_articles, normalize_topic_from_title
														
 
															-import feedparser
														
 
															-
														
 
															-from news_mcp.config import RSS_FEED_URL, RSS_FEED_URLS
														
 
															-
														
 
															-
														
 
															-def _canonical_url(url: str) -> str:
														
 
															-    # Minimal canonicalization for v1.
														
 
															-    return url.strip()
														
 
															-
														
 
															-
														
 
															-def fetch_breakingthenews_articles(limit: int = 50) -> List[Dict[str, Any]]:
														
 
															-    rss_urls = [u.strip() for u in RSS_FEED_URLS.split(",") if u.strip()]
														
 
															-    if not rss_urls:
														
 
															-        rss_urls = [RSS_FEED_URL]
														
 
															-
														
 
															-    articles: List[Dict[str, Any]] = []
														
 
															-
														
 
															-    # Evenly pull from feeds; keep total below `limit`.
														
 
															-    per_feed_limit = max(1, int(limit / max(1, len(rss_urls))))
														
 
															-
														
 
															-    for feed_url in rss_urls:
														
 
															-        feed = feedparser.parse(feed_url)
														
 
															-        for entry in feed.entries[:per_feed_limit]:
														
 
															-            title = str(getattr(entry, "title", "")).strip()
														
 
															-            url = _canonical_url(str(getattr(entry, "link", "")).strip())
														
 
															-            source = "RSS"
														
 
															-            timestamp = str(getattr(entry, "published", "")) or str(getattr(entry, "updated", ""))
														
 
															-            summary = str(getattr(entry, "summary", "")) or str(getattr(entry, "description", ""))
														
 
															-
														
 
															-            if not title or not url:
														
 
															-                continue
														
 
															-
														
 
															-            articles.append(
														
 
															-                {
														
 
															-                    "title": title,
														
 
															-                    "url": url,
														
 
															-                    "source": source,
														
 
															-                    "timestamp": timestamp,
														
 
															-                    "summary": summary,
														
 
															-                }
														
 
															-            )
														
 
															-
														
 
															-            if len(articles) >= limit:
														
 
															-                return articles
														
 
															-
														
 
															-    return articles
														
 
															-
														
 
															-
														
 
															-def normalize_topic_from_title(title: str) -> str:
														
 
															-    t = title.lower()
														
 
															-    if any(k in t for k in ["btc", "bitcoin", "eth", "ethereum", "crypto"]):
														
 
															-        return "crypto"
														
 
															-    if any(k in t for k in ["rate", "rates", "inflation", "fed", "treasury", "euro"]):
														
 
															-        return "macro"
														
 
															-    if any(k in t for k in ["regulation", "sec", "ban", "law"]):
														
 
															-        return "regulation"
														
 
															-    if any(k in t for k in ["ai", "llm", "model", "openai", "anthropic"]):
														
 
															-        return "ai"
														
 
															-    return "other"
														
 
															-
														
 
															-
														
 
															-def cluster_id_for_title(topic: str, title: str) -> str:
														
 
															-    key = f"{topic}|{title.strip().lower()}"
														
 
															-    return hashlib.sha1(key.encode("utf-8")).hexdigest()
														
 
															+# Backward-compatible aliases for older imports.
														
 
															+fetch_breakingthenews_articles = fetch_news_articles