소스 검색

news-mcp: support multiple RSS feeds via NEWS_RSS_FEED_URLS

Lukas Goldschmidt 1 개월 전
부모
커밋
ad50051d62
3개의 변경된 파일, 40개의 추가 그리고 23개의 삭제
  1. 3 0
      news_mcp/config.py
  2. 5 2
      news_mcp/jobs/poller.py
  3. 32 21
      news_mcp/sources/rss_breakingthenews.py

+ 3 - 0
news_mcp/config.py

@@ -13,6 +13,9 @@ DATA_DIR.mkdir(parents=True, exist_ok=True)
 DB_PATH = Path(os.getenv("NEWS_MCP_DB_PATH", str(DATA_DIR / "news.sqlite")))
 
 RSS_FEED_URL = os.getenv("NEWS_RSS_FEED_URL", "https://breakingthenews.net/news-feed.xml")
+# Optional multi-feed mode: comma-separated RSS URLs.
+# If set (non-empty), this overrides RSS_FEED_URL.
+RSS_FEED_URLS = os.getenv("NEWS_RSS_FEED_URLS", "").strip()
 
 # Clusters TTL (hours)
 CLUSTERS_TTL_HOURS = float(os.getenv("NEWS_CLUSTERS_TTL_HOURS", "24"))

+ 5 - 2
news_mcp/jobs/poller.py

@@ -2,7 +2,7 @@ from __future__ import annotations
 
 from typing import Any, Dict
 
-from news_mcp.config import CLUSTERS_TTL_HOURS, DB_PATH
+from news_mcp.config import CLUSTERS_TTL_HOURS, DB_PATH, RSS_FEED_URL, RSS_FEED_URLS
 from news_mcp.dedup.cluster import dedup_and_cluster_articles
 from news_mcp.enrichment.enrich import enrich_cluster
 from news_mcp.enrichment.groq_enrich import classify_cluster_groq
@@ -19,7 +19,10 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
 
     # Skip expensive work if the feed content (titles/urls/timestamps) didn't change.
     import hashlib
-    feed_key = "breakingthenews"  # v1: single feed
+    rss_urls = [u.strip() for u in RSS_FEED_URLS.split(",") if u.strip()]
+    if not rss_urls:
+        rss_urls = [RSS_FEED_URL]
+    feed_key = "breakingthenews:" + hashlib.sha1(",".join(rss_urls).encode("utf-8")).hexdigest()
     material = "\n".join(
         f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
         for a in articles

+ 32 - 21
news_mcp/sources/rss_breakingthenews.py

@@ -5,7 +5,7 @@ from typing import Any, Dict, List
 
 import feedparser
 
-from news_mcp.config import RSS_FEED_URL
+from news_mcp.config import RSS_FEED_URL, RSS_FEED_URLS
 
 
 def _canonical_url(url: str) -> str:
@@ -14,28 +14,39 @@ def _canonical_url(url: str) -> str:
 
 
 def fetch_breakingthenews_articles(limit: int = 50) -> List[Dict[str, Any]]:
-    feed = feedparser.parse(RSS_FEED_URL)
+    rss_urls = [u.strip() for u in RSS_FEED_URLS.split(",") if u.strip()]
+    if not rss_urls:
+        rss_urls = [RSS_FEED_URL]
+
     articles: List[Dict[str, Any]] = []
 
-    for entry in feed.entries[:limit]:
-        title = str(getattr(entry, "title", "")).strip()
-        url = _canonical_url(str(getattr(entry, "link", "")).strip())
-        source = "BreakingTheNews"
-        timestamp = str(getattr(entry, "published", "")) or str(getattr(entry, "updated", ""))
-        summary = str(getattr(entry, "summary", "")) or str(getattr(entry, "description", ""))
-
-        if not title or not url:
-            continue
-
-        articles.append(
-            {
-                "title": title,
-                "url": url,
-                "source": source,
-                "timestamp": timestamp,
-                "summary": summary,
-            }
-        )
+    # Evenly pull from feeds; keep total below `limit`.
+    per_feed_limit = max(1, int(limit / max(1, len(rss_urls))))
+
+    for feed_url in rss_urls:
+        feed = feedparser.parse(feed_url)
+        for entry in feed.entries[:per_feed_limit]:
+            title = str(getattr(entry, "title", "")).strip()
+            url = _canonical_url(str(getattr(entry, "link", "")).strip())
+            source = "RSS"
+            timestamp = str(getattr(entry, "published", "")) or str(getattr(entry, "updated", ""))
+            summary = str(getattr(entry, "summary", "")) or str(getattr(entry, "description", ""))
+
+            if not title or not url:
+                continue
+
+            articles.append(
+                {
+                    "title": title,
+                    "url": url,
+                    "source": source,
+                    "timestamp": timestamp,
+                    "summary": summary,
+                }
+            )
+
+            if len(articles) >= limit:
+                return articles
 
     return articles