
Stabilize news refresh and health reporting

Lukas Goldschmidt 1 month ago
commit 0ae02f21ef
3 changed files with 28 additions and 6 deletions
  1. news_mcp/jobs/poller.py (+4 -1)
  2. news_mcp/mcp_server_fastmcp.py (+1 -0)
  3. news_mcp/sources/news_feeds.py (+23 -5)

+ 4 - 1
news_mcp/jobs/poller.py

@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+import asyncio
 import logging
+from datetime import datetime, timezone
 from typing import Any, Dict
 
 from news_mcp.config import DEFAULT_LOOKBACK_HOURS, DB_PATH, NEWS_FEED_URL, NEWS_FEED_URLS
@@ -25,7 +27,7 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
     store = SQLiteClusterStore(DB_PATH)
 
     logger.info("refresh start topic=%s limit=%s", topic, limit)
-    articles = fetch_news_articles(limit=limit)
+    articles = await asyncio.to_thread(fetch_news_articles, limit)
     logger.info("refresh fetched articles=%s", len(articles))
 
     # Skip expensive work if the feed content (titles/urls/timestamps) didn't change.
@@ -93,4 +95,5 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
         retention_days=NEWS_RETENTION_DAYS,
         interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
     )
+    store.set_meta("last_refresh_at", datetime.now(timezone.utc).isoformat())
    logger.info("refresh prune_result=%s", prune_result)
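
The switch to asyncio.to_thread keeps the poller's event loop responsive: fetch_news_articles does blocking network I/O, so awaiting it via the default thread pool lets other coroutines run while the fetch is in flight. A minimal sketch of the pattern, with slow_fetch as a hypothetical stand-in for the real fetch function:

    import asyncio
    import time

    def slow_fetch(limit: int) -> list:
        # Hypothetical stand-in for fetch_news_articles: blocking I/O + parsing.
        time.sleep(2)
        return ["article"] * limit

    async def refresh() -> None:
        # Runs slow_fetch on the default executor thread; the loop stays free.
        articles = await asyncio.to_thread(slow_fetch, 5)
        print(len(articles))

    asyncio.run(refresh())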

+ 1 - 0
news_mcp/mcp_server_fastmcp.py

@@ -750,6 +750,7 @@ def health():
         "status": "ok",
         "lookback_hours": DEFAULT_LOOKBACK_HOURS,
         "db": str(DB_PATH),
+        "last_refresh_at": store.get_meta("last_refresh_at"),
         "refresh": store.get_feed_state("breakingthenews"),
         "pruning": store.get_prune_state(
             pruning_enabled=NEWS_PRUNING_ENABLED,
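
The health payload now surfaces the last_refresh_at timestamp that the poller writes after each run. The store side of set_meta/get_meta is not part of this commit; a plausible sketch, assuming they wrap a small key-value table on the SQLite connection:

    import sqlite3

    def set_meta(conn: sqlite3.Connection, key: str, value: str) -> None:
        # Assumed schema: a single meta(key, value) table with upsert semantics.
        conn.execute("CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT)")
        conn.execute(
            "INSERT INTO meta (key, value) VALUES (?, ?) "
            "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
            (key, value),
        )
        conn.commit()

    def get_meta(conn: sqlite3.Connection, key: str) -> str | None:
        row = conn.execute("SELECT value FROM meta WHERE key = ?", (key,)).fetchone()
        return row[0] if row else None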

+ 23 - 5
news_mcp/sources/news_feeds.py

@@ -4,6 +4,8 @@ import hashlib
 import logging
 import re
 from typing import Any, Dict, List
+from urllib.error import URLError, HTTPError
+from urllib.request import Request, urlopen
 
 import feedparser
 
@@ -13,6 +15,9 @@ from news_mcp.config import NEWS_FEED_URL, NEWS_FEED_URLS
 logger = logging.getLogger(__name__)
 
 
+FEED_FETCH_TIMEOUT_SECONDS = 15
+
+
 def _canonical_url(url: str) -> str:
     # Minimal canonicalization for v1.
     return url.strip()
@@ -34,20 +39,33 @@ def _feed_urls() -> List[str]:
     return urls
 
 
+def _fetch_feed(feed_url: str):
+    req = Request(feed_url, headers={"User-Agent": "news-mcp/1.0"})
+    with urlopen(req, timeout=FEED_FETCH_TIMEOUT_SECONDS) as resp:
+        return feedparser.parse(resp.read())
+
+
 def fetch_news_articles(limit: int = 50) -> List[Dict[str, Any]]:
     feed_urls = _feed_urls()
     articles: List[Dict[str, Any]] = []
 
-    logger.info("news ingestion start feeds=%s limit=%s", len(feed_urls), limit)
+    logger.info("news ingestion start feeds=%s limit=%s timeout_s=%s", len(feed_urls), limit, FEED_FETCH_TIMEOUT_SECONDS)
 
     # Evenly pull from feeds; keep total below `limit`.
     per_feed_limit = max(1, int(limit / max(1, len(feed_urls))))
 
     for feed_url in feed_urls:
-        feed = feedparser.parse(feed_url)
-        feed_name = getattr(feed.feed, "title", None) or feed_url
-        parsed_entries = len(getattr(feed, "entries", []) or [])
-        logger.info("news feed parsed feed_url=%s feed_name=%s entries=%s", feed_url, feed_name, parsed_entries)
+        try:
+            feed = _fetch_feed(feed_url)
+            feed_name = getattr(feed.feed, "title", None) or feed_url
+            parsed_entries = len(getattr(feed, "entries", []) or [])
+            logger.info("news feed parsed feed_url=%s feed_name=%s entries=%s", feed_url, feed_name, parsed_entries)
+        except (HTTPError, URLError, TimeoutError, OSError) as exc:
+            logger.exception("news feed fetch failed feed_url=%s error=%s", feed_url, exc)
+            continue
+        except Exception as exc:
+            logger.exception("news feed parse failed feed_url=%s error=%s", feed_url, exc)
+            continue
 
         kept_before = len(articles)
        for entry in feed.entries[:per_feed_limit]:
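
A note on why the fetch was rewritten: feedparser.parse(feed_url) downloads the URL itself and exposes no per-call timeout, so one stalled feed could hang the entire refresh. Prefetching the bytes through urlopen with an explicit timeout bounds each request, and the per-feed try/except turns a broken feed into a logged skip instead of an aborted run. A hedged usage sketch (the URL is illustrative, not one of the project's configured feeds):

    import feedparser
    from urllib.error import URLError
    from urllib.request import Request, urlopen

    FEED_FETCH_TIMEOUT_SECONDS = 15

    def _fetch_feed(feed_url: str):
        # Bound the network fetch, then let feedparser parse the downloaded bytes.
        req = Request(feed_url, headers={"User-Agent": "news-mcp/1.0"})
        with urlopen(req, timeout=FEED_FETCH_TIMEOUT_SECONDS) as resp:
            return feedparser.parse(resp.read())

    try:
        feed = _fetch_feed("https://example.com/feed.xml")  # illustrative URL
        print(getattr(feed.feed, "title", None), len(feed.entries))
    except (URLError, TimeoutError, OSError) as exc:
        # Mirrors the logger.exception(...) + continue path in the diff above.
        print(f"skipped feed: {exc}")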