@@ -4,6 +4,8 @@ import hashlib
 import logging
 import re
 from typing import Any, Dict, List
+from urllib.error import URLError, HTTPError
+from urllib.request import Request, urlopen
 
 import feedparser
 
@@ -13,6 +15,9 @@ from news_mcp.config import NEWS_FEED_URL, NEWS_FEED_URLS
 
 logger = logging.getLogger(__name__)
 
+FEED_FETCH_TIMEOUT_SECONDS = 15
+
+
 def _canonical_url(url: str) -> str:
     # Minimal canonicalization for v1.
     return url.strip()
@@ -34,20 +39,33 @@ def _feed_urls() -> List[str]:
     return urls
 
 
+def _fetch_feed(feed_url: str):
+    req = Request(feed_url, headers={"User-Agent": "news-mcp/1.0"})
+    with urlopen(req, timeout=FEED_FETCH_TIMEOUT_SECONDS) as resp:
+        return feedparser.parse(resp.read())
+
+
 def fetch_news_articles(limit: int = 50) -> List[Dict[str, Any]]:
     feed_urls = _feed_urls()
     articles: List[Dict[str, Any]] = []
 
-    logger.info("news ingestion start feeds=%s limit=%s", len(feed_urls), limit)
+    logger.info("news ingestion start feeds=%s limit=%s timeout_s=%s", len(feed_urls), limit, FEED_FETCH_TIMEOUT_SECONDS)
 
     # Evenly pull from feeds; keep total below `limit`.
     per_feed_limit = max(1, int(limit / max(1, len(feed_urls))))
 
     for feed_url in feed_urls:
-        feed = feedparser.parse(feed_url)
-        feed_name = getattr(feed.feed, "title", None) or feed_url
-        parsed_entries = len(getattr(feed, "entries", []) or [])
-        logger.info("news feed parsed feed_url=%s feed_name=%s entries=%s", feed_url, feed_name, parsed_entries)
+        try:
+            feed = _fetch_feed(feed_url)
+            feed_name = getattr(feed.feed, "title", None) or feed_url
+            parsed_entries = len(getattr(feed, "entries", []) or [])
+            logger.info("news feed parsed feed_url=%s feed_name=%s entries=%s", feed_url, feed_name, parsed_entries)
+        except (HTTPError, URLError, TimeoutError, OSError) as exc:
+            logger.exception("news feed fetch failed feed_url=%s error=%s", feed_url, exc)
+            continue
+        except Exception as exc:
+            logger.exception("news feed parse failed feed_url=%s error=%s", feed_url, exc)
+            continue
 
         kept_before = len(articles)
         for entry in feed.entries[:per_feed_limit]: