|
@@ -1,28 +1,32 @@
|
|
|
from __future__ import annotations
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
+import logging
|
|
|
from typing import Any, Dict
|
|
from typing import Any, Dict
|
|
|
|
|
|
|
|
-from news_mcp.config import CLUSTERS_TTL_HOURS, DB_PATH, RSS_FEED_URL, RSS_FEED_URLS
|
|
|
|
|
|
|
+from news_mcp.config import CLUSTERS_TTL_HOURS, DB_PATH, NEWS_FEED_URL, NEWS_FEED_URLS
|
|
|
from news_mcp.dedup.cluster import dedup_and_cluster_articles
|
|
from news_mcp.dedup.cluster import dedup_and_cluster_articles
|
|
|
from news_mcp.enrichment.enrich import enrich_cluster
|
|
from news_mcp.enrichment.enrich import enrich_cluster
|
|
|
from news_mcp.enrichment.groq_enrich import classify_cluster_groq
|
|
from news_mcp.enrichment.groq_enrich import classify_cluster_groq
|
|
|
-from news_mcp.sources.rss_breakingthenews import fetch_breakingthenews_articles
|
|
|
|
|
|
|
+from news_mcp.sources.news_feeds import fetch_news_articles
|
|
|
from news_mcp.storage.sqlite_store import SQLiteClusterStore
|
|
from news_mcp.storage.sqlite_store import SQLiteClusterStore
|
|
|
|
|
|
|
|
from news_mcp.config import GROQ_ENRICH_OTHER_ONLY, GROQ_MAX_CLUSTERS_PER_REFRESH
|
|
from news_mcp.config import GROQ_ENRICH_OTHER_ONLY, GROQ_MAX_CLUSTERS_PER_REFRESH
|
|
|
|
|
|
|
|
|
|
|
|
|
async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
|
|
async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
|
|
|
|
|
+ logger = logging.getLogger("news_mcp.refresh")
|
|
|
store = SQLiteClusterStore(DB_PATH)
|
|
store = SQLiteClusterStore(DB_PATH)
|
|
|
|
|
|
|
|
- articles = fetch_breakingthenews_articles(limit=limit)
|
|
|
|
|
|
|
+ logger.info("refresh start topic=%s limit=%s", topic, limit)
|
|
|
|
|
+ articles = fetch_news_articles(limit=limit)
|
|
|
|
|
+ logger.info("refresh fetched articles=%s", len(articles))
|
|
|
|
|
|
|
|
# Skip expensive work if the feed content (titles/urls/timestamps) didn't change.
|
|
# Skip expensive work if the feed content (titles/urls/timestamps) didn't change.
|
|
|
import hashlib
|
|
import hashlib
|
|
|
- rss_urls = [u.strip() for u in RSS_FEED_URLS.split(",") if u.strip()]
|
|
|
|
|
|
|
+ rss_urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()]
|
|
|
if not rss_urls:
|
|
if not rss_urls:
|
|
|
- rss_urls = [RSS_FEED_URL]
|
|
|
|
|
- feed_key = "breakingthenews:" + hashlib.sha1(",".join(rss_urls).encode("utf-8")).hexdigest()
|
|
|
|
|
|
|
+ rss_urls = [NEWS_FEED_URL]
|
|
|
|
|
+ feed_key = "newsfeeds:" + hashlib.sha1(",".join(rss_urls).encode("utf-8")).hexdigest()
|
|
|
material = "\n".join(
|
|
material = "\n".join(
|
|
|
f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
|
|
f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
|
|
|
for a in articles
|
|
for a in articles
|
|
@@ -30,9 +34,12 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
|
|
|
last_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
|
|
last_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
|
|
|
prev_hash = store.get_feed_hash(feed_key)
|
|
prev_hash = store.get_feed_hash(feed_key)
|
|
|
if prev_hash == last_hash:
|
|
if prev_hash == last_hash:
|
|
|
|
|
+ logger.info("refresh unchanged feed_key=%s topic=%s", feed_key, topic)
|
|
|
return
|
|
return
|
|
|
|
|
+ logger.info("refresh changed feed_key=%s topic=%s", feed_key, topic)
|
|
|
store.set_feed_hash(feed_key, last_hash)
|
|
store.set_feed_hash(feed_key, last_hash)
|
|
|
clustered_by_topic = dedup_and_cluster_articles(articles)
|
|
clustered_by_topic = dedup_and_cluster_articles(articles)
|
|
|
|
|
+ logger.info("refresh clustered topics=%s", list(clustered_by_topic.keys()))
|
|
|
|
|
|
|
|
for t, clusters in clustered_by_topic.items():
|
|
for t, clusters in clustered_by_topic.items():
|
|
|
if topic and t != topic:
|
|
if topic and t != topic:
|
|
@@ -62,5 +69,6 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
|
|
|
enriched.append(c2)
|
|
enriched.append(c2)
|
|
|
|
|
|
|
|
store.upsert_clusters(enriched, topic=t)
|
|
store.upsert_clusters(enriched, topic=t)
|
|
|
|
|
+ logger.info("refresh stored topic=%s clusters=%s", t, len(enriched))
|
|
|
|
|
|
|
|
|
|
|