Эх сурвалжийг харах

news-mcp: improve detect_emerging_topics with importance-weighting

Lukas Goldschmidt 1 сар өмнө
parent
commit
c3d59ba466

+ 26 - 1
news_mcp/mcp_server_fastmcp.py

@@ -254,15 +254,35 @@ async def detect_emerging_topics(limit: int = 10):
     import re
 
     entity_counts = Counter()
+    entity_importance_sum = Counter()
     phrase_counts = Counter()
     topic_counts = Counter()
 
+    # Very light heuristics to reduce "meta entities" dominating emerging topics.
+    # Keep it conservative: only skip obvious boilerplate.
+    def _is_generic_entity(ent: str) -> bool:
+        e = str(ent).strip().lower()
+        if not e:
+            return True
+        if len(e) < 4:
+            return True
+        # common outlet-ish / meta-ish tokens
+        if e in {"news", "latest", "breaking"}:
+            return True
+        return False
+
     for c in clusters:
         topic_counts[c.get("topic", "other")] += 1
         for ent in c.get("entities", []) or []:
+            if _is_generic_entity(ent):
+                continue
             key = str(ent).strip().lower()
             if key:
                 entity_counts[key] += 1
+                try:
+                    entity_importance_sum[key] += float(c.get("importance", 0.0) or 0.0)
+                except Exception:
+                    pass
 
         text = f"{c.get('headline','')} {c.get('summary','')}"
         words = [w for w in re.findall(r"[A-Za-z][A-Za-z0-9\-]{2,}", text.lower())]
@@ -272,13 +292,18 @@ async def detect_emerging_topics(limit: int = 10):
                 phrase_counts[phrase] += 1
 
     emerging = []
+    # Combine frequency with average importance so "big signal" rises over pure repetition.
     for ent, count in entity_counts.most_common(limit):
+        avg_imp = entity_importance_sum[ent] / max(1, count)
+        # avg_imp is typically 0..~1; keep score bounded.
+        trend_score = 0.25 + 0.40 * min(1.0, avg_imp) + 0.08 * min(6.0, float(count))
         emerging.append({
             "topic": ent,
-            "trend_score": min(0.99, round(0.25 + 0.15 * count, 2)),
+            "trend_score": min(0.99, round(trend_score, 2)),
             "related_entities": [ent],
             "signal_type": "entity",
             "count": count,
+            "avg_importance": round(avg_imp, 3),
         })
 
     for phrase, count in phrase_counts.most_common(limit * 2):