소스 검색

detect_emerging_topics: related_entities via co-occurrence

Lukas Goldschmidt 1 개월 전
부모
커밋
60b02963b7
1개의 파일 변경됨, 30개의 추가 및 8개의 삭제
  1. 30 8
      news_mcp/mcp_server_fastmcp.py

+ 30 - 8
news_mcp/mcp_server_fastmcp.py

@@ -255,6 +255,8 @@ async def detect_emerging_topics(limit: int = 10):
 
     entity_counts = Counter()
     entity_importance_sum = Counter()
+    # co-occurrence: ent -> other_ent -> count
+    entity_cooccur = {}
     phrase_counts = Counter()
     topic_counts = Counter()
 
@@ -273,17 +275,31 @@ async def detect_emerging_topics(limit: int = 10):
 
     for c in clusters:
         topic_counts[c.get("topic", "other")] += 1
-        for ent in c.get("entities", []) or []:
+        ents_in_cluster = [e for e in (c.get("entities", []) or []) if not _is_generic_entity(e)]
+        ents_in_cluster_norm = [str(e).strip().lower() for e in ents_in_cluster if str(e).strip()]
+        for ent in ents_in_cluster_norm:
             if _is_generic_entity(ent):
                 continue
-            key = str(ent).strip().lower()
-            if key:
-                entity_counts[key] += 1
-                try:
-                    entity_importance_sum[key] += float(c.get("importance", 0.0) or 0.0)
-                except Exception:
+            entity_counts[ent] += 1
+            try:
+                    entity_importance_sum[ent] += float(c.get("importance", 0.0) or 0.0)
+            except Exception:
                     pass
 
+        # update co-occurrence counts
+        for i in range(len(ents_in_cluster_norm)):
+            a = ents_in_cluster_norm[i]
+            if not a:
+                continue
+            entity_cooccur.setdefault(a, Counter())
+            for j in range(len(ents_in_cluster_norm)):
+                if i == j:
+                    continue
+                b = ents_in_cluster_norm[j]
+                if not b:
+                    continue
+                entity_cooccur[a][b] += 1
+
         text = f"{c.get('headline','')} {c.get('summary','')}"
         words = [w for w in re.findall(r"[A-Za-z][A-Za-z0-9\-]{2,}", text.lower())]
         for i in range(len(words) - 1):
@@ -297,10 +313,16 @@ async def detect_emerging_topics(limit: int = 10):
         avg_imp = entity_importance_sum[ent] / max(1, count)
         # avg_imp is typically 0..~1; keep score bounded.
         trend_score = 0.25 + 0.40 * min(1.0, avg_imp) + 0.08 * min(6.0, float(count))
+        related = []
+        for other, _cnt in (entity_cooccur.get(ent) or Counter()).most_common(3):
+            # avoid returning the entity itself (shouldn't happen, but be safe)
+            if other != ent:
+                related.append(other)
+
         emerging.append({
             "topic": ent,
             "trend_score": min(0.99, round(trend_score, 2)),
-            "related_entities": [ent],
+            "related_entities": related if related else [ent],
             "signal_type": "entity",
             "count": count,
             "avg_importance": round(avg_imp, 3),