Browse Source

fix: deduplicate related_keywords against related_entities in emerging topics

Entity results: filter out keywords that are substrings of the topic
entity itself or of any related entity name (or vice versa). E.g.
'wembanyama' is already covered by
'related_entities: [victor wembanyama]', so it no longer appears in
related_keywords. Only genuinely thematic keywords like 'finals' and
'dream' survive.

Keyword results: split kw_cooccur entries into related_entities and
related_keywords by checking membership in all_entities. Previously all
co-occurrences (including entities) were dumped into related_keywords
with related_entities always empty.

Both fixes are cheap in-memory checks at scoring time (a substring scan
over the related entity names, and a membership test against
all_entities) — no extra DB queries.
Lukas Goldschmidt 1 week ago
parent
commit
4ed086e30c
1 changed file with 24 additions and 4 deletions
  1. 24 4
      news_mcp/mcp_server_fastmcp.py

+ 24 - 4
news_mcp/mcp_server_fastmcp.py

@@ -872,8 +872,20 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
         related_kws = []
         if ent in entity_kw_cooccur:
-            for kw, _cnt in entity_kw_cooccur[ent].most_common(5):
+            # Build a set of related entity names (lowercased) to deduplicate
+            # keywords that are already represented in related_entities
+            related_ent_names = {e.strip().lower() for e in related}
+            # Also include the entity itself and its common aliases
+            related_ent_names.add(ent.strip().lower())
+            for kw, _cnt in entity_kw_cooccur[ent].most_common(10):
+                kw_lower = kw.strip().lower()
+                # Skip keywords that are just a related entity name (substring match)
+                if any(kw_lower in ent_name or ent_name in kw_lower
+                       for ent_name in related_ent_names):
+                    continue
                 related_kws.append(kw)
+                if len(related_kws) >= 5:
+                    break
 
         scored.append({
             "topic": ent,
@@ -920,15 +932,23 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
         )
 
         kw_related_kws = []
+        kw_related_ents = []
         if kw in kw_cooccur:
-            for other, _cnt in kw_cooccur[kw].most_common(5):
-                if other != kw:
+            for other, _cnt in kw_cooccur[kw].most_common(10):
+                if other == kw:
+                    continue
+                # If this co-occurring term is a known entity, route to related_entities
+                if other in all_entities:
+                    kw_related_ents.append(other)
+                else:
                     kw_related_kws.append(other)
+                if len(kw_related_kws) >= 5 and len(kw_related_ents) >= 3:
+                    break
 
         kw_scored.append({
             "topic": kw,
             "trend_score": min(0.99, round(composed_score, 3)),
-            "related_entities": [],
+            "related_entities": kw_related_ents[:5],
             "related_keywords": kw_related_kws[:5],
             "velocity": round(velocity, 2),
             "recent_count": recent_n,