Explorar o código

prompt, keywords filter, emerging topics - related entities

Lukas Goldschmidt hai 1 semana
pai
achega
e77a2e6e3e

+ 10 - 0
news_mcp/enrichment/llm_enrich.py

@@ -56,6 +56,16 @@ async def classify_cluster_llm(cluster: Dict[str, Any]) -> Dict[str, Any]:
     _topic_labels = {t.lower() for t in DEFAULT_TOPICS}
     keywords = [k for k in keywords if k.lower() not in _topic_labels]
 
+    # Enforce per-keyword length cap (max 2 words) as a hard guard.
+    # The prompt requests this but the LLM occasionally ignores it.
+    keywords = [k for k in keywords if len(k.split()) <= 2]
+
+    # Drop keywords that duplicate an entity (compared case-insensitively,
+    # ignoring surrounding whitespace). The entities list is the
+    # authoritative source for proper nouns; keywords should be the
+    # thematic complement, not a repeat.
+    _entity_keys = {e.strip().lower() for e in entities}
+    keywords = [k for k in keywords if k.strip().lower() not in _entity_keys]
+
     out.update({
         "topic": topic,
         "entities": entities,

+ 3 - 3
news_mcp/mcp_server_fastmcp.py

@@ -866,7 +866,7 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
         related = []
         if ent in entity_cooccur:
-            for other, _cnt in entity_cooccur[ent].most_common(5):
+            for other, _cnt in entity_cooccur[ent].most_common(10):
                 if other != ent:
                     related.append(other)
 
@@ -890,7 +890,7 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
         scored.append({
             "topic": ent,
             "trend_score": min(0.99, round(composed_score, 3)),
-            "related_entities": related[:3] if related else [ent],
+            "related_entities": related[:5] if related else [ent],
             "related_keywords": related_kws[:5],
             "velocity": round(velocity, 2),
             "recent_count": recent_n,
@@ -942,7 +942,7 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
                     kw_related_ents.append(other)
                 else:
                     kw_related_kws.append(other)
-                if len(kw_related_kws) >= 5 and len(kw_related_ents) >= 3:
+                if len(kw_related_kws) >= 5 and len(kw_related_ents) >= 5:
                     break
 
         kw_scored.append({

+ 8 - 0
prompts/extract_entities.prompt

@@ -15,6 +15,14 @@ Entity rules (strict):
 - Prefer canonical entity forms over aliases when obvious (for example, use full organization or place names where helpful).
 - Do NOT return empty entities if any such names/places appear.
 
+Keyword rules (strict):
+- Each keyword MUST be 1-2 words. Never 3+.
+- Keywords are thematic search tags, NOT headline restatements or verb phrases.
+- Good keywords: noun phrases or named concepts (e.g. "drone strikes", "energy infrastructure", "nuclear plant", "oil refinery").
+- Bad keywords: full headline fragments, verb-heavy phrases, or anything over 2 words.
+- Keywords should capture the *themes* of the story, not repeat entity names already in the entities list.
+- Return 2-4 keywords. If fewer good keywords exist, return fewer rather than padding with weak ones.
+
 Sentiment rules:
 - positive: clearly encouraging, improving, or supportive tone
 - negative: clearly alarming, worsening, severe, conflict, loss, risk, warning tone