@@ -254,15 +254,35 @@ async def detect_emerging_topics(limit: int = 10):
     import re
 
     entity_counts = Counter()
+    entity_importance_sum = Counter()
     phrase_counts = Counter()
     topic_counts = Counter()
 
+    # Very light heuristics to keep "meta entities" from dominating emerging topics.
+    # Keep it conservative: only skip obvious boilerplate.
+    def _is_generic_entity(ent: str) -> bool:
+        e = str(ent).strip().lower()
+        if not e:
+            return True
+        if len(e) < 4:
+            return True
+        # Common outlet-ish / meta-ish tokens.
+        if e in {"news", "latest", "breaking"}:
+            return True
+        return False
+
     for c in clusters:
         topic_counts[c.get("topic", "other")] += 1
         for ent in c.get("entities", []) or []:
+            if _is_generic_entity(ent):
+                continue
             key = str(ent).strip().lower()
             if key:
                 entity_counts[key] += 1
+                try:
+                    entity_importance_sum[key] += float(c.get("importance", 0.0) or 0.0)
+                except (TypeError, ValueError):
+                    pass
 
         text = f"{c.get('headline','')} {c.get('summary','')}"
         words = [w for w in re.findall(r"[A-Za-z][A-Za-z0-9\-]{2,}", text.lower())]
@@ -272,13 +292,18 @@ async def detect_emerging_topics(limit: int = 10):
             phrase_counts[phrase] += 1
 
     emerging = []
+    # Combine frequency with average importance so "big signal" rises over pure repetition.
     for ent, count in entity_counts.most_common(limit):
+        avg_imp = entity_importance_sum[ent] / max(1, count)
+        # avg_imp is typically 0..~1; keep score bounded.
+        trend_score = 0.25 + 0.40 * min(1.0, avg_imp) + 0.08 * min(6.0, float(count))
         emerging.append({
             "topic": ent,
-            "trend_score": min(0.99, round(0.25 + 0.15 * count, 2)),
+            "trend_score": min(0.99, round(trend_score, 2)),
             "related_entities": [ent],
             "signal_type": "entity",
             "count": count,
+            "avg_importance": round(avg_imp, 3),
         })
 
     for phrase, count in phrase_counts.most_common(limit * 2):
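
As a quick sanity check of the scoring change, the sketch below compares the old and new curves side by side. It is standalone and not part of the patch; the two expressions are copied verbatim from the hunk above, and the helper names new_score/old_score are invented for illustration.

# Standalone sketch: compare the old and new trend_score curves.
# Not part of the patch; both expressions are copied from the diff above.
def new_score(count: int, avg_imp: float) -> float:
    return min(0.99, round(0.25 + 0.40 * min(1.0, avg_imp) + 0.08 * min(6.0, float(count)), 2))

def old_score(count: int) -> float:
    return min(0.99, round(0.25 + 0.15 * count, 2))

# A twice-seen, high-importance entity now outranks a five-times-repeated,
# near-zero-importance one; under the old count-only formula the ranking
# was reversed.
print(new_score(2, 0.9), old_score(2))    # 0.77 0.55
print(new_score(5, 0.05), old_score(5))   # 0.67 0.99

Note the new formula still saturates: the count term caps at 6 occurrences and the importance term at 1.0, so the outer min(0.99, ...) only trims the extreme case rather than doing all the bounding, as the old formula relied on it to do.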