6 روز پیش · 8fe316db24
--- a/scripts/analyze_ft.py
+++ b/scripts/analyze_ft.py
@@ -0,0 +1,110 @@
 
															#!/usr/bin/env python3
"""Analyze FT-sourced clusters in the live DB.

Run: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py

Prints, in order: the 30 largest FT clusters, duplicated FT headlines,
BreakingTheNews and FoxNews counts, clusters containing both FT and BT,
an all-cluster summary, the active feed state, and a sample of FT/BT
clusters whose headlines look alike.

NOTE(review): a cluster is attributed to a feed with a substring LIKE over
the whole JSON payload, so '%ft.com%' also matches any other text that
contains "ft.com" (for example "microsoft.com"). Counts may therefore be
overstated; confirm against the payload schema before relying on them.
"""
import json
import re
import sqlite3
from contextlib import closing
from difflib import SequenceMatcher

DB = "./data/news.sqlite"

# LIKE patterns used to attribute a cluster to a feed (matched against payload).
FT_PATTERN = "%ft.com%"
BT_PATTERN = "%breakingthenews%"
FOX_PATTERN = "%foxnews%"

# Compiled once; norm() is called for every sampled headline.
_NON_ALNUM = re.compile(r"[^a-z0-9\s]")
_WHITESPACE = re.compile(r"\s+")

# One scan yields both numbers: the comparison evaluates to 0/1 (or NULL,
# which sum() skips) and coalesce() covers the empty-table case.
_COUNTS_SQL = (
    "SELECT count(*), "
    "coalesce(sum(json_array_length(payload, '$.articles') = 1), 0) "
    "FROM clusters"
)


def _clip(value, width):
    """Return at most ``width`` characters of ``value``; NULL becomes ''."""
    return str(value or "")[:width]


def norm(t):
    """Lower-case ``t``, replace non [a-z0-9] characters and collapse spaces."""
    t = _NON_ALNUM.sub(" ", t.lower().strip())
    return _WHITESPACE.sub(" ", t).strip()


def source_counts(conn, pattern=None):
    """Return ``(total, singletons)`` for clusters whose payload matches.

    ``pattern`` is a SQL LIKE pattern bound as a parameter; ``None`` counts
    every cluster. A singleton is a cluster whose ``$.articles`` array has
    exactly one element, so ``total - singletons`` also includes clusters
    with an empty or missing array.
    """
    if pattern is None:
        row = conn.execute(_COUNTS_SQL).fetchone()
    else:
        row = conn.execute(
            _COUNTS_SQL + " WHERE payload LIKE ?", (pattern,)
        ).fetchone()
    return row[0], row[1]


def find_similar_headlines(ft_headlines, bt_headlines, threshold=0.50, sample=20):
    """Compare the first ``sample`` FT and BT headlines pairwise.

    Both arguments are sequences of ``(cluster_id, headline, sources)``.
    Returns ``(similarity, ft_headline[:50], bt_headline[:50],
    ft_id[:8], bt_id[:8])`` tuples with similarity >= ``threshold``, sorted
    in descending order.

    Headlines that are NULL or normalise to an empty string are skipped:
    two empty strings compare as 1.0, which would report a bogus match and
    then fail when slicing a ``None`` headline.
    """
    # Normalise each BT headline once instead of once per FT headline.
    bt_sample = [(bc, bh, norm(bh or "")) for bc, bh, _ in bt_headlines[:sample]]
    matches = []
    for fc, fh, _ in ft_headlines[:sample]:
        fh_norm = norm(fh or "")
        if not fh_norm:
            continue
        for bc, bh, bh_norm in bt_sample:
            if not bh_norm:
                continue
            sim = SequenceMatcher(None, fh_norm, bh_norm).ratio()
            if sim >= threshold:
                matches.append((sim, fh[:50], bh[:50], fc[:8], bc[:8]))
    matches.sort(reverse=True)
    return matches


def report_ft_clusters(conn):
    """Print the 30 FT clusters with the most articles and the FT totals."""
    print("=== FT clusters in DB ===")
    rows = conn.execute(
        """
        SELECT cluster_id,
               json_array_length(payload, '$.articles'),
               json_extract(payload, '$.headline'),
               json_extract(payload, '$.sources'),
               json_extract(payload, '$.timestamp')
        FROM clusters
        WHERE payload LIKE ?
        ORDER BY json_array_length(payload, '$.articles') DESC
        LIMIT 30
        """,
        (FT_PATTERN,),
    ).fetchall()
    print("FT clusters (top 30 by article count):")
    for cluster_id, n_articles, headline, sources, timestamp in rows:
        src = _clip(sources, 30)
        ts = _clip(timestamp, 16)
        hl = _clip(headline, 55)
        print(f"  {_clip(cluster_id, 12)}  arts={n_articles}  {hl}  src={src}  ts={ts}")

    total, singletons = source_counts(conn, FT_PATTERN)
    print(f"\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {total - singletons}")


def report_duplicate_headlines(conn):
    """Print FT headlines that appear verbatim in more than one cluster."""
    print("\n=== Potential duplicates (same headline in multiple clusters) ===")
    dupe_rows = conn.execute(
        """
        SELECT json_extract(payload, '$.headline'), count(*) as cnt, group_concat(substr(cluster_id,1,12))
        FROM clusters
        WHERE payload LIKE ?
        GROUP BY json_extract(payload, '$.headline')
        HAVING cnt > 1
        ORDER BY cnt DESC
        LIMIT 10
        """,
        (FT_PATTERN,),
    ).fetchall()
    if not dupe_rows:
        print("  No duplicates found")
        return
    for headline, count, cluster_ids in dupe_rows:
        print(f'  "{_clip(headline, 60)}" appears in {count} clusters: {cluster_ids}')


def report_other_feeds(conn):
    """Print BT, FT+BT, FoxNews and all-cluster counts."""
    print("\n=== BreakingTheNews clusters (for comparison) ===")
    bt_total, bt_singletons = source_counts(conn, BT_PATTERN)
    print(f"Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}")

    print("\n=== Cross-feed clusters (FT + BT merged) ===")
    cross = conn.execute(
        "SELECT count(*) FROM clusters WHERE payload LIKE ? AND payload LIKE ?",
        (FT_PATTERN, BT_PATTERN),
    ).fetchone()[0]
    print(f"Cross-feed clusters: {cross}")

    print("\n=== FoxNews clusters ===")
    fox_total, fox_singletons = source_counts(conn, FOX_PATTERN)
    print(f"Total FoxNews clusters: {fox_total}, singletons: {fox_singletons}, multi-article: {fox_total - fox_singletons}")

    print("\n=== All clusters summary ===")
    all_total, all_singletons = source_counts(conn)
    print(f"Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}")


def report_feed_state(conn):
    """Print every feed_state row that has a non-empty last_hash."""
    print("\n=== Feed state (active feeds) ===")
    feeds = conn.execute(
        "SELECT feed_key, last_hash, last_item_count, enabled, updated_at "
        "FROM feed_state WHERE last_hash != ''"
    ).fetchall()
    for feed_key, _last_hash, item_count, enabled, updated_at in feeds:
        # updated_at / feed_key may be NULL; _clip keeps the report going.
        print(f"  {_clip(feed_key, 55)}  items={item_count}  enabled={enabled}  updated={_clip(updated_at, 19)}")


def report_similar_headlines(conn):
    """Print up to 8 FT/BT cluster pairs with similar headlines.

    Only the first 200 clusters by cluster_id are sampled. Nothing beyond
    the section header is printed when the sample holds no FT or no BT
    cluster.
    """
    print("\n=== Clusters with similar headlines across feeds (sample) ===")
    all_clusters = conn.execute(
        """
        SELECT cluster_id, json_extract(payload, '$.headline'), json_extract(payload, '$.sources')
        FROM clusters
        ORDER BY cluster_id
        LIMIT 200
        """
    ).fetchall()
    ft_headlines = [row for row in all_clusters if "ft.com" in (row[2] or "")]
    bt_headlines = [row for row in all_clusters if "breakingthenews" in (row[2] or "")]
    if not (ft_headlines and bt_headlines):
        return
    matches = find_similar_headlines(ft_headlines, bt_headlines)
    for sim, fh, bh, fc, bc in matches[:8]:
        print(f"  sim={sim:.2f}  FT[{fc}]: {fh}")
        print(f"           BT[{bc}]: {bh}")
    if not matches:
        print("  No cross-feed headline matches >= 0.50 found")


def main():
    """Run every report against DB, closing the connection even on error."""
    with closing(sqlite3.connect(DB)) as conn:
        report_ft_clusters(conn)
        report_duplicate_headlines(conn)
        report_other_feeds(conn)
        report_feed_state(conn)
        report_similar_headlines(conn)


if __name__ == "__main__":
    main()
														
--- a/scripts/analyze_ft.sh
+++ b/scripts/analyze_ft.sh
@@ -0,0 +1,87 @@
 
															#!/bin/bash
# Analyze FT-sourced clusters in the live DB
# Run on thinkcenter-2: docker exec -it news-mcp bash /app/scripts/analyze_ft.sh
# Or: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py
#
# The embedded Python is fed through quoted heredocs (<<'PY'), so the shell
# performs no expansion on it: JSON paths such as '$.articles' and the quotes
# inside the SQL reach Python exactly as written, with no backslash escaping.

echo "=== FT clusters in DB ==="
python3 - <<'PY'
import json
import sqlite3
from contextlib import closing


def clip(value, width):
    """Return at most `width` characters of value; NULL columns become ''."""
    return str(value or "")[:width]


def counts(conn, where="", params=()):
    """Return (total, singletons) for clusters matching the WHERE fragment."""
    total = conn.execute(
        "SELECT count(*) FROM clusters " + where, params
    ).fetchone()[0]
    joiner = " AND " if where else " WHERE "
    singletons = conn.execute(
        "SELECT count(*) FROM clusters " + where + joiner
        + "json_array_length(payload, '$.articles') = 1",
        params,
    ).fetchone()[0]
    return total, singletons


# closing() releases the connection even when a query fails.
with closing(sqlite3.connect('./data/news.sqlite')) as conn:
    # FT clusters. json_extract() is used instead of ->> because the operator
    # only exists in SQLite 3.38+.
    rows = conn.execute('''
        SELECT cluster_id,
               json_array_length(payload, '$.articles'),
               json_extract(payload, '$.headline'),
               json_extract(payload, '$.sources'),
               json_extract(payload, '$.timestamp')
        FROM clusters
        WHERE payload LIKE '%ft.com%'
        ORDER BY json_array_length(payload, '$.articles') DESC
        LIMIT 30
    ''').fetchall()
    print('FT clusters (top 30 by article count):')
    for r in rows:
        print(f'  {clip(r[0], 12)}  arts={r[1]}  {clip(r[2], 55)}  src={clip(r[3], 30)}  ts={clip(r[4], 16)}')

    total, singletons = counts(conn, "WHERE payload LIKE ?", ('%ft.com%',))
    multi = total - singletons
    print(f'\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {multi}')

    # Check for duplicate headlines (same story, different clusters)
    print('\n=== Potential duplicates (same headline in multiple clusters) ===')
    dupe_rows = conn.execute('''
        SELECT json_extract(payload, '$.headline'), count(*) as cnt, group_concat(substr(cluster_id,1,12))
        FROM clusters
        WHERE payload LIKE '%ft.com%'
        GROUP BY json_extract(payload, '$.headline')
        HAVING cnt > 1
        ORDER BY cnt DESC
        LIMIT 10
    ''').fetchall()
    for r in dupe_rows:
        print(f'  "{clip(r[0], 60)}" appears in {r[1]} clusters: {r[2]}')

    # BreakingTheNews clusters for comparison
    print('\n=== BreakingTheNews clusters (for comparison) ===')
    bt_total, bt_singletons = counts(conn, "WHERE payload LIKE ?", ('%breakingthenews%',))
    print(f'Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}')

    # Cross-feed clusters (have both FT and BT sources)
    print('\n=== Cross-feed clusters (FT + BT merged) ===')
    cross = conn.execute(
        "SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'"
    ).fetchone()[0]
    print(f'Cross-feed clusters: {cross}')

    # All clusters summary
    print('\n=== All clusters summary ===')
    all_total, all_singletons = counts(conn)
    print(f'Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}')

    # Feed state. The empty string is a single-quoted SQL literal; a
    # double-quoted "" is an identifier that SQLite only treats as a string
    # through a legacy fallback.
    print('\n=== Feed state ===')
    feeds = conn.execute(
        "SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != ''"
    ).fetchall()
    for f in feeds:
        print(f'  {clip(f[0], 50)}  items={f[2]}  enabled={f[3]}  updated={clip(f[4], 19)}')
PY

echo ""
echo "=== Recent poller clustering stats ==="
python3 - <<'PY'
import json
import sqlite3
from contextlib import closing

with closing(sqlite3.connect('./data/news.sqlite')) as conn:
    # Check poll_stats table if it exists
    try:
        cur = conn.execute('SELECT * FROM poll_stats ORDER BY rowid DESC LIMIT 5')
        rows = cur.fetchall()
    except sqlite3.Error as e:
        # Typically "no such table: poll_stats"; report it and carry on.
        print(f'poll_stats table: {e}')
    else:
        if rows:
            # Column names come from the cursor itself; the first field of a
            # pragma_table_info row is the column index, not its name.
            cols = [d[0] for d in cur.description]
            print(f'poll_stats columns: {cols}')
            for r in rows:
                print(f'  {r}')
        else:
            print('poll_stats table is empty')
PY