6 일 전 · 8fe316db24
--- a/scripts/analyze_ft.py
+++ b/scripts/analyze_ft.py
@@ -0,0 +1,110 @@
 
				+#!/usr/bin/env python3
			
 
				+"""Analyze FT-sourced clusters in the live DB.
			
 
				+Run: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py
			
 
				+"""
			
 
				+import sqlite3, json
			
 
				+
			
 
				+DB = "./data/news.sqlite"
			
 
				+conn = sqlite3.connect(DB)
			
 
				+
			
 
				+print("=== FT clusters in DB ===")
			
 
				+rows = conn.execute("""
			
 
				+    SELECT cluster_id, 
			
 
				+           json_array_length(payload, '$.articles'), 
			
 
				+           json_extract(payload, '$.headline'),
			
 
				+           json_extract(payload, '$.sources'),
			
 
				+           json_extract(payload, '$.timestamp')
			
 
				+    FROM clusters 
			
 
				+    WHERE payload LIKE '%ft.com%' 
			
 
				+    ORDER BY json_array_length(payload, '$.articles') DESC 
			
 
				+    LIMIT 30
			
 
				+""").fetchall()
			
 
				+print("FT clusters (top 30 by article count):")
			
 
				+for r in rows:
			
 
				+    src = (r[3] or "")[:30]
			
 
				+    ts = (r[4] or "")[:16]
			
 
				+    hl = (r[2] or "")[:55]
			
 
				+    print(f"  {r[0][:12]}  arts={r[1]}  {hl}  src={src}  ts={ts}")
			
 
				+
			
 
				+total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%'").fetchone()[0]
			
 
				+singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
			
 
				+print(f"\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {total - singletons}")
			
 
				+
			
 
				+# Duplicate headlines
			
 
				+print("\n=== Potential duplicates (same headline in multiple clusters) ===")
			
 
				+dupe_rows = conn.execute("""
			
 
				+    SELECT json_extract(payload, '$.headline'), count(*) as cnt, group_concat(substr(cluster_id,1,12))
			
 
				+    FROM clusters 
			
 
				+    WHERE payload LIKE '%ft.com%'
			
 
				+    GROUP BY json_extract(payload, '$.headline')
			
 
				+    HAVING cnt > 1
			
 
				+    ORDER BY cnt DESC
			
 
				+    LIMIT 10
			
 
				+""").fetchall()
			
 
				+if dupe_rows:
			
 
				+    for r in dupe_rows:
			
 
				+        print(f'  "{(r[0] or "")[:60]}" appears in {r[1]} clusters: {r[2]}')
			
 
				+else:
			
 
				+    print("  No duplicates found")
			
 
				+
			
 
				+# BreakingTheNews comparison
			
 
				+print("\n=== BreakingTheNews clusters (for comparison) ===")
			
 
				+bt_total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%'").fetchone()[0]
			
 
				+bt_singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
			
 
				+print(f"Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}")
			
 
				+
			
 
				+# Cross-feed clusters
			
 
				+print("\n=== Cross-feed clusters (FT + BT merged) ===")
			
 
				+cross = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'").fetchone()[0]
			
 
				+print(f"Cross-feed clusters: {cross}")
			
 
				+
			
 
				+# FoxNews clusters
			
 
				+print("\n=== FoxNews clusters ===")
			
 
				+fox_total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%foxnews%'").fetchone()[0]
			
 
				+fox_singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%foxnews%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
			
 
				+print(f"Total FoxNews clusters: {fox_total}, singletons: {fox_singletons}, multi-article: {fox_total - fox_singletons}")
			
 
				+
			
 
				+# All clusters summary
			
 
				+print("\n=== All clusters summary ===")
			
 
				+all_total = conn.execute("SELECT count(*) FROM clusters").fetchone()[0]
			
 
				+all_singletons = conn.execute("SELECT count(*) FROM clusters WHERE json_array_length(payload, '$.articles') = 1").fetchone()[0]
			
 
				+print(f"Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}")
			
 
				+
			
 
				+# Feed state
			
 
				+print("\n=== Feed state (active feeds) ===")
			
 
				+feeds = conn.execute("SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != ''").fetchall()
			
 
				+for f in feeds:
			
 
				+    print(f"  {f[0][:55]}  items={f[2]}  enabled={f[3]}  updated={f[4][:19]}")
			
 
				+
			
 
				+# Check for clusters covering same story from different feeds (by title similarity)
			
 
				+print("\n=== Clusters with similar headlines across feeds (sample) ===")
			
 
				+all_clusters = conn.execute("""
			
 
				+    SELECT cluster_id, json_extract(payload, '$.headline'), json_extract(payload, '$.sources')
			
 
				+    FROM clusters
			
 
				+    ORDER BY cluster_id
			
 
				+    LIMIT 200
			
 
				+""").fetchall()
			
 
				+# Quick check: any FT headline that contains words also in BT headlines
			
 
				+ft_headlines = [(cid, hl, src) for cid, hl, src in all_clusters if "ft.com" in (src or "")]
			
 
				+bt_headlines = [(cid, hl, src) for cid, hl, src in all_clusters if "breakingthenews" in (src or "")]
			
 
				+if ft_headlines and bt_headlines:
			
 
				+    from difflib import SequenceMatcher
			
 
				+    def norm(t):
			
 
				+        import re
			
 
				+        t = t.lower().strip()
			
 
				+        t = re.sub(r"[^a-z0-9\s]", " ", t)
			
 
				+        return re.sub(r"\s+", " ", t).strip()
			
 
				+    matches = []
			
 
				+    for fc, fh, fs in ft_headlines[:20]:
			
 
				+        for bc, bh, bs in bt_headlines[:20]:
			
 
				+            sim = SequenceMatcher(None, norm(fh or ""), norm(bh or "")).ratio()
			
 
				+            if sim >= 0.50:
			
 
				+                matches.append((sim, fh[:50], bh[:50], fc[:8], bc[:8]))
			
 
				+    matches.sort(reverse=True)
			
 
				+    for sim, fh, bh, fc, bc in matches[:8]:
			
 
				+        print(f"  sim={sim:.2f}  FT[{fc}]: {fh}")
			
 
				+        print(f"           BT[{bc}]: {bh}")
			
 
				+    if not matches:
			
 
				+        print("  No cross-feed headline matches >= 0.50 found")
			
 
				+
			
 
				+conn.close()
			
--- a/scripts/analyze_ft.sh
+++ b/scripts/analyze_ft.sh
@@ -0,0 +1,87 @@
 
				+#!/bin/bash
			
 
				+# Analyze FT-sourced clusters in the live DB
			
 
				+# Run on thinkcenter-2: docker exec -it news-mcp bash /app/scripts/analyze_ft.sh
			
 
				+# Or: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py
			
 
				+
			
 
				+echo "=== FT clusters in DB ==="
			
 
				+python3 -c "
			
 
				+import sqlite3, json
			
 
				+conn = sqlite3.connect('./data/news.sqlite')
			
 
				+
			
 
				+# FT clusters
			
 
				+rows = conn.execute('''
			
 
				+    SELECT cluster_id, 
			
 
				+           json_array_length(payload, '\$.articles'), 
			
 
				+           payload->>'\$.headline',
			
 
				+           payload->>'\$.sources',
			
 
				+           payload->>'\$.timestamp'
			
 
				+    FROM clusters 
			
 
				+    WHERE payload LIKE '%ft.com%' 
			
 
				+    ORDER BY json_array_length(payload, '\$.articles') DESC 
			
 
				+    LIMIT 30
			
 
				+''').fetchall()
			
 
				+print(f'FT clusters (top 30 by article count):')
			
 
				+for r in rows:
			
 
				+    print(f'  {r[0][:12]}  arts={r[1]}  {r[2][:55]}  src={r[3][:30]}  ts={r[4][:16]}')
			
 
				+
			
 
				+total = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%'\").fetchone()[0]
			
 
				+singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND json_array_length(payload, '$.articles') = 1\").fetchone()[0]
			
 
				+multi = total - singletons
			
 
				+print(f'\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {multi}')
			
 
				+
			
 
				+# Check for duplicate headlines (same story, different clusters)
			
 
				+print('\n=== Potential duplicates (same headline in multiple clusters) ===')
			
 
				+dupe_rows = conn.execute('''
			
 
				+    SELECT payload->>'\$.headline', count(*) as cnt, group_concat(substr(cluster_id,1,12))
			
 
				+    FROM clusters 
			
 
				+    WHERE payload LIKE '%ft.com%'
			
 
				+    GROUP BY payload->>'\$.headline'
			
 
				+    HAVING cnt > 1
			
 
				+    ORDER BY cnt DESC
			
 
				+    LIMIT 10
			
 
				+''').fetchall()
			
 
				+for r in dupe_rows:
			
 
				+    print(f'  \"{r[0][:60]}\" appears in {r[1]} clusters: {r[2]}')
			
 
				+
			
 
				+# BreakingTheNews clusters for comparison
			
 
				+print('\n=== BreakingTheNews clusters (for comparison) ===')
			
 
				+bt_total = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%'\").fetchone()[0]
			
 
				+bt_singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%' AND json_array_length(payload, '$.articles') = 1\").fetchone()[0]
			
 
				+print(f'Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}')
			
 
				+
			
 
				+# Cross-feed clusters (have both FT and BT sources)
			
 
				+print('\n=== Cross-feed clusters (FT + BT merged) ===')
			
 
				+cross = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'\").fetchone()[0]
			
 
				+print(f'Cross-feed clusters: {cross}')
			
 
				+
			
 
				+# All clusters summary
			
 
				+print('\n=== All clusters summary ===')
			
 
				+all_total = conn.execute('SELECT count(*) FROM clusters').fetchone()[0]
			
 
				+all_singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE json_array_length(payload, '$.articles') = 1\").fetchone()[0]
			
 
				+print(f'Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}')
			
 
				+
			
 
				+# Feed state
			
 
				+print('\n=== Feed state ===')
			
 
				+feeds = conn.execute('SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != \"\"').fetchall()
			
 
				+for f in feeds:
			
 
				+    print(f'  {f[0][:50]}  items={f[2]}  enabled={f[3]}  updated={f[4][:19]}')
			
 
				+"
			
 
				+
			
 
				+echo ""
			
 
				+echo "=== Recent poller clustering stats ==="
			
 
				+python3 -c "
			
 
				+import sqlite3, json
			
 
				+conn = sqlite3.connect('./data/news.sqlite')
			
 
				+# Check poll_stats table if it exists
			
 
				+try:
			
 
				+    rows = conn.execute('SELECT * FROM poll_stats ORDER BY rowid DESC LIMIT 5').fetchall()
			
 
				+    if rows:
			
 
				+        cols = [d[0] for d in conn.execute('SELECT * FROM pragma_table_info(poll_stats)').fetchall()]
			
 
				+        print(f'poll_stats columns: {cols}')
			
 
				+        for r in rows:
			
 
				+            print(f'  {r}')
			
 
				+    else:
			
 
				+        print('poll_stats table is empty')
			
 
				+except Exception as e:
			
 
				+    print(f'poll_stats table: {e}')
			
 
				+"