#!/bin/bash # Analyze FT-sourced clusters in the live DB # Run on thinkcenter-2: docker exec -it news-mcp bash /app/scripts/analyze_ft.sh # Or: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py echo "=== FT clusters in DB ===" python3 -c " import sqlite3, json conn = sqlite3.connect('./data/news.sqlite') # FT clusters rows = conn.execute(''' SELECT cluster_id, json_array_length(payload, '\$.articles'), payload->>'\$.headline', payload->>'\$.sources', payload->>'\$.timestamp' FROM clusters WHERE payload LIKE '%ft.com%' ORDER BY json_array_length(payload, '\$.articles') DESC LIMIT 30 ''').fetchall() print(f'FT clusters (top 30 by article count):') for r in rows: print(f' {r[0][:12]} arts={r[1]} {r[2][:55]} src={r[3][:30]} ts={r[4][:16]}') total = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%'\").fetchone()[0] singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND json_array_length(payload, '$.articles') = 1\").fetchone()[0] multi = total - singletons print(f'\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {multi}') # Check for duplicate headlines (same story, different clusters) print('\n=== Potential duplicates (same headline in multiple clusters) ===') dupe_rows = conn.execute(''' SELECT payload->>'\$.headline', count(*) as cnt, group_concat(substr(cluster_id,1,12)) FROM clusters WHERE payload LIKE '%ft.com%' GROUP BY payload->>'\$.headline' HAVING cnt > 1 ORDER BY cnt DESC LIMIT 10 ''').fetchall() for r in dupe_rows: print(f' \"{r[0][:60]}\" appears in {r[1]} clusters: {r[2]}') # BreakingTheNews clusters for comparison print('\n=== BreakingTheNews clusters (for comparison) ===') bt_total = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%'\").fetchone()[0] bt_singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%' AND json_array_length(payload, '$.articles') = 1\").fetchone()[0] print(f'Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}') # Cross-feed clusters (have both FT and BT sources) print('\n=== Cross-feed clusters (FT + BT merged) ===') cross = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'\").fetchone()[0] print(f'Cross-feed clusters: {cross}') # All clusters summary print('\n=== All clusters summary ===') all_total = conn.execute('SELECT count(*) FROM clusters').fetchone()[0] all_singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE json_array_length(payload, '$.articles') = 1\").fetchone()[0] print(f'Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}') # Feed state print('\n=== Feed state ===') feeds = conn.execute('SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != \"\"').fetchall() for f in feeds: print(f' {f[0][:50]} items={f[2]} enabled={f[3]} updated={f[4][:19]}') " echo "" echo "=== Recent poller clustering stats ===" python3 -c " import sqlite3, json conn = sqlite3.connect('./data/news.sqlite') # Check poll_stats table if it exists try: rows = conn.execute('SELECT * FROM poll_stats ORDER BY rowid DESC LIMIT 5').fetchall() if rows: cols = [d[0] for d in conn.execute('SELECT * FROM pragma_table_info(poll_stats)').fetchall()] print(f'poll_stats columns: {cols}') for r in rows: print(f' {r}') else: print('poll_stats table is empty') except Exception as e: print(f'poll_stats table: {e}') "