#!/bin/bash
# Analyze FT-sourced clusters in the live DB
# Run on thinkcenter-2: docker exec -it news-mcp bash /app/scripts/analyze_ft.sh
# Or: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py

echo "=== FT clusters in DB ==="
python3 -c "
import sqlite3, json
conn = sqlite3.connect('./data/news.sqlite')

# FT clusters
rows = conn.execute('''
    SELECT cluster_id, 
           json_array_length(payload, '\$.articles'), 
           payload->>'\$.headline',
           payload->>'\$.sources',
           payload->>'\$.timestamp'
    FROM clusters 
    WHERE payload LIKE '%ft.com%' 
    ORDER BY json_array_length(payload, '\$.articles') DESC 
    LIMIT 30
''').fetchall()
print(f'FT clusters (top 30 by article count):')
for r in rows:
    print(f'  {r[0][:12]}  arts={r[1]}  {r[2][:55]}  src={r[3][:30]}  ts={r[4][:16]}')

total = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%'\").fetchone()[0]
singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND json_array_length(payload, '$.articles') = 1\").fetchone()[0]
multi = total - singletons
print(f'\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {multi}')

# Check for duplicate headlines (same story, different clusters)
print('\n=== Potential duplicates (same headline in multiple clusters) ===')
dupe_rows = conn.execute('''
    SELECT payload->>'\$.headline', count(*) as cnt, group_concat(substr(cluster_id,1,12))
    FROM clusters 
    WHERE payload LIKE '%ft.com%'
    GROUP BY payload->>'\$.headline'
    HAVING cnt > 1
    ORDER BY cnt DESC
    LIMIT 10
''').fetchall()
for r in dupe_rows:
    print(f'  \"{r[0][:60]}\" appears in {r[1]} clusters: {r[2]}')

# BreakingTheNews clusters for comparison
print('\n=== BreakingTheNews clusters (for comparison) ===')
bt_total = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%'\").fetchone()[0]
bt_singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%' AND json_array_length(payload, '$.articles') = 1\").fetchone()[0]
print(f'Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}')

# Cross-feed clusters (have both FT and BT sources)
print('\n=== Cross-feed clusters (FT + BT merged) ===')
cross = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'\").fetchone()[0]
print(f'Cross-feed clusters: {cross}')

# All clusters summary
print('\n=== All clusters summary ===')
all_total = conn.execute('SELECT count(*) FROM clusters').fetchone()[0]
all_singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE json_array_length(payload, '$.articles') = 1\").fetchone()[0]
print(f'Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}')

# Feed state
print('\n=== Feed state ===')
feeds = conn.execute('SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != \"\"').fetchall()
for f in feeds:
    print(f'  {f[0][:50]}  items={f[2]}  enabled={f[3]}  updated={f[4][:19]}')
"

echo ""
echo "=== Recent poller clustering stats ==="
python3 -c "
import sqlite3, json
conn = sqlite3.connect('./data/news.sqlite')
# Check poll_stats table if it exists
try:
    rows = conn.execute('SELECT * FROM poll_stats ORDER BY rowid DESC LIMIT 5').fetchall()
    if rows:
        cols = [d[0] for d in conn.execute('SELECT * FROM pragma_table_info(poll_stats)').fetchall()]
        print(f'poll_stats columns: {cols}')
        for r in rows:
            print(f'  {r}')
    else:
        print('poll_stats table is empty')
except Exception as e:
    print(f'poll_stats table: {e}')
"