Просмотр исходного кода

feat: article_identity module, site_config DB table, debug_dedup tool

1. Deduplicate _article_key: single source of truth in
   news_mcp/article_identity.py (article_key + article_content_hash).
   cluster.py, sqlite_store.py and scripts/backfill_seen_articles.py now import from there.
   Also adds article_content_hash() for future content-change detection.

2. DB-backed site_config table: all tunable parameters in one place.
   Seeded from .env overrides or defaults on first startup.
   REST API: GET /api/v1/config, POST /api/v1/config/update,
   POST /api/v1/config/reset. Dashboard-tuneable without restart.
   Categories: clustering, enrichment, retention.

3. debug_dedup MCP tool: given a URL (+ optional title), reports:
   - Whether the article is in seen_articles
   - Which cluster it belongs to
   - Similarity signals (title, jaccard, cosine) against recent clusters
     (top 10 by title similarity), with match decisions and thresholds
Lukas Goldschmidt 6 дней назад
Родитель
Коммит
b22882c580

+ 48 - 0
news_mcp/article_identity.py

@@ -0,0 +1,48 @@
+"""Article identity and content hashing — single source of truth.
+
+Used by:
+  - news_mcp.dedup.cluster  (clustering identity, orphan merge, stable cluster IDs)
+  - news_mcp.storage.sqlite_store  (seen_articles, dedup, upsert)
+  - scripts/backfill_seen_articles.py  (backfill)
+"""
+
+from __future__ import annotations
+
+import hashlib
+from typing import Any
+from urllib.parse import urlparse
+
+
+def article_key(article: dict[str, Any]) -> str:
+    """Deterministic identity key derived from an article's URL.
+
+    If a URL exists, returns the last path segment (e.g. '/content/uuid' → 'uuid',
+    '/Article/Slug/66427393' → '66427393').  Falls back to the full URL if no
+    path segments, or to the title if no URL at all.
+
+    This is the primary dedup identity — two articles with the same key
+    are considered the same article regardless of source.
+    """
+    url = str(article.get("url") or "").strip()
+    if not url:
+        return str(article.get("title") or "")
+    try:
+        parsed = urlparse(url)
+        parts = [p for p in parsed.path.split("/") if p]
+        if parts:
+            return parts[-1]
+    except Exception:
+        pass
+    return url
+
+
+def article_content_hash(article: dict[str, Any]) -> str:
+    """SHA-1 hash of title + summary for detecting content changes.
+
+    Used to detect in-place article updates (e.g. a stub that gets fleshed
+    out) where the URL stays the same but the content changes.
+    """
+    title = str(article.get("title") or "").strip()
+    summary = str(article.get("summary") or "").strip()
+    material = f"{title}|{summary}"
+    return hashlib.sha1(material.encode("utf-8")).hexdigest()

+ 3 - 13
news_mcp/dedup/cluster.py

@@ -6,13 +6,13 @@ import re
 from datetime import datetime, timezone, timedelta
 from datetime import datetime, timezone, timedelta
 from difflib import SequenceMatcher
 from difflib import SequenceMatcher
 from typing import Any, Dict, List
 from typing import Any, Dict, List
-from urllib.parse import urlparse
 
 
 from news_mcp.config import (
 from news_mcp.config import (
     NEWS_EMBEDDINGS_ENABLED,
     NEWS_EMBEDDINGS_ENABLED,
     NEWS_EMBEDDING_SIMILARITY_THRESHOLD,
     NEWS_EMBEDDING_SIMILARITY_THRESHOLD,
     NEWS_CLUSTER_MAX_AGE_HOURS,
     NEWS_CLUSTER_MAX_AGE_HOURS,
 )
 )
+from news_mcp.article_identity import article_key, article_content_hash
 from news_mcp.dedup.embedding_support import cosine_similarity, ollama_embed
 from news_mcp.dedup.embedding_support import cosine_similarity, ollama_embed
 from news_mcp.sources.news_feeds import normalize_topic_from_title
 from news_mcp.sources.news_feeds import normalize_topic_from_title
 
 
@@ -33,18 +33,8 @@ def _title_similarity(a: str, b: str) -> float:
     return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()
     return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()
 
 
 
 
-def _article_key(article: Dict[str, Any]) -> str:
-    url = str(article.get("url") or "").strip()
-    if not url:
-        return str(article.get("title") or "")
-    try:
-        parsed = urlparse(url)
-        parts = [p for p in parsed.path.split("/") if p]
-        if parts:
-            return parts[-1]
-    except Exception:
-        pass
-    return url
+# For internal use — canonical name is article_key(article) from article_identity
+_article_key = article_key
 
 
 
 
 def _cluster_text(a: Dict[str, Any]) -> str:
 def _cluster_text(a: Dict[str, Any]) -> str:

+ 109 - 0
news_mcp/mcp_server_fastmcp.py

@@ -349,6 +349,70 @@ async def toggle_feed(feed_url: str, enabled: bool) -> dict:
     return {"ok": True, "feed_key": feed_url.strip(), "enabled": enabled, "details": updated}
     return {"ok": True, "feed_key": feed_url.strip(), "enabled": enabled, "details": updated}
 
 
 
 
+@mcp.tool(description="Debug dedup: inspect whether an article URL was already processed, which cluster it belongs to, and what similarity signals it would produce against existing clusters.")
+async def debug_dedup(url: str, title: str | None = None) -> dict:
+    """Given an article URL (and optional title), report dedup status.
+
+    Returns:
+      - seen: whether the article_key is in seen_articles
+      - article_key: the identity key derived from the URL
+      - cluster_id: which cluster it belongs to (if seen)
+      - similarity_signals: if title is provided, compute signals against
+        the top-N most similar existing clusters
+    """
+    from news_mcp.article_identity import article_key, article_content_hash
+    from news_mcp.dedup.cluster import _title_similarity, _normalize_title, _signals, _is_match
+    from news_mcp.config import NEWS_EMBEDDINGS_ENABLED
+
+    art = {"url": url, "title": title or ""}
+    akey = article_key(art)
+    result = {"url": url, "article_key": akey}
+
+    store = SQLiteClusterStore(DB_PATH)
+    with store._conn() as conn:
+        # Check seen_articles
+        row = conn.execute(
+            "SELECT cluster_id, first_seen, url FROM seen_articles WHERE article_key=?",
+            (akey,),
+        ).fetchone()
+        if row:
+            result["seen"] = True
+            result["cluster_id"] = row[0]
+            result["first_seen"] = row[1]
+            result["stored_url"] = row[2]
+        else:
+            result["seen"] = False
+
+    # If title provided, compute similarity against top clusters
+    if title:
+        # Get recent clusters for comparison
+        recent = store.get_latest_clusters_all_topics(ttl_hours=24, limit=20)
+        signals_list = []
+        for c in recent:
+            c_title = c.get("headline", "")
+            sigs = _signals(art, c)
+            matched, signal_name, signal_value = _is_match(
+                sigs, embeddings_enabled=NEWS_EMBEDDINGS_ENABLED,
+            )
+            signals_list.append({
+                "cluster_id": c.get("cluster_id", "")[:12],
+                "headline": c_title[:60],
+                "title_sim": round(sigs["title"], 3),
+                "jaccard": round(sigs["jaccard"], 3),
+                "cosine": round(sigs["cosine"], 3) if sigs["cosine"] else None,
+                "matched": matched,
+                "match_signal": signal_name,
+                "match_value": round(signal_value, 3) if signal_value else None,
+            })
+        # Sort by best title similarity
+        signals_list.sort(key=lambda x: x["title_sim"], reverse=True)
+        result["similarity_signals"] = signals_list[:10]
+        result["title_threshold"] = 0.75  # DEFAULT_TITLE_THRESHOLD
+        result["jaccard_threshold"] = 0.55  # DEFAULT_JACCARD_THRESHOLD
+
+    return result
+
+
 @mcp.tool(description="Investigate a topic and return the newest deduplicated news clusters with entities and thematic keywords, sorted by recency.")
 @mcp.tool(description="Investigate a topic and return the newest deduplicated news clusters with entities and thematic keywords, sorted by recency.")
 async def get_latest_events(topic: str | None = None, limit: int = 5, include_articles: bool = False):
 async def get_latest_events(topic: str | None = None, limit: int = 5, include_articles: bool = False):
     limit = max(1, min(int(limit), 20))
     limit = max(1, min(int(limit), 20))
@@ -1380,6 +1444,51 @@ async def api_feed_toggle(feed_url: str = Form(), enabled: bool = Form()):
         return _api_err(e, f"toggle({feed_url})")
         return _api_err(e, f"toggle({feed_url})")
 
 
 
 
+# ------------------------------------------------------------------ #
+#  Site config (dashboard-tuneable parameters)
+# ------------------------------------------------------------------ #
+
+@app.get("/api/v1/config")
+def api_config():
+    """All site config parameters (seeded from .env/defaults)."""
+    try:
+        from news_mcp.site_config import get_site_config
+        with _shared_store._conn() as conn:
+            rows = get_site_config(conn)
+        return {"config": rows}
+    except Exception as e:
+        return _api_err(e, "config")
+
+
+@app.post("/api/v1/config/update")
+async def api_config_update(key: str = Form(), value: str = Form()):
+    """Update a single config parameter at runtime."""
+    try:
+        from news_mcp.site_config import set_config_value
+        with _shared_store._conn() as conn:
+            ok = set_config_value(conn, key.strip(), value.strip())
+            conn.commit()
+        if not ok:
+            return JSONResponse(status_code=404, content={"error": f"Config key not found: {key}"})
+        return {"ok": True, "key": key.strip(), "value": value.strip()}
+    except Exception as e:
+        return _api_err(e, f"config/update({key})")
+
+
+@app.post("/api/v1/config/reset")
+async def api_config_reset():
+    """Reset all config to .env/defaults (drops and re-seeds site_config)."""
+    try:
+        from news_mcp.site_config import seed_site_config
+        with _shared_store._conn() as conn:
+            conn.execute("DELETE FROM site_config")
+            seeded = seed_site_config(conn)
+            conn.commit()
+        return {"ok": True, "seeded": seeded}
+    except Exception as e:
+        return _api_err(e, "config/reset")
+
+
 @app.get("/health")
 @app.get("/health")
 def health():
 def health():
     return {
     return {

+ 167 - 0
news_mcp/site_config.py

@@ -0,0 +1,167 @@
+"""DB-backed site configuration.
+
+All tunable parameters live in the `site_config` SQLite table.
+On startup, if the table is empty, it is seeded from .env overrides
+or Python defaults. After that, values are read from the DB — allowing
+runtime updates via the REST API without restart.
+
+Categories:
+  clustering  — similarity thresholds and merge criteria
+  enrichment  — LLM behavior, rate limits, embedding settings
+  retention   — pruning, age windows, refresh intervals
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any
+
+# ---------------------------------------------------------------------------
+#  Default registry — (key, default_value, type, category, description)
+#  The type is one of: float, int, bool, str
+# ---------------------------------------------------------------------------
+
+CONFIG_DEFAULTS: list[tuple[str, str, str, str, str]] = [
+    # Clustering thresholds
+    ("title_threshold",             "0.75",  "float", "clustering", "Min title similarity to merge (SequenceMatcher)"),
+    ("jaccard_threshold",           "0.55",  "float", "clustering", "Min Jaccard token overlap to merge"),
+    ("dual_title_floor",            "0.55",  "float", "clustering", "Dual-signal: min title for title+jaccard merge"),
+    ("dual_jaccard_floor",          "0.25",  "float", "clustering", "Dual-signal: min jaccard for title+jaccard merge"),
+    ("early_exit_title",            "0.95",  "float", "clustering", "Early-exit title signal (both must be met)"),
+    ("early_exit_jaccard",          "0.80",  "float", "clustering", "Early-exit jaccard signal"),
+    ("consensus_cosine_floor",      "0.80",  "float", "clustering", "Consensus path: min cosine"),
+    ("consensus_jaccard_floor",     "0.30",  "float", "clustering", "Consensus path: min jaccard (or title)"),
+    ("consensus_title_floor",       "0.55",  "float", "clustering", "Consensus path: min title (or jaccard)"),
+    ("crosstopic_title_threshold",  "0.90",  "float", "clustering", "Cross-topic merge: min title similarity"),
+    ("embedding_similarity_threshold", "0.885", "float", "clustering", "Cosine threshold for embedding-only merge"),
+    ("cluster_max_age_hours",       "6",     "float", "clustering", "Cross-cycle merge window (hours, 0=off)"),
+
+    # Enrichment/LLM
+    ("embeddings_enabled",          "true",  "bool",  "enrichment", "Enable Ollama embedding computation"),
+    ("ollama_base_url",             "http://192.168.0.200:11434", "str", "enrichment", "Ollama API base URL"),
+    ("ollama_embedding_model",      "nomic-embed-text", "str", "enrichment", "Ollama embedding model name"),
+    ("extract_provider",            "groq",  "str",   "enrichment", "LLM provider for extraction"),
+    ("extract_model",               "llama4-16e", "str", "enrichment", "LLM model for extraction"),
+    ("summary_provider",            "groq",  "str",   "enrichment", "LLM provider for summarisation"),
+    ("summary_model",               "llama4-16e", "str", "enrichment", "LLM model for summarisation"),
+    ("enrichment_max_per_refresh",  "0",     "int",   "enrichment", "Max enrichments per cycle (0=unlimited)"),
+    ("enrich_other_topics_only",    "false", "bool",  "enrichment", "Only enrich 'other' topic (legacy guard)"),
+
+    # Retention/pruning
+    ("pruning_enabled",             "true",  "bool",  "retention", "Enable periodic cluster pruning"),
+    ("retention_days",              "10",    "float", "retention", "Delete clusters older than N days"),
+    ("prune_interval_hours",        "12",    "float", "retention", "Run prune every N hours"),
+    ("refresh_interval_seconds",    "300",   "int",   "retention", "Polling cycle interval (seconds)"),
+    ("background_refresh_enabled",  "true",  "bool",  "retention", "Enable background polling"),
+    ("default_lookback_hours",      "24",    "float", "retention", "Default lookback for read queries"),
+]
+
+# ---------------------------------------------------------------------------
+#  .env override map: CONFIG_KEY → ENV_VAR_NAME
+#  When seeding, if the env var is set, its value overrides the default.
+# ---------------------------------------------------------------------------
+
+ENV_OVERRIDES: dict[str, str] = {
+    "title_threshold":             "NEWS_TITLE_THRESHOLD",
+    "jaccard_threshold":           "NEWS_JACCARD_THRESHOLD",
+    "embedding_similarity_threshold": "NEWS_EMBEDDING_SIMILARITY_THRESHOLD",
+    "cluster_max_age_hours":       "NEWS_CLUSTER_MAX_AGE_HOURS",
+    "embeddings_enabled":          "NEWS_EMBEDDINGS_ENABLED",
+    "ollama_base_url":             "OLLAMA_BASE_URL",
+    "ollama_embedding_model":      "OLLAMA_EMBEDDING_MODEL",
+    "extract_provider":            "NEWS_EXTRACT_PROVIDER",
+    "extract_model":               "NEWS_EXTRACT_MODEL",
+    "summary_provider":            "NEWS_SUMMARY_PROVIDER",
+    "summary_model":               "NEWS_SUMMARY_MODEL",
+    "enrichment_max_per_refresh":  "ENRICHMENT_MAX_PER_REFRESH",
+    "enrich_other_topics_only":    "ENRICH_OTHER_TOPICS_ONLY",
+    "pruning_enabled":             "NEWS_PRUNING_ENABLED",
+    "retention_days":              "NEWS_RETENTION_DAYS",
+    "prune_interval_hours":        "NEWS_PRUNE_INTERVAL_HOURS",
+    "refresh_interval_seconds":    "NEWS_REFRESH_INTERVAL_SECONDS",
+    "background_refresh_enabled":  "NEWS_BACKGROUND_REFRESH_ENABLED",
+    "default_lookback_hours":      "NEWS_DEFAULT_LOOKBACK_HOURS",
+}
+
+
+def _coerce(value: str, typ: str) -> Any:
+    """Convert a string value to its declared type."""
+    if typ == "float":
+        return float(value)
+    if typ == "int":
+        return int(value)
+    if typ == "bool":
+        return value.lower() in ("true", "1", "yes")
+    return value  # str
+
+
+def seed_site_config(conn) -> int:
+    """Create the site_config table and seed it if empty.
+
+    Returns the number of rows inserted (0 if already seeded).
+    """
+    conn.execute("""
+        CREATE TABLE IF NOT EXISTS site_config (
+            key         TEXT PRIMARY KEY,
+            value       TEXT NOT NULL,
+            type        TEXT NOT NULL DEFAULT 'str',
+            category    TEXT NOT NULL DEFAULT 'general',
+            description TEXT NOT NULL DEFAULT '',
+            source      TEXT NOT NULL DEFAULT 'default'
+        )
+    """)
+
+    count = conn.execute("SELECT count(*) FROM site_config").fetchone()[0]
+    if count > 0:
+        return 0
+
+    inserted = 0
+    for key, default, typ, category, description in CONFIG_DEFAULTS:
+        # Check .env override
+        env_var = ENV_OVERRIDES.get(key)
+        env_val = os.getenv(env_var) if env_var else None
+        if env_val is not None:
+            value = env_val
+            source = "env"
+        else:
+            value = default
+            source = "default"
+        conn.execute(
+            "INSERT INTO site_config(key, value, type, category, description, source) VALUES(?,?,?,?,?,?)",
+            (key, value, typ, category, description, source),
+        )
+        inserted += 1
+    return inserted
+
+
+def get_site_config(conn) -> list[dict]:
+    """Return all config rows as dicts."""
+    conn.execute("""
+        CREATE TABLE IF NOT EXISTS site_config (
+            key         TEXT PRIMARY KEY,
+            value       TEXT NOT NULL,
+            type        TEXT NOT NULL DEFAULT 'str',
+            category    TEXT NOT NULL DEFAULT 'general',
+            description TEXT NOT NULL DEFAULT '',
+            source      TEXT NOT NULL DEFAULT 'default'
+        )
+    """)
+    rows = conn.execute(
+        "SELECT key, value, type, category, description, source FROM site_config ORDER BY category, key"
+    ).fetchall()
+    return [
+        {"key": r[0], "value": r[1], "type": r[2], "category": r[3], "description": r[4], "source": r[5]}
+        for r in rows
+    ]
+
+
+def get_config_value(conn, key: str) -> str | None:
+    """Get a single config value by key. Returns None if not found."""
+    row = conn.execute("SELECT value FROM site_config WHERE key=?", (key,)).fetchone()
+    return row[0] if row else None
+
+
+def set_config_value(conn, key: str, value: str) -> bool:
+    """Update a single config value. Returns True if the key existed."""
+    cur = conn.execute("UPDATE site_config SET value=?, source='api' WHERE key=?", (value, key))
+    return cur.rowcount > 0

+ 10 - 14
news_mcp/storage/sqlite_store.py

@@ -6,9 +6,9 @@ from dataclasses import dataclass
 from datetime import datetime, timezone, timedelta
 from datetime import datetime, timezone, timedelta
 from pathlib import Path
 from pathlib import Path
 from typing import Any
 from typing import Any
-from urllib.parse import urlparse
 from email.utils import parsedate_to_datetime
 from email.utils import parsedate_to_datetime
 
 
+from news_mcp.article_identity import article_key
 from news_mcp.config import (
 from news_mcp.config import (
     NEWS_PRUNE_INTERVAL_HOURS,
     NEWS_PRUNE_INTERVAL_HOURS,
     NEWS_PRUNING_ENABLED,
     NEWS_PRUNING_ENABLED,
@@ -87,19 +87,8 @@ class ClusterRow:
 
 
 META_LAST_PRUNE_AT = "last_prune_at"
 META_LAST_PRUNE_AT = "last_prune_at"
 
 
-
-def _article_key(article: dict[str, Any]) -> str:
-    url = str(article.get("url") or "").strip()
-    if not url:
-        return str(article.get("title") or "")
-    try:
-        parsed = urlparse(url)
-        parts = [p for p in parsed.path.split("/") if p]
-        if parts:
-            return parts[-1]
-    except Exception:
-        pass
-    return url
+# For internal use — canonical name is article_key(article) from article_identity
+_article_key = article_key
 
 
 
 
 def _dedup_articles(articles: list[dict[str, Any]]) -> list[dict[str, Any]]:
 def _dedup_articles(articles: list[dict[str, Any]]) -> list[dict[str, Any]]:
@@ -329,6 +318,13 @@ class SQLiteClusterStore:
                 """
                 """
             )
             )
 
 
+            # Seed site_config from .env / defaults (no-op if already populated)
+            from news_mcp.site_config import seed_site_config
+            seeded = seed_site_config(conn)
+            if seeded:
+                import logging
+                logging.getLogger(__name__).info("site_config: seeded %d rows from env/defaults", seeded)
+
     def upsert_clusters(self, clusters: list[dict], topic: str) -> None:
     def upsert_clusters(self, clusters: list[dict], topic: str) -> None:
         now = datetime.now(timezone.utc)
         now = datetime.now(timezone.utc)
         with self._conn() as conn:
         with self._conn() as conn:

+ 5 - 14
scripts/backfill_seen_articles.py

@@ -11,21 +11,12 @@ import json
 import sqlite3
 import sqlite3
 import sys
 import sys
 from datetime import datetime, timezone
 from datetime import datetime, timezone
-from urllib.parse import urlparse
 
 
+# Add parent dir so we can import news_mcp when run as a standalone script
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 
 
-def _article_key(article: dict) -> str:
-    url = str(article.get("url") or "").strip()
-    if not url:
-        return str(article.get("title") or "")
-    try:
-        parsed = urlparse(url)
-        parts = [p for p in parsed.path.split("/") if p]
-        if parts:
-            return parts[-1]
-    except Exception:
-        pass
-    return url
+from news_mcp.article_identity import article_key
 
 
 
 
 def main(db_path: str = "./data/news.sqlite"):
 def main(db_path: str = "./data/news.sqlite"):
@@ -53,7 +44,7 @@ def main(db_path: str = "./data/news.sqlite"):
             skipped += 1
             skipped += 1
             continue
             continue
         for art in payload.get("articles", []):
         for art in payload.get("articles", []):
-            akey = _article_key(art)
+            akey = article_key(art)
             if not akey:
             if not akey:
                 continue
                 continue
             art_url = str(art.get("url") or "").strip()
             art_url = str(art.get("url") or "").strip()