فهرست منبع

config: add embedding similarity threshold

Lukas Goldschmidt 1 ماه پیش
والد
کامیت
469003b99f
5 فایلهای تغییر یافته به همراه 6 افزوده شده و 2 حذف شده
  1. 1 0
      .env.example
  2. 1 0
      PROJECT.md
  3. 1 0
      README.md
  4. 1 0
      news_mcp/config.py
  5. 2 2
      news_mcp/dedup/cluster.py

+ 1 - 0
.env.example

@@ -20,6 +20,7 @@ GROQ_MAX_CLUSTERS_PER_REFRESH=20
 NEWS_EMBEDDINGS_ENABLED=false
 OLLAMA_BASE_URL=http://127.0.0.1:11434
 OLLAMA_EMBEDDING_MODEL=nomic-embed-text
+NEWS_EMBEDDING_SIMILARITY_THRESHOLD=0.885
 
 # Feeds
 NEWS_FEED_URL=https://breakingthenews.net/news-feed.xml

+ 1 - 0
PROJECT.md

@@ -9,6 +9,7 @@ Provide a signal-extraction MCP server that converts RSS into **deduplicated, en
 - RSS fetch (breakingthenews.net)
 - v1 dedup via fuzzy title similarity
 - optional Ollama embeddings path for clustering (when `NEWS_EMBEDDINGS_ENABLED=true`)
+- configurable embedding similarity threshold (`NEWS_EMBEDDING_SIMILARITY_THRESHOLD`)
 - optional embeddings backfill script for precomputing cluster vectors in SQLite
 - optional merge-analysis script for threshold experiments before any DB rewrite
 - optional merge pass for destructive consolidation after threshold review

+ 1 - 0
README.md

@@ -75,6 +75,7 @@ Key variables:
 - `NEWS_EMBEDDINGS_ENABLED` (default false; enables Ollama embeddings for clustering when wired in)
 - `OLLAMA_BASE_URL` / `OLLAMA_URL` (default `http://127.0.0.1:11434`)
 - `OLLAMA_EMBEDDING_MODEL` (default `nomic-embed-text`)
+- `NEWS_EMBEDDING_SIMILARITY_THRESHOLD` (default `0.885`; used when embeddings are enabled)
 
 When embeddings are enabled, news-mcp tries Ollama first and falls back to the existing heuristic clustering path if Ollama is unavailable.
 

+ 1 - 0
news_mcp/config.py

@@ -37,6 +37,7 @@ GROQ_MAX_CLUSTERS_PER_REFRESH = int(os.getenv("GROQ_MAX_CLUSTERS_PER_REFRESH", "
 NEWS_EMBEDDINGS_ENABLED = os.getenv("NEWS_EMBEDDINGS_ENABLED", "false").lower() == "true"
 OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", os.getenv("OLLAMA_URL", "http://127.0.0.1:11434"))
 OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
+NEWS_EMBEDDING_SIMILARITY_THRESHOLD = float(os.getenv("NEWS_EMBEDDING_SIMILARITY_THRESHOLD", "0.885"))
 
 NEWS_REFRESH_INTERVAL_SECONDS = int(os.getenv("NEWS_REFRESH_INTERVAL_SECONDS", "900"))
 NEWS_BACKGROUND_REFRESH_ENABLED = os.getenv("NEWS_BACKGROUND_REFRESH_ENABLED", "true").lower() == "true"

+ 2 - 2
news_mcp/dedup/cluster.py

@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Tuple
 
 from news_mcp.sources.news_feeds import normalize_topic_from_title
 from news_mcp.dedup.embedding_support import CandidateRules, cluster_is_candidate, cosine_similarity, ollama_embed
-from news_mcp.config import NEWS_EMBEDDINGS_ENABLED
+from news_mcp.config import NEWS_EMBEDDINGS_ENABLED, NEWS_EMBEDDING_SIMILARITY_THRESHOLD
 
 import re
 from difflib import SequenceMatcher
@@ -80,7 +80,7 @@ def dedup_and_cluster_articles(
 
         threshold = similarity_threshold
         if NEWS_EMBEDDINGS_ENABLED:
-            threshold = max(similarity_threshold, 0.82)
+            threshold = max(similarity_threshold, NEWS_EMBEDDING_SIMILARITY_THRESHOLD)
 
         if best_idx is not None and best_sim >= threshold:
             c = clusters[best_idx]