| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167 |
- """DB-backed site configuration.
- All tunable parameters live in the `site_config` SQLite table.
- On startup, if the table is empty, it is seeded from .env overrides
- or Python defaults. After that, values are read from the DB — allowing
- runtime updates via the REST API without restart.
- Categories:
- clustering — similarity thresholds and merge criteria
- enrichment — LLM behavior, rate limits, embedding settings
- retention — pruning, age windows, refresh intervals
- """
- from __future__ import annotations
- import os
- from typing import Any
- # ---------------------------------------------------------------------------
- # Default registry — (key, default_value, type, category, description)
- # The type is one of: float, int, bool, str
- # ---------------------------------------------------------------------------
# Rows are written per category as (key, default_value, type, description);
# the comprehension re-attaches the category so every CONFIG_DEFAULTS entry is
# a (key, default_value, type, category, description) tuple, in the order
# listed here.
CONFIG_DEFAULTS: list[tuple[str, str, str, str, str]] = [
    (key, default, typ, category, description)
    for category, rows in (
        (
            # Similarity thresholds and merge criteria
            "clustering",
            (
                ("title_threshold", "0.75", "float", "Min title similarity to merge (SequenceMatcher)"),
                ("jaccard_threshold", "0.55", "float", "Min Jaccard token overlap to merge"),
                ("dual_title_floor", "0.55", "float", "Dual-signal: min title for title+jaccard merge"),
                ("dual_jaccard_floor", "0.25", "float", "Dual-signal: min jaccard for title+jaccard merge"),
                ("early_exit_title", "0.95", "float", "Early-exit title signal (both must be met)"),
                ("early_exit_jaccard", "0.80", "float", "Early-exit jaccard signal"),
                ("consensus_cosine_floor", "0.80", "float", "Consensus path: min cosine"),
                ("consensus_jaccard_floor", "0.30", "float", "Consensus path: min jaccard (or title)"),
                ("consensus_title_floor", "0.55", "float", "Consensus path: min title (or jaccard)"),
                ("crosstopic_title_threshold", "0.90", "float", "Cross-topic merge: min title similarity"),
                ("embedding_similarity_threshold", "0.885", "float", "Cosine threshold for embedding-only merge"),
                ("cluster_max_age_hours", "6", "float", "Cross-cycle merge window (hours, 0=off)"),
            ),
        ),
        (
            # LLM behavior, rate limits, embedding settings
            "enrichment",
            (
                ("embeddings_enabled", "true", "bool", "Enable Ollama embedding computation"),
                ("ollama_base_url", "http://192.168.0.200:11434", "str", "Ollama API base URL"),
                ("ollama_embedding_model", "nomic-embed-text", "str", "Ollama embedding model name"),
                ("extract_provider", "groq", "str", "LLM provider for extraction"),
                ("extract_model", "llama4-16e", "str", "LLM model for extraction"),
                ("summary_provider", "groq", "str", "LLM provider for summarisation"),
                ("summary_model", "llama4-16e", "str", "LLM model for summarisation"),
                ("enrichment_max_per_refresh", "0", "int", "Max enrichments per cycle (0=unlimited)"),
                ("enrich_other_topics_only", "false", "bool", "Only enrich 'other' topic (legacy guard)"),
            ),
        ),
        (
            # Pruning, age windows, refresh intervals
            "retention",
            (
                ("pruning_enabled", "true", "bool", "Enable periodic cluster pruning"),
                ("retention_days", "10", "float", "Delete clusters older than N days"),
                ("prune_interval_hours", "12", "float", "Run prune every N hours"),
                ("refresh_interval_seconds", "300", "int", "Polling cycle interval (seconds)"),
                ("background_refresh_enabled", "true", "bool", "Enable background polling"),
                ("default_lookback_hours", "24", "float", "Default lookback for read queries"),
            ),
        ),
    )
    for key, default, typ, description in rows
]
- # ---------------------------------------------------------------------------
- # .env override map: CONFIG_KEY → ENV_VAR_NAME
- # When seeding, if the env var is set, its value overrides the default.
- # ---------------------------------------------------------------------------
# Maps a config key to the environment variable that may override its default
# at seed time. Keys without an entry here can only take their Python default.
ENV_OVERRIDES: dict[str, str] = dict(
    # clustering
    title_threshold="NEWS_TITLE_THRESHOLD",
    jaccard_threshold="NEWS_JACCARD_THRESHOLD",
    embedding_similarity_threshold="NEWS_EMBEDDING_SIMILARITY_THRESHOLD",
    cluster_max_age_hours="NEWS_CLUSTER_MAX_AGE_HOURS",
    # enrichment
    embeddings_enabled="NEWS_EMBEDDINGS_ENABLED",
    ollama_base_url="OLLAMA_BASE_URL",
    ollama_embedding_model="OLLAMA_EMBEDDING_MODEL",
    extract_provider="NEWS_EXTRACT_PROVIDER",
    extract_model="NEWS_EXTRACT_MODEL",
    summary_provider="NEWS_SUMMARY_PROVIDER",
    summary_model="NEWS_SUMMARY_MODEL",
    enrichment_max_per_refresh="ENRICHMENT_MAX_PER_REFRESH",
    enrich_other_topics_only="ENRICH_OTHER_TOPICS_ONLY",
    # retention
    pruning_enabled="NEWS_PRUNING_ENABLED",
    retention_days="NEWS_RETENTION_DAYS",
    prune_interval_hours="NEWS_PRUNE_INTERVAL_HOURS",
    refresh_interval_seconds="NEWS_REFRESH_INTERVAL_SECONDS",
    background_refresh_enabled="NEWS_BACKGROUND_REFRESH_ENABLED",
    default_lookback_hours="NEWS_DEFAULT_LOOKBACK_HOURS",
)
- def _coerce(value: str, typ: str) -> Any:
- """Convert a string value to its declared type."""
- if typ == "float":
- return float(value)
- if typ == "int":
- return int(value)
- if typ == "bool":
- return value.lower() in ("true", "1", "yes")
- return value # str
def seed_site_config(conn) -> int:
    """Create the ``site_config`` table and insert any registry keys it lacks.

    Every entry of ``CONFIG_DEFAULTS`` whose key is not yet stored is inserted.
    Its value comes from the environment variable named in ``ENV_OVERRIDES``
    when that variable is set to a non-blank string (``source='env'``), and
    from the Python default otherwise (``source='default'``). Rows already in
    the table are never modified, so values changed at runtime are kept.

    Seeding per missing key, rather than only when the table is empty, means
    keys added to the registry later (or left out by an interrupted earlier
    seed) still reach an existing database.

    This function does not commit; that is left to the caller.

    Args:
        conn: Database connection exposing ``execute()``
            (presumably a ``sqlite3.Connection`` — confirm against callers).

    Returns:
        The number of rows inserted (0 if nothing was missing).
    """
    conn.execute("""
        CREATE TABLE IF NOT EXISTS site_config (
            key TEXT PRIMARY KEY,
            value TEXT NOT NULL,
            type TEXT NOT NULL DEFAULT 'str',
            category TEXT NOT NULL DEFAULT 'general',
            description TEXT NOT NULL DEFAULT '',
            source TEXT NOT NULL DEFAULT 'default'
        )
    """)

    # Look up the stored keys first: when nothing is missing, no INSERT is
    # issued at all and the table is only read.
    existing = {row[0] for row in conn.execute("SELECT key FROM site_config").fetchall()}

    inserted = 0
    for key, default, typ, category, description in CONFIG_DEFAULTS:
        if key in existing:
            continue

        env_var = ENV_OVERRIDES.get(key)
        env_val = os.getenv(env_var) if env_var else None
        # A blank override (e.g. "KEY=" in a .env file) is treated as unset;
        # storing "" would break later numeric coercion of the value.
        if env_val is not None and env_val.strip():
            value, source = env_val, "env"
        else:
            value, source = default, "default"

        conn.execute(
            "INSERT INTO site_config(key, value, type, category, description, source) VALUES(?,?,?,?,?,?)",
            (key, value, typ, category, description, source),
        )
        inserted += 1
    return inserted
def get_site_config(conn) -> list[dict]:
    """List every ``site_config`` row as a dict, ordered by category then key.

    The table is created first if it does not exist, so a fresh database
    yields an empty list. Each dict has the keys ``key``, ``value``, ``type``,
    ``category``, ``description`` and ``source``.
    """
    columns = ("key", "value", "type", "category", "description", "source")

    conn.execute("""
        CREATE TABLE IF NOT EXISTS site_config (
            key TEXT PRIMARY KEY,
            value TEXT NOT NULL,
            type TEXT NOT NULL DEFAULT 'str',
            category TEXT NOT NULL DEFAULT 'general',
            description TEXT NOT NULL DEFAULT '',
            source TEXT NOT NULL DEFAULT 'default'
        )
    """)

    cursor = conn.execute(
        "SELECT key, value, type, category, description, source FROM site_config ORDER BY category, key"
    )
    # The SELECT column order matches `columns`, so zipping pairs each name
    # with its value.
    return [dict(zip(columns, record)) for record in cursor.fetchall()]
def get_config_value(conn, key: str) -> str | None:
    """Look up the stored value for ``key``; ``None`` when no such row exists."""
    cursor = conn.execute("SELECT value FROM site_config WHERE key=?", (key,))
    match = cursor.fetchone()
    if not match:
        return None
    return match[0]
def set_config_value(conn, key: str, value: str) -> bool:
    """Store ``value`` for an existing ``key`` and mark the row's source as ``api``.

    No row is created for an unknown key. Returns True when a row was
    updated, False when the key was not present.
    """
    statement = "UPDATE site_config SET value=?, source='api' WHERE key=?"
    updated_rows = conn.execute(statement, (value, key)).rowcount
    return updated_rows > 0
|