# groq_enrich.py
from __future__ import annotations

import json
import logging
import math
from typing import Any, Dict, List

import httpx

from news_mcp.config import GROQ_API_KEY, GROQ_MODEL, GROQ_DEBUG

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# System message sent with every classification request; it tells the model
# to answer with JSON only.
_SYSTEM = "You are a news signal extraction engine. Return STRICT JSON only."
  9. def _build_prompt(articles: List[Dict[str, Any]], headline: str, summary: str | None) -> str:
  10. # Keep prompt compact: clusters already deduped.
  11. sample = articles[:6]
  12. return json.dumps(
  13. {
  14. "cluster": {
  15. "headline": headline,
  16. "summary": summary or "",
  17. "articles": [
  18. {
  19. "title": a.get("title"),
  20. "url": a.get("url"),
  21. "source": a.get("source"),
  22. "timestamp": a.get("timestamp"),
  23. "summary": a.get("summary", ""),
  24. }
  25. for a in sample
  26. ],
  27. }
  28. },
  29. ensure_ascii=False,
  30. )
  31. async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
  32. if not GROQ_API_KEY:
  33. # No enrichment configured.
  34. return cluster
  35. headline = cluster.get("headline", "")
  36. summary = cluster.get("summary", "")
  37. articles = cluster.get("articles", [])
  38. user_payload = _build_prompt(articles=articles, headline=headline, summary=summary)
  39. prompt = (
  40. f"Input cluster JSON:\n{user_payload}\n\n"
  41. "You MUST extract a news signal from the headline AND summary. Do not leave entities empty when the text mentions obvious names.\n"
  42. "Task:\n"
  43. "1) infer the best top-level topic\n"
  44. "2) extract concise entities from the cluster\n"
  45. "3) assign sentiment from the wording/context\n"
  46. "4) provide short keywords that justify the classification\n\n"
  47. "Entity rules (strict):\n"
  48. "- Use short strings (1-5 words).\n"
  49. "- Include all obvious named entities mentioned in headline or summary: people, countries, regions, organizations, ministries, presidents, leaders, wars/conflicts if named.\n"
  50. "- Also include finance/crypto entities when present: BTC, ETH, Bitcoin, Ethereum, ETF, SEC, ECB, Fed, euro, inflation, rates.\n"
  51. "- If the cluster mentions Iran, UAE, Egypt, Germany, Europe, Trump, Merz, Sisi, those should appear in entities.\n"
  52. "- Do NOT return empty entities if any such names/places appear.\n\n"
  53. "Sentiment rules:\n"
  54. "- positive: clearly encouraging, improving, or supportive tone\n"
  55. "- negative: clearly alarming, worsening, severe, conflict, loss, risk, warning tone\n"
  56. "- neutral: factual, balanced, or mixed\n"
  57. "- sentimentScore must be a number from -1.0 to 1.0 and should reflect the sentiment label.\n\n"
  58. "Return STRICT JSON with EXACT keys only:\n"
  59. "{ topic, entities, sentiment, sentimentScore, keywords }\n"
  60. "where topic is one of [crypto, macro, regulation, ai, other].\n"
  61. )
  62. if GROQ_DEBUG:
  63. msg = f"[GROQ PROMPT] {prompt}"
  64. logger.warning(msg)
  65. print(msg, flush=True)
  66. req = {
  67. "model": GROQ_MODEL,
  68. "messages": [
  69. {"role": "system", "content": _SYSTEM},
  70. {"role": "user", "content": prompt},
  71. ],
  72. "temperature": 0.2,
  73. "response_format": {"type": "json_object"},
  74. }
  75. async with httpx.AsyncClient(timeout=30.0) as client:
  76. resp = await client.post(
  77. "https://api.groq.com/openai/v1/chat/completions",
  78. headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
  79. json=req,
  80. )
  81. resp.raise_for_status()
  82. data = resp.json()
  83. content = data["choices"][0]["message"]["content"]
  84. if GROQ_DEBUG:
  85. msg = f"[GROQ RAW RESPONSE] {content}"
  86. logger.warning(msg)
  87. print(msg, flush=True)
  88. parsed = json.loads(content)
  89. # Normalize output types into our cluster shape.
  90. topic = parsed.get("topic") or cluster.get("topic")
  91. entities = parsed.get("entities") or []
  92. sentiment = parsed.get("sentiment") or "neutral"
  93. sentiment_score = parsed.get("sentimentScore")
  94. keywords = parsed.get("keywords") or []
  95. out = dict(cluster)
  96. if topic:
  97. out["topic"] = topic
  98. out["entities"] = entities
  99. out["sentiment"] = sentiment
  100. if sentiment_score is not None:
  101. out["sentimentScore"] = float(sentiment_score)
  102. out["keywords"] = keywords
  103. return out
  104. async def summarize_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
  105. """Produce a compact agent-facing summary.
  106. Returns:
  107. {
  108. "headline": str,
  109. "mergedSummary": str,
  110. "keyFacts": [str,...],
  111. "sources": [str,...]
  112. }
  113. """
  114. if not GROQ_API_KEY:
  115. return {
  116. "headline": cluster.get("headline"),
  117. "mergedSummary": cluster.get("summary"),
  118. "keyFacts": [],
  119. "sources": cluster.get("sources", []),
  120. }
  121. headline = cluster.get("headline", "")
  122. summary = cluster.get("summary", "")
  123. articles = cluster.get("articles", [])
  124. sample = articles[:5]
  125. req = {
  126. "model": GROQ_MODEL,
  127. "messages": [
  128. {
  129. "role": "system",
  130. "content": "You are a summarization engine for news clusters. Return strict JSON only.",
  131. },
  132. {
  133. "role": "user",
  134. "content": json.dumps(
  135. {
  136. "headline": headline,
  137. "summary": summary,
  138. "articles": [
  139. {
  140. "title": a.get("title"),
  141. "url": a.get("url"),
  142. "source": a.get("source"),
  143. "timestamp": a.get("timestamp"),
  144. }
  145. for a in sample
  146. ],
  147. },
  148. ensure_ascii=False,
  149. )
  150. + "\n\nReturn keys: headline, mergedSummary, keyFacts (5-8 strings), sources. mergedSummary should be 2-4 sentences.",
  151. },
  152. ],
  153. "temperature": 0.2,
  154. "response_format": {"type": "json_object"},
  155. }
  156. async with httpx.AsyncClient(timeout=45.0) as client:
  157. resp = await client.post(
  158. "https://api.groq.com/openai/v1/chat/completions",
  159. headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
  160. json=req,
  161. )
  162. resp.raise_for_status()
  163. data = resp.json()
  164. content = data["choices"][0]["message"]["content"]
  165. parsed = json.loads(content)
  166. return parsed