# build_annotated_set.py
  1. import json
  2. from pathlib import Path
  3. # 30 diverse clusters selected from live data, manually annotated
  4. ANNOTATED_SAMPLES = [
  5. # === REGULATION (6) ===
  6. {
  7. "name": "sec_binance_lawsuit",
  8. "cluster": {
  9. "headline": "SEC sues Binance over unregistered securities",
  10. "summary": "The Securities and Exchange Commission filed a lawsuit against Binance, the world's largest crypto exchange, alleging it operated as an unregistered securities exchange and commingled customer funds."
  11. },
  12. "expected": {
  13. "entities": ["SEC", "Binance"],
  14. "keywords": ["securities law", "crypto exchange", "enforcement action"],
  15. "topic": "regulation"
  16. }
  17. },
  18. {
  19. "name": "iran_frozen_funds_talks",
  20. "cluster": {
  21. "headline": "Khamenei aide says $24B in frozen funds blocking talks",
  22. "summary": "A senior aide to Iran's Supreme Leader said $24 billion in frozen Iranian funds are blocking progress in indirect talks with the United States."
  23. },
  24. "expected": {
  25. "entities": ["Iran", "United States", "Khamenei", "Mohsen Rezaei"],
  26. "keywords": ["frozen funds", "peace talks", "Iran sanctions"],
  27. "topic": "regulation"
  28. }
  29. },
  30. {
  31. "name": "us_iran_sanctions",
  32. "cluster": {
  33. "headline": "US issues new Iran-linked sanctions",
  34. "summary": "The US Treasury Department imposed new sanctions targeting Iranian oil and petrochemical networks."
  35. },
  36. "expected": {
  37. "entities": ["United States", "Iran", "Department of the Treasury", "OFAC"],
  38. "keywords": ["sanctions", "oil network", "petrochemical"],
  39. "topic": "regulation"
  40. }
  41. },
  42. {
  43. "name": "house_crypto_bills",
  44. "cluster": {
  45. "headline": "U.S. House tax committee weighs crypto bills, including relief for small transactions",
  46. "summary": "The House Ways and Means Committee discussed legislation providing tax relief for small crypto transactions and clarifying digital asset rules."
  47. },
  48. "expected": {
  49. "entities": ["U.S. House tax committee", "Bitcoin", "Ethereum", "SEC"],
  50. "keywords": ["crypto bills", "tax relief", "small transactions"],
  51. "topic": "regulation"
  52. }
  53. },
  54. {
  55. "name": "wamco_sec_settlement",
  56. "cluster": {
  57. "headline": "Wamco to Pay $100 Million in SEC Settlement Over Leech Trades",
  58. "summary": "Western Asset Management agreed to pay $100 million to settle SEC charges over improper trading by former portfolio manager Ken Leech."
  59. },
  60. "expected": {
  61. "entities": ["Western Asset Management Co.", "Securities and Exchange Commission", "Ken Leech"],
  62. "keywords": ["SEC settlement", "trading practices", "portfolio manager"],
  63. "topic": "regulation"
  64. }
  65. },
  66. {
  67. "name": "us_cuba_sanctions",
  68. "cluster": {
  69. "headline": "US imposes sanctions on Cuban president, Castro family members",
  70. "summary": "The United States sanctioned Cuban President Miguel Diaz-Canel and members of the Castro family over human rights abuses."
  71. },
  72. "expected": {
  73. "entities": ["Cuba", "Cuban president", "Castro family", "Raul Castro", "United States"],
  74. "keywords": ["US sanctions", "Cuba tensions", "human rights"],
  75. "topic": "regulation"
  76. }
  77. },
  78. # === MACRO (7) ===
  79. {
  80. "name": "fed_rates_inflation",
  81. "cluster": {
  82. "headline": "Fed holds rates steady as inflation cools",
  83. "summary": "The Federal Reserve kept interest rates unchanged at 5.25-5.50%, citing progress on inflation but signaling caution on future cuts."
  84. },
  85. "expected": {
  86. "entities": ["Federal Reserve"],
  87. "keywords": ["interest rates", "inflation", "monetary policy"],
  88. "topic": "macro"
  89. }
  90. },
  91. {
  92. "name": "ecb_rate_cut",
  93. "cluster": {
  94. "headline": "ECB cuts rates as eurozone inflation falls to 2.4%",
  95. "summary": "The European Central Bank lowered its deposit rate by 25 basis points to 3.75%, marking its first cut since 2019 as inflation approaches target."
  96. },
  97. "expected": {
  98. "entities": ["European Central Bank", "ECB"],
  99. "keywords": ["rate cut", "eurozone inflation", "deposit rate", "monetary easing"],
  100. "topic": "macro"
  101. }
  102. },
  103. {
  104. "name": "china_stimulus",
  105. "cluster": {
  106. "headline": "China unveils stimulus package to boost slowing economy",
  107. "summary": "Beijing announced a comprehensive stimulus package including infrastructure spending, tax cuts, and monetary easing to counter slowing growth and property sector weakness."
  108. },
  109. "expected": {
  110. "entities": ["China", "Beijing"],
  111. "keywords": ["stimulus package", "infrastructure spending", "monetary easing", "property sector"],
  112. "topic": "macro"
  113. }
  114. },
  115. {
  116. "name": "oil_opep_cuts",
  117. "cluster": {
  118. "headline": "Oil jumps after OPEC+ extends production cuts",
  119. "summary": "Crude oil prices rose 3% after OPEC+ agreed to extend production cuts through year-end, tightening global supply amid demand concerns."
  120. },
  121. "expected": {
  122. "entities": ["OPEC+"],
  123. "keywords": ["production cuts", "oil prices", "global supply", "demand concerns"],
  124. "topic": "macro"
  125. }
  126. },
  127. {
  128. "name": "india_forex_reserves",
  129. "cluster": {
  130. "headline": "India's Forex Reserves Hit $682.32 Billion as RBI Tightens Its Economic Grip",
  131. "summary": "India's foreign exchange reserves reached a record high as the Reserve Bank of India maintains tight monetary policy."
  132. },
  133. "expected": {
  134. "entities": ["India", "RBI"],
  135. "keywords": ["forex reserves", "monetary policy", "economic grip"],
  136. "topic": "macro"
  137. }
  138. },
  139. {
  140. "name": "india_us_trade_pact",
  141. "cluster": {
  142. "headline": "India, US May Execute Interim Trade Pact by July, Minister Says",
  143. "summary": "Commerce Minister Piyush Goyal said India and the US could finalize an interim trade agreement by July, addressing tariffs and market access."
  144. },
  145. "expected": {
  146. "entities": ["India", "US", "Piyush Goyal"],
  147. "keywords": ["trade deal", "tariffs", "market access"],
  148. "topic": "macro"
  149. }
  150. },
  151. {
  152. "name": "jobs_report_fed_bets",
  153. "cluster": {
  154. "headline": "Investors boost bets for Fed rate rise after bumper US jobs report",
  155. "summary": "Strong US payrolls data led traders to increase wagers on Federal Reserve interest rate hikes."
  156. },
  157. "expected": {
  158. "entities": ["US", "Federal Reserve"],
  159. "keywords": ["jobs report", "rate hike", "rate rise"],
  160. "topic": "macro"
  161. }
  162. },
  163. # === CRYPTO (6) ===
  164. {
  165. "name": "bitcoin_etf_flows",
  166. "cluster": {
  167. "headline": "Bitcoin ETFs see record inflows as BTC tops $70k",
  168. "summary": "US spot Bitcoin ETFs attracted $2.3 billion in net inflows this week as Bitcoin surged past $70,000, driven by institutional demand."
  169. },
  170. "expected": {
  171. "entities": ["Bitcoin", "BTC"],
  172. "keywords": ["ETF inflows", "institutional demand", "price surge"],
  173. "topic": "crypto"
  174. }
  175. },
  176. {
  177. "name": "memecoins_dive",
  178. "cluster": {
  179. "headline": "Memecoins dogecoin, shiba inu dive 9% as bitcoin nears $60,000",
  180. "summary": "Dogecoin and Shiba Inu led memecoin losses as Bitcoin approached the $60,000 level."
  181. },
  182. "expected": {
  183. "entities": ["dogecoin", "shiba inu", "Bitcoin", "memecoins"],
  184. "keywords": ["crypto crash", "memecoins dive", "price drop"],
  185. "topic": "crypto"
  186. }
  187. },
  188. {
  189. "name": "bitcoin_seller_exhaustion",
  190. "cluster": {
  191. "headline": "Bitcoin teases 'seller exhaustion' as BTC price downside reaches $60.3K",
  192. "summary": "Technical analysts note signs of seller exhaustion in Bitcoin as the cryptocurrency tests support near $60,300."
  193. },
  194. "expected": {
  195. "entities": ["Bitcoin", "BTC"],
  196. "keywords": ["seller exhaustion", "price support", "technical analysis"],
  197. "topic": "crypto"
  198. }
  199. },
  200. {
  201. "name": "xrp_liquidation_selloff",
  202. "cluster": {
  203. "headline": "XRP falls toward $1.10 as liquidation-driven selloff pushes token to multi-month lows",
  204. "summary": "XRP dropped sharply as leveraged positions were liquidated, pushing the token to its lowest level in months."
  205. },
  206. "expected": {
  207. "entities": ["XRP", "Bitcoin", "Ethereum"],
  208. "keywords": ["liquidation", "selloff", "price crash"],
  209. "topic": "crypto"
  210. }
  211. },
  212. {
  213. "name": "visa_stablecoin_test",
  214. "cluster": {
  215. "headline": "Visa tests private stablecoin settlement with Brale, Canton",
  216. "summary": "Visa is piloting private stablecoin settlement using Brale and Canton networks for institutional payments."
  217. },
  218. "expected": {
  219. "entities": ["Visa", "Brale", "Canton"],
  220. "keywords": ["stablecoin settlement", "private network", "institutional payments"],
  221. "topic": "crypto"
  222. }
  223. },
  224. {
  225. "name": "prediction_markets_kalshi",
  226. "cluster": {
  227. "headline": "Prediction Markets Hit $29.4 Billion in May as Kalshi Leads and Brokers Pile In",
  228. "summary": "Prediction market volume surged to record levels with Kalshi leading the growth as traditional brokers enter the space."
  229. },
  230. "expected": {
  231. "entities": ["Kalshi", "Prediction Markets", "Bitcoin", "Ethereum", "SEC"],
  232. "keywords": ["prediction markets", "trading volume", "broker adoption"],
  233. "topic": "crypto"
  234. }
  235. },
  236. # === AI (5) ===
  237. {
  238. "name": "nvidia_earnings_ai",
  239. "cluster": {
  240. "headline": "Nvidia beats earnings on AI chip demand",
  241. "summary": "Nvidia reported quarterly revenue of $26 billion, up 262% year-over-year, driven by insatiable demand for its H100 and Blackwell AI chips."
  242. },
  243. "expected": {
  244. "entities": ["Nvidia", "H100", "Blackwell"],
  245. "keywords": ["AI chips", "earnings beat", "revenue growth", "chip demand"],
  246. "topic": "ai"
  247. }
  248. },
  249. {
  250. "name": "anthropic_ai_pause",
  251. "cluster": {
  252. "headline": "Anthropic calls for pause of global AI development",
  253. "summary": "AI safety company Anthropic urged a coordinated pause on advanced AI development to establish safety standards."
  254. },
  255. "expected": {
  256. "entities": ["Anthropic", "Claude"],
  257. "keywords": ["AI development", "global pause", "AI safety"],
  258. "topic": "ai"
  259. }
  260. },
  261. {
  262. "name": "microsoft_ai_products",
  263. "cluster": {
  264. "headline": "Has Microsoft Lost Its Mojo (Again)?",
  265. "summary": "Analysts question whether Microsoft's AI product strategy is falling behind competitors despite massive investment."
  266. },
  267. "expected": {
  268. "entities": ["Microsoft", "Scott Hanselman", "Github"],
  269. "keywords": ["AI products", "catch-up mode", "competition"],
  270. "topic": "ai"
  271. }
  272. },
  273. {
  274. "name": "ai_bubble_debate",
  275. "cluster": {
  276. "headline": "`There Is No AI Bubble,' Says BI's Rob Schiffman",
  277. "summary": "Business Insider's Rob Schiffman argues current AI investment levels are justified by real revenue growth, not speculation."
  278. },
  279. "expected": {
  280. "entities": ["Robert Schiffman", "New York", "Business Insider"],
  281. "keywords": ["AI bubble", "investment thesis", "revenue growth"],
  282. "topic": "ai"
  283. }
  284. },
  285. {
  286. "name": "morgan_stanley_ai_funding",
  287. "cluster": {
  288. "headline": "Morgan Stanley Sees AI-Related Funding Expanding to 15% of All Credit Deals",
  289. "summary": "Morgan Stanley reports AI-related financing now represents 15% of credit deals, up from near zero two years ago."
  290. },
  291. "expected": {
  292. "entities": ["Morgan Stanley", "Diameter Capital Partners", "Scott Goodwin"],
  293. "keywords": ["AI funding", "credit deals", "financing growth"],
  294. "topic": "ai"
  295. }
  296. },
  297. # === OTHER (6) ===
  298. {
  299. "name": "israel_iran_strikes",
  300. "cluster": {
  301. "headline": "Israel strikes Iranian missile sites in Syria",
  302. "summary": "Israeli warplanes targeted Iranian missile depots near Damascus overnight, escalating regional tensions."
  303. },
  304. "expected": {
  305. "entities": ["Israel", "Iran", "Syria", "Damascus"],
  306. "keywords": ["airstrikes", "missile sites", "regional escalation"],
  307. "topic": "other"
  308. }
  309. },
  310. {
  311. "name": "trump_intel_firings",
  312. "cluster": {
  313. "headline": "Trump orders Pulte to start mass firings at intel agencies",
  314. "summary": "President Trump directed Bill Pulte to begin mass firings across US intelligence agencies."
  315. },
  316. "expected": {
  317. "entities": ["Donald Trump", "Bill Pulte", "United States"],
  318. "keywords": ["mass firings", "intelligence agencies", "government restructuring"],
  319. "topic": "other"
  320. }
  321. },
  322. {
  323. "name": "boeing_737_max",
  324. "cluster": {
  325. "headline": "Boeing to launch 737 Max production in July",
  326. "summary": "Boeing plans to restart 737 Max production in July under new CEO Kelly Ortberg."
  327. },
  328. "expected": {
  329. "entities": ["Boeing", "Kelly Ortberg", "Seattle", "Everett", "737 Max"],
  330. "keywords": ["aircraft manufacturing", "production restart", "737 Max"],
  331. "topic": "other"
  332. }
  333. },
  334. {
  335. "name": "putin_trump_peer",
  336. "cluster": {
  337. "headline": "Putin says he treats Trump as 'peer, with respect'",
  338. "summary": "Vladimir Putin described his relationship with Donald Trump as one of mutual respect between peers."
  339. },
  340. "expected": {
  341. "entities": ["Vladimir Putin", "Donald Trump", "Ukraine", "St. Petersburg"],
  342. "keywords": ["Ukraine war", "diplomatic relations", "peer respect"],
  343. "topic": "other"
  344. }
  345. },
  346. {
  347. "name": "paris_bridge_history",
  348. "cluster": {
  349. "headline": "Why is Paris's oldest bridge called the 'New Bridge'?",
  350. "summary": "The history of Paris's Pont Neuf, which despite its name is the city's oldest standing bridge."
  351. },
  352. "expected": {
  353. "entities": ["Paris", "Pont Neuf", "Louis Vuitton", "Tanishk Saha"],
  354. "keywords": ["Paris bridge", "bridge history", "artistic installations"],
  355. "topic": "other"
  356. }
  357. },
  358. {
  359. "name": "ukraine_drone_attack",
  360. "cluster": {
  361. "headline": "Ukraine under heavy drone attack as Zelensky seeks direct meeting with Putin",
  362. "summary": "Russia launched a massive drone barrage on Ukraine as President Zelensky pushes for direct talks with Putin."
  363. },
  364. "expected": {
  365. "entities": ["Ukraine", "Russia", "Volodymyr Zelensky", "Vladimir Putin", "Moscow", "Kyiv"],
  366. "keywords": ["drone strikes", "conflict escalation", "peace talks"],
  367. "topic": "other"
  368. }
  369. },
  370. ]
  371. # Write to JSON file
  372. output_path = Path(__file__).parent.parent / "data" / "annotated_samples.json"
  373. output_path.write_text(json.dumps(ANNOTATED_SAMPLES, indent=2, ensure_ascii=False))
  374. print(f"Wrote {len(ANNOTATED_SAMPLES)} annotated samples to {output_path}")
  375. # Print distribution
  376. from collections import Counter
  377. topics = Counter(s["expected"]["topic"] for s in ANNOTATED_SAMPLES)
  378. print("\nTopic distribution:")
  379. for t, c in sorted(topics.items()):
  380. print(f" {t}: {c}")