"""Generate the annotated-samples JSON fixture.

``ANNOTATED_SAMPLES`` holds 30 news clusters selected from live data and
manually annotated.  Every sample is a dict with three keys:

* ``name``     - identifier of the sample,
* ``cluster``  - the input, a ``headline`` and a ``summary`` string,
* ``expected`` - the annotation: ``entities`` (list of str), ``keywords``
  (list of str) and ``topic`` (one of ``regulation``, ``macro``, ``crypto``,
  ``ai``, ``other``).

Running this file as a script writes the list to ``OUTPUT_PATH`` as JSON and
prints how many samples fall under each topic.  Importing it only defines the
data; no file is written.
"""

import json
from collections import Counter
from pathlib import Path

# 30 diverse clusters selected from live data, manually annotated.
#
# NOTE(review): several ``expected["entities"]`` lists name entities that do
# not occur in the headline/summary shown here (e.g. "OFAC", "Claude",
# "Louis Vuitton").  Presumably they were annotated from fuller article text;
# confirm before treating them as extractable from ``cluster`` alone.
ANNOTATED_SAMPLES = [
    # === REGULATION (6) ===
    {
        "name": "sec_binance_lawsuit",
        "cluster": {
            "headline": "SEC sues Binance over unregistered securities",
            "summary": "The Securities and Exchange Commission filed a lawsuit against Binance, the world's largest crypto exchange, alleging it operated as an unregistered securities exchange and commingled customer funds.",
        },
        "expected": {
            "entities": ["SEC", "Binance"],
            "keywords": ["securities law", "crypto exchange", "enforcement action"],
            "topic": "regulation",
        },
    },
    {
        "name": "iran_frozen_funds_talks",
        "cluster": {
            "headline": "Khamenei aide says $24B in frozen funds blocking talks",
            "summary": "A senior aide to Iran's Supreme Leader said $24 billion in frozen Iranian funds are blocking progress in indirect talks with the United States.",
        },
        "expected": {
            "entities": ["Iran", "United States", "Khamenei", "Mohsen Rezaei"],
            "keywords": ["frozen funds", "peace talks", "Iran sanctions"],
            "topic": "regulation",
        },
    },
    {
        "name": "us_iran_sanctions",
        "cluster": {
            "headline": "US issues new Iran-linked sanctions",
            "summary": "The US Treasury Department imposed new sanctions targeting Iranian oil and petrochemical networks.",
        },
        "expected": {
            "entities": ["United States", "Iran", "Department of the Treasury", "OFAC"],
            "keywords": ["sanctions", "oil network", "petrochemical"],
            "topic": "regulation",
        },
    },
    {
        "name": "house_crypto_bills",
        "cluster": {
            "headline": "U.S. House tax committee weighs crypto bills, including relief for small transactions",
            "summary": "The House Ways and Means Committee discussed legislation providing tax relief for small crypto transactions and clarifying digital asset rules.",
        },
        "expected": {
            "entities": ["U.S. House tax committee", "Bitcoin", "Ethereum", "SEC"],
            "keywords": ["crypto bills", "tax relief", "small transactions"],
            "topic": "regulation",
        },
    },
    {
        "name": "wamco_sec_settlement",
        "cluster": {
            "headline": "Wamco to Pay $100 Million in SEC Settlement Over Leech Trades",
            "summary": "Western Asset Management agreed to pay $100 million to settle SEC charges over improper trading by former portfolio manager Ken Leech.",
        },
        "expected": {
            "entities": ["Western Asset Management Co.", "Securities and Exchange Commission", "Ken Leech"],
            "keywords": ["SEC settlement", "trading practices", "portfolio manager"],
            "topic": "regulation",
        },
    },
    {
        "name": "us_cuba_sanctions",
        "cluster": {
            "headline": "US imposes sanctions on Cuban president, Castro family members",
            "summary": "The United States sanctioned Cuban President Miguel Diaz-Canel and members of the Castro family over human rights abuses.",
        },
        "expected": {
            "entities": ["Cuba", "Cuban president", "Castro family", "Raul Castro", "United States"],
            "keywords": ["US sanctions", "Cuba tensions", "human rights"],
            "topic": "regulation",
        },
    },
    # === MACRO (7) ===
    {
        "name": "fed_rates_inflation",
        "cluster": {
            "headline": "Fed holds rates steady as inflation cools",
            "summary": "The Federal Reserve kept interest rates unchanged at 5.25-5.50%, citing progress on inflation but signaling caution on future cuts.",
        },
        "expected": {
            "entities": ["Federal Reserve"],
            "keywords": ["interest rates", "inflation", "monetary policy"],
            "topic": "macro",
        },
    },
    {
        "name": "ecb_rate_cut",
        "cluster": {
            "headline": "ECB cuts rates as eurozone inflation falls to 2.4%",
            "summary": "The European Central Bank lowered its deposit rate by 25 basis points to 3.75%, marking its first cut since 2019 as inflation approaches target.",
        },
        "expected": {
            "entities": ["European Central Bank", "ECB"],
            "keywords": ["rate cut", "eurozone inflation", "deposit rate", "monetary easing"],
            "topic": "macro",
        },
    },
    {
        "name": "china_stimulus",
        "cluster": {
            "headline": "China unveils stimulus package to boost slowing economy",
            "summary": "Beijing announced a comprehensive stimulus package including infrastructure spending, tax cuts, and monetary easing to counter slowing growth and property sector weakness.",
        },
        "expected": {
            "entities": ["China", "Beijing"],
            "keywords": ["stimulus package", "infrastructure spending", "monetary easing", "property sector"],
            "topic": "macro",
        },
    },
    {
        # NOTE(review): "opep" looks like a typo for "opec"; left unchanged
        # because the name may be used as an identifier elsewhere - confirm.
        "name": "oil_opep_cuts",
        "cluster": {
            "headline": "Oil jumps after OPEC+ extends production cuts",
            "summary": "Crude oil prices rose 3% after OPEC+ agreed to extend production cuts through year-end, tightening global supply amid demand concerns.",
        },
        "expected": {
            "entities": ["OPEC+"],
            "keywords": ["production cuts", "oil prices", "global supply", "demand concerns"],
            "topic": "macro",
        },
    },
    {
        "name": "india_forex_reserves",
        "cluster": {
            "headline": "India's Forex Reserves Hit $682.32 Billion as RBI Tightens Its Economic Grip",
            "summary": "India's foreign exchange reserves reached a record high as the Reserve Bank of India maintains tight monetary policy.",
        },
        "expected": {
            "entities": ["India", "RBI"],
            "keywords": ["forex reserves", "monetary policy", "economic grip"],
            "topic": "macro",
        },
    },
    {
        "name": "india_us_trade_pact",
        "cluster": {
            "headline": "India, US May Execute Interim Trade Pact by July, Minister Says",
            "summary": "Commerce Minister Piyush Goyal said India and the US could finalize an interim trade agreement by July, addressing tariffs and market access.",
        },
        "expected": {
            "entities": ["India", "US", "Piyush Goyal"],
            "keywords": ["trade deal", "tariffs", "market access"],
            "topic": "macro",
        },
    },
    {
        "name": "jobs_report_fed_bets",
        "cluster": {
            "headline": "Investors boost bets for Fed rate rise after bumper US jobs report",
            "summary": "Strong US payrolls data led traders to increase wagers on Federal Reserve interest rate hikes.",
        },
        "expected": {
            "entities": ["US", "Federal Reserve"],
            "keywords": ["jobs report", "rate hike", "rate rise"],
            "topic": "macro",
        },
    },
    # === CRYPTO (6) ===
    {
        "name": "bitcoin_etf_flows",
        "cluster": {
            "headline": "Bitcoin ETFs see record inflows as BTC tops $70k",
            "summary": "US spot Bitcoin ETFs attracted $2.3 billion in net inflows this week as Bitcoin surged past $70,000, driven by institutional demand.",
        },
        "expected": {
            "entities": ["Bitcoin", "BTC"],
            "keywords": ["ETF inflows", "institutional demand", "price surge"],
            "topic": "crypto",
        },
    },
    {
        "name": "memecoins_dive",
        "cluster": {
            "headline": "Memecoins dogecoin, shiba inu dive 9% as bitcoin nears $60,000",
            "summary": "Dogecoin and Shiba Inu led memecoin losses as Bitcoin approached the $60,000 level.",
        },
        "expected": {
            "entities": ["dogecoin", "shiba inu", "Bitcoin", "memecoins"],
            "keywords": ["crypto crash", "memecoins dive", "price drop"],
            "topic": "crypto",
        },
    },
    {
        "name": "bitcoin_seller_exhaustion",
        "cluster": {
            "headline": "Bitcoin teases 'seller exhaustion' as BTC price downside reaches $60.3K",
            "summary": "Technical analysts note signs of seller exhaustion in Bitcoin as the cryptocurrency tests support near $60,300.",
        },
        "expected": {
            "entities": ["Bitcoin", "BTC"],
            "keywords": ["seller exhaustion", "price support", "technical analysis"],
            "topic": "crypto",
        },
    },
    {
        "name": "xrp_liquidation_selloff",
        "cluster": {
            "headline": "XRP falls toward $1.10 as liquidation-driven selloff pushes token to multi-month lows",
            "summary": "XRP dropped sharply as leveraged positions were liquidated, pushing the token to its lowest level in months.",
        },
        "expected": {
            "entities": ["XRP", "Bitcoin", "Ethereum"],
            "keywords": ["liquidation", "selloff", "price crash"],
            "topic": "crypto",
        },
    },
    {
        "name": "visa_stablecoin_test",
        "cluster": {
            "headline": "Visa tests private stablecoin settlement with Brale, Canton",
            "summary": "Visa is piloting private stablecoin settlement using Brale and Canton networks for institutional payments.",
        },
        "expected": {
            "entities": ["Visa", "Brale", "Canton"],
            "keywords": ["stablecoin settlement", "private network", "institutional payments"],
            "topic": "crypto",
        },
    },
    {
        "name": "prediction_markets_kalshi",
        "cluster": {
            "headline": "Prediction Markets Hit $29.4 Billion in May as Kalshi Leads and Brokers Pile In",
            "summary": "Prediction market volume surged to record levels with Kalshi leading the growth as traditional brokers enter the space.",
        },
        "expected": {
            "entities": ["Kalshi", "Prediction Markets", "Bitcoin", "Ethereum", "SEC"],
            "keywords": ["prediction markets", "trading volume", "broker adoption"],
            "topic": "crypto",
        },
    },
    # === AI (5) ===
    {
        "name": "nvidia_earnings_ai",
        "cluster": {
            "headline": "Nvidia beats earnings on AI chip demand",
            "summary": "Nvidia reported quarterly revenue of $26 billion, up 262% year-over-year, driven by insatiable demand for its H100 and Blackwell AI chips.",
        },
        "expected": {
            "entities": ["Nvidia", "H100", "Blackwell"],
            "keywords": ["AI chips", "earnings beat", "revenue growth", "chip demand"],
            "topic": "ai",
        },
    },
    {
        "name": "anthropic_ai_pause",
        "cluster": {
            "headline": "Anthropic calls for pause of global AI development",
            "summary": "AI safety company Anthropic urged a coordinated pause on advanced AI development to establish safety standards.",
        },
        "expected": {
            "entities": ["Anthropic", "Claude"],
            "keywords": ["AI development", "global pause", "AI safety"],
            "topic": "ai",
        },
    },
    {
        "name": "microsoft_ai_products",
        "cluster": {
            "headline": "Has Microsoft Lost Its Mojo (Again)?",
            "summary": "Analysts question whether Microsoft's AI product strategy is falling behind competitors despite massive investment.",
        },
        "expected": {
            "entities": ["Microsoft", "Scott Hanselman", "Github"],
            "keywords": ["AI products", "catch-up mode", "competition"],
            "topic": "ai",
        },
    },
    {
        "name": "ai_bubble_debate",
        "cluster": {
            "headline": "`There Is No AI Bubble,' Says BI's Rob Schiffman",
            "summary": "Business Insider's Rob Schiffman argues current AI investment levels are justified by real revenue growth, not speculation.",
        },
        "expected": {
            "entities": ["Robert Schiffman", "New York", "Business Insider"],
            "keywords": ["AI bubble", "investment thesis", "revenue growth"],
            "topic": "ai",
        },
    },
    {
        "name": "morgan_stanley_ai_funding",
        "cluster": {
            "headline": "Morgan Stanley Sees AI-Related Funding Expanding to 15% of All Credit Deals",
            "summary": "Morgan Stanley reports AI-related financing now represents 15% of credit deals, up from near zero two years ago.",
        },
        "expected": {
            "entities": ["Morgan Stanley", "Diameter Capital Partners", "Scott Goodwin"],
            "keywords": ["AI funding", "credit deals", "financing growth"],
            "topic": "ai",
        },
    },
    # === OTHER (6) ===
    {
        "name": "israel_iran_strikes",
        "cluster": {
            "headline": "Israel strikes Iranian missile sites in Syria",
            "summary": "Israeli warplanes targeted Iranian missile depots near Damascus overnight, escalating regional tensions.",
        },
        "expected": {
            "entities": ["Israel", "Iran", "Syria", "Damascus"],
            "keywords": ["airstrikes", "missile sites", "regional escalation"],
            "topic": "other",
        },
    },
    {
        "name": "trump_intel_firings",
        "cluster": {
            "headline": "Trump orders Pulte to start mass firings at intel agencies",
            "summary": "President Trump directed Bill Pulte to begin mass firings across US intelligence agencies.",
        },
        "expected": {
            "entities": ["Donald Trump", "Bill Pulte", "United States"],
            "keywords": ["mass firings", "intelligence agencies", "government restructuring"],
            "topic": "other",
        },
    },
    {
        "name": "boeing_737_max",
        "cluster": {
            "headline": "Boeing to launch 737 Max production in July",
            "summary": "Boeing plans to restart 737 Max production in July under new CEO Kelly Ortberg.",
        },
        "expected": {
            "entities": ["Boeing", "Kelly Ortberg", "Seattle", "Everett", "737 Max"],
            "keywords": ["aircraft manufacturing", "production restart", "737 Max"],
            "topic": "other",
        },
    },
    {
        "name": "putin_trump_peer",
        "cluster": {
            "headline": "Putin says he treats Trump as 'peer, with respect'",
            "summary": "Vladimir Putin described his relationship with Donald Trump as one of mutual respect between peers.",
        },
        "expected": {
            "entities": ["Vladimir Putin", "Donald Trump", "Ukraine", "St. Petersburg"],
            "keywords": ["Ukraine war", "diplomatic relations", "peer respect"],
            "topic": "other",
        },
    },
    {
        "name": "paris_bridge_history",
        "cluster": {
            "headline": "Why is Paris's oldest bridge called the 'New Bridge'?",
            "summary": "The history of Paris's Pont Neuf, which despite its name is the city's oldest standing bridge.",
        },
        "expected": {
            "entities": ["Paris", "Pont Neuf", "Louis Vuitton", "Tanishk Saha"],
            "keywords": ["Paris bridge", "bridge history", "artistic installations"],
            "topic": "other",
        },
    },
    {
        "name": "ukraine_drone_attack",
        "cluster": {
            "headline": "Ukraine under heavy drone attack as Zelensky seeks direct meeting with Putin",
            "summary": "Russia launched a massive drone barrage on Ukraine as President Zelensky pushes for direct talks with Putin.",
        },
        "expected": {
            "entities": ["Ukraine", "Russia", "Volodymyr Zelensky", "Vladimir Putin", "Moscow", "Kyiv"],
            "keywords": ["drone strikes", "conflict escalation", "peace talks"],
            "topic": "other",
        },
    },
]

# Default destination: the ``data`` directory that sits beside the directory
# containing this script.
OUTPUT_PATH = Path(__file__).parent.parent / "data" / "annotated_samples.json"
output_path = OUTPUT_PATH  # original module-level name, kept for importers


def main(destination: Path = OUTPUT_PATH) -> None:
    """Write ``ANNOTATED_SAMPLES`` to *destination* and print a summary.

    Args:
        destination: File to (over)write with the JSON dump.  Defaults to
            ``OUTPUT_PATH``.

    Raises:
        OSError: If the parent directory cannot be created or the file
            cannot be written.
    """
    # The target directory may not exist yet (e.g. on a fresh checkout).
    destination.parent.mkdir(parents=True, exist_ok=True)

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of the platform's locale default.
    destination.write_text(
        json.dumps(ANNOTATED_SAMPLES, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"Wrote {len(ANNOTATED_SAMPLES)} annotated samples to {destination}")

    # Print distribution
    topics = Counter(sample["expected"]["topic"] for sample in ANNOTATED_SAMPLES)
    print("\nTopic distribution:")
    for topic, count in sorted(topics.items()):
        print(f" {topic}: {count}")


if __name__ == "__main__":
    main()