| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387 |
- import json
- from pathlib import Path
# 30 diverse clusters selected from live data, manually annotated.
#
# Every entry has the same nested layout:
#   {"name": ..., "cluster": {"headline", "summary"},
#    "expected": {"entities", "keywords", "topic"}}
# The small constructor below builds that layout so each sample only has to
# spell out its own values.


def _sample(*, name: str, headline: str, summary: str,
            entities: list, keywords: list, topic: str) -> dict:
    """Return one annotated sample as a nested dict.

    Keys are inserted in the order name -> cluster -> expected (and
    headline -> summary, entities -> keywords -> topic inside those), which is
    the order they appear in when the list is serialised.
    """
    cluster = {"headline": headline, "summary": summary}
    expected = {"entities": entities, "keywords": keywords, "topic": topic}
    return {"name": name, "cluster": cluster, "expected": expected}


ANNOTATED_SAMPLES = [
    # === REGULATION (6) ===
    _sample(
        name="sec_binance_lawsuit",
        headline="SEC sues Binance over unregistered securities",
        summary="The Securities and Exchange Commission filed a lawsuit against Binance, the world's largest crypto exchange, alleging it operated as an unregistered securities exchange and commingled customer funds.",
        entities=["SEC", "Binance"],
        keywords=["securities law", "crypto exchange", "enforcement action"],
        topic="regulation",
    ),
    _sample(
        name="iran_frozen_funds_talks",
        headline="Khamenei aide says $24B in frozen funds blocking talks",
        summary="A senior aide to Iran's Supreme Leader said $24 billion in frozen Iranian funds are blocking progress in indirect talks with the United States.",
        entities=["Iran", "United States", "Khamenei", "Mohsen Rezaei"],
        keywords=["frozen funds", "peace talks", "Iran sanctions"],
        topic="regulation",
    ),
    _sample(
        name="us_iran_sanctions",
        headline="US issues new Iran-linked sanctions",
        summary="The US Treasury Department imposed new sanctions targeting Iranian oil and petrochemical networks.",
        entities=["United States", "Iran", "Department of the Treasury", "OFAC"],
        keywords=["sanctions", "oil network", "petrochemical"],
        topic="regulation",
    ),
    _sample(
        name="house_crypto_bills",
        headline="U.S. House tax committee weighs crypto bills, including relief for small transactions",
        summary="The House Ways and Means Committee discussed legislation providing tax relief for small crypto transactions and clarifying digital asset rules.",
        entities=["U.S. House tax committee", "Bitcoin", "Ethereum", "SEC"],
        keywords=["crypto bills", "tax relief", "small transactions"],
        topic="regulation",
    ),
    _sample(
        name="wamco_sec_settlement",
        headline="Wamco to Pay $100 Million in SEC Settlement Over Leech Trades",
        summary="Western Asset Management agreed to pay $100 million to settle SEC charges over improper trading by former portfolio manager Ken Leech.",
        entities=["Western Asset Management Co.", "Securities and Exchange Commission", "Ken Leech"],
        keywords=["SEC settlement", "trading practices", "portfolio manager"],
        topic="regulation",
    ),
    _sample(
        name="us_cuba_sanctions",
        headline="US imposes sanctions on Cuban president, Castro family members",
        summary="The United States sanctioned Cuban President Miguel Diaz-Canel and members of the Castro family over human rights abuses.",
        entities=["Cuba", "Cuban president", "Castro family", "Raul Castro", "United States"],
        keywords=["US sanctions", "Cuba tensions", "human rights"],
        topic="regulation",
    ),
    # === MACRO (7) ===
    _sample(
        name="fed_rates_inflation",
        headline="Fed holds rates steady as inflation cools",
        summary="The Federal Reserve kept interest rates unchanged at 5.25-5.50%, citing progress on inflation but signaling caution on future cuts.",
        entities=["Federal Reserve"],
        keywords=["interest rates", "inflation", "monetary policy"],
        topic="macro",
    ),
    _sample(
        name="ecb_rate_cut",
        headline="ECB cuts rates as eurozone inflation falls to 2.4%",
        summary="The European Central Bank lowered its deposit rate by 25 basis points to 3.75%, marking its first cut since 2019 as inflation approaches target.",
        entities=["European Central Bank", "ECB"],
        keywords=["rate cut", "eurozone inflation", "deposit rate", "monetary easing"],
        topic="macro",
    ),
    _sample(
        name="china_stimulus",
        headline="China unveils stimulus package to boost slowing economy",
        summary="Beijing announced a comprehensive stimulus package including infrastructure spending, tax cuts, and monetary easing to counter slowing growth and property sector weakness.",
        entities=["China", "Beijing"],
        keywords=["stimulus package", "infrastructure spending", "monetary easing", "property sector"],
        topic="macro",
    ),
    _sample(
        name="oil_opep_cuts",
        headline="Oil jumps after OPEC+ extends production cuts",
        summary="Crude oil prices rose 3% after OPEC+ agreed to extend production cuts through year-end, tightening global supply amid demand concerns.",
        entities=["OPEC+"],
        keywords=["production cuts", "oil prices", "global supply", "demand concerns"],
        topic="macro",
    ),
    _sample(
        name="india_forex_reserves",
        headline="India's Forex Reserves Hit $682.32 Billion as RBI Tightens Its Economic Grip",
        summary="India's foreign exchange reserves reached a record high as the Reserve Bank of India maintains tight monetary policy.",
        entities=["India", "RBI"],
        keywords=["forex reserves", "monetary policy", "economic grip"],
        topic="macro",
    ),
    _sample(
        name="india_us_trade_pact",
        headline="India, US May Execute Interim Trade Pact by July, Minister Says",
        summary="Commerce Minister Piyush Goyal said India and the US could finalize an interim trade agreement by July, addressing tariffs and market access.",
        entities=["India", "US", "Piyush Goyal"],
        keywords=["trade deal", "tariffs", "market access"],
        topic="macro",
    ),
    _sample(
        name="jobs_report_fed_bets",
        headline="Investors boost bets for Fed rate rise after bumper US jobs report",
        summary="Strong US payrolls data led traders to increase wagers on Federal Reserve interest rate hikes.",
        entities=["US", "Federal Reserve"],
        keywords=["jobs report", "rate hike", "rate rise"],
        topic="macro",
    ),
    # === CRYPTO (6) ===
    _sample(
        name="bitcoin_etf_flows",
        headline="Bitcoin ETFs see record inflows as BTC tops $70k",
        summary="US spot Bitcoin ETFs attracted $2.3 billion in net inflows this week as Bitcoin surged past $70,000, driven by institutional demand.",
        entities=["Bitcoin", "BTC"],
        keywords=["ETF inflows", "institutional demand", "price surge"],
        topic="crypto",
    ),
    _sample(
        name="memecoins_dive",
        headline="Memecoins dogecoin, shiba inu dive 9% as bitcoin nears $60,000",
        summary="Dogecoin and Shiba Inu led memecoin losses as Bitcoin approached the $60,000 level.",
        entities=["dogecoin", "shiba inu", "Bitcoin", "memecoins"],
        keywords=["crypto crash", "memecoins dive", "price drop"],
        topic="crypto",
    ),
    _sample(
        name="bitcoin_seller_exhaustion",
        headline="Bitcoin teases 'seller exhaustion' as BTC price downside reaches $60.3K",
        summary="Technical analysts note signs of seller exhaustion in Bitcoin as the cryptocurrency tests support near $60,300.",
        entities=["Bitcoin", "BTC"],
        keywords=["seller exhaustion", "price support", "technical analysis"],
        topic="crypto",
    ),
    _sample(
        name="xrp_liquidation_selloff",
        headline="XRP falls toward $1.10 as liquidation-driven selloff pushes token to multi-month lows",
        summary="XRP dropped sharply as leveraged positions were liquidated, pushing the token to its lowest level in months.",
        entities=["XRP", "Bitcoin", "Ethereum"],
        keywords=["liquidation", "selloff", "price crash"],
        topic="crypto",
    ),
    _sample(
        name="visa_stablecoin_test",
        headline="Visa tests private stablecoin settlement with Brale, Canton",
        summary="Visa is piloting private stablecoin settlement using Brale and Canton networks for institutional payments.",
        entities=["Visa", "Brale", "Canton"],
        keywords=["stablecoin settlement", "private network", "institutional payments"],
        topic="crypto",
    ),
    _sample(
        name="prediction_markets_kalshi",
        headline="Prediction Markets Hit $29.4 Billion in May as Kalshi Leads and Brokers Pile In",
        summary="Prediction market volume surged to record levels with Kalshi leading the growth as traditional brokers enter the space.",
        entities=["Kalshi", "Prediction Markets", "Bitcoin", "Ethereum", "SEC"],
        keywords=["prediction markets", "trading volume", "broker adoption"],
        topic="crypto",
    ),
    # === AI (5) ===
    _sample(
        name="nvidia_earnings_ai",
        headline="Nvidia beats earnings on AI chip demand",
        summary="Nvidia reported quarterly revenue of $26 billion, up 262% year-over-year, driven by insatiable demand for its H100 and Blackwell AI chips.",
        entities=["Nvidia", "H100", "Blackwell"],
        keywords=["AI chips", "earnings beat", "revenue growth", "chip demand"],
        topic="ai",
    ),
    _sample(
        name="anthropic_ai_pause",
        headline="Anthropic calls for pause of global AI development",
        summary="AI safety company Anthropic urged a coordinated pause on advanced AI development to establish safety standards.",
        entities=["Anthropic", "Claude"],
        keywords=["AI development", "global pause", "AI safety"],
        topic="ai",
    ),
    _sample(
        name="microsoft_ai_products",
        headline="Has Microsoft Lost Its Mojo (Again)?",
        summary="Analysts question whether Microsoft's AI product strategy is falling behind competitors despite massive investment.",
        entities=["Microsoft", "Scott Hanselman", "Github"],
        keywords=["AI products", "catch-up mode", "competition"],
        topic="ai",
    ),
    _sample(
        name="ai_bubble_debate",
        headline="`There Is No AI Bubble,' Says BI's Rob Schiffman",
        summary="Business Insider's Rob Schiffman argues current AI investment levels are justified by real revenue growth, not speculation.",
        entities=["Robert Schiffman", "New York", "Business Insider"],
        keywords=["AI bubble", "investment thesis", "revenue growth"],
        topic="ai",
    ),
    _sample(
        name="morgan_stanley_ai_funding",
        headline="Morgan Stanley Sees AI-Related Funding Expanding to 15% of All Credit Deals",
        summary="Morgan Stanley reports AI-related financing now represents 15% of credit deals, up from near zero two years ago.",
        entities=["Morgan Stanley", "Diameter Capital Partners", "Scott Goodwin"],
        keywords=["AI funding", "credit deals", "financing growth"],
        topic="ai",
    ),
    # === OTHER (6) ===
    _sample(
        name="israel_iran_strikes",
        headline="Israel strikes Iranian missile sites in Syria",
        summary="Israeli warplanes targeted Iranian missile depots near Damascus overnight, escalating regional tensions.",
        entities=["Israel", "Iran", "Syria", "Damascus"],
        keywords=["airstrikes", "missile sites", "regional escalation"],
        topic="other",
    ),
    _sample(
        name="trump_intel_firings",
        headline="Trump orders Pulte to start mass firings at intel agencies",
        summary="President Trump directed Bill Pulte to begin mass firings across US intelligence agencies.",
        entities=["Donald Trump", "Bill Pulte", "United States"],
        keywords=["mass firings", "intelligence agencies", "government restructuring"],
        topic="other",
    ),
    _sample(
        name="boeing_737_max",
        headline="Boeing to launch 737 Max production in July",
        summary="Boeing plans to restart 737 Max production in July under new CEO Kelly Ortberg.",
        entities=["Boeing", "Kelly Ortberg", "Seattle", "Everett", "737 Max"],
        keywords=["aircraft manufacturing", "production restart", "737 Max"],
        topic="other",
    ),
    _sample(
        name="putin_trump_peer",
        headline="Putin says he treats Trump as 'peer, with respect'",
        summary="Vladimir Putin described his relationship with Donald Trump as one of mutual respect between peers.",
        entities=["Vladimir Putin", "Donald Trump", "Ukraine", "St. Petersburg"],
        keywords=["Ukraine war", "diplomatic relations", "peer respect"],
        topic="other",
    ),
    _sample(
        name="paris_bridge_history",
        headline="Why is Paris's oldest bridge called the 'New Bridge'?",
        summary="The history of Paris's Pont Neuf, which despite its name is the city's oldest standing bridge.",
        entities=["Paris", "Pont Neuf", "Louis Vuitton", "Tanishk Saha"],
        keywords=["Paris bridge", "bridge history", "artistic installations"],
        topic="other",
    ),
    _sample(
        name="ukraine_drone_attack",
        headline="Ukraine under heavy drone attack as Zelensky seeks direct meeting with Putin",
        summary="Russia launched a massive drone barrage on Ukraine as President Zelensky pushes for direct talks with Putin.",
        entities=["Ukraine", "Russia", "Volodymyr Zelensky", "Vladimir Putin", "Moscow", "Kyiv"],
        keywords=["drone strikes", "conflict escalation", "peace talks"],
        topic="other",
    ),
]
# Write the samples to data/annotated_samples.json, resolved two directory
# levels above this file, then report how many samples fall under each topic.
from collections import Counter

output_path = Path(__file__).parent.parent / "data" / "annotated_samples.json"
# The target directory may not exist yet; create it instead of failing with
# FileNotFoundError on the write below.
output_path.parent.mkdir(parents=True, exist_ok=True)
# ensure_ascii=False lets json.dumps emit non-ASCII characters verbatim, so the
# file encoding is pinned to UTF-8 rather than left to the platform default.
output_path.write_text(
    json.dumps(ANNOTATED_SAMPLES, indent=2, ensure_ascii=False),
    encoding="utf-8",
)
print(f"Wrote {len(ANNOTATED_SAMPLES)} annotated samples to {output_path}")

# Print distribution: one line per topic, in alphabetical order.
topics = Counter(s["expected"]["topic"] for s in ANNOTATED_SAMPLES)
print("\nTopic distribution:")
for t, c in sorted(topics.items()):
    print(f" {t}: {c}")
|