"""Preprocess raw Binance archives into a canonical Hermes input CSV.

Command-line entry point: discovers raw archives for a symbol/interval,
loads them into candles, writes the prepared CSV plus a JSON manifest, and
prints a small JSON summary of what was written.
"""
from __future__ import annotations

from pathlib import Path
import argparse
import json
import sys

# Repository root: this script lives one directory below it.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
SIM_SRC = ROOT / "simulation" / "src"

# Make the in-repo packages importable when the script is run directly.
# Must happen before the ``hermes_sim`` imports below (hence the E402 waivers).
for path in (str(SRC), str(SIM_SRC)):
    if path not in sys.path:
        sys.path.insert(0, path)

from hermes_sim.candles import Candle  # noqa: E402
from hermes_sim.preprocess import (  # noqa: E402
    build_manifest,
    discover_raw_archives,
    load_raw_archives,
    write_manifest,
    write_prepared_csv,
)


def main(argv: list[str] | None = None) -> int:
    """Run the preprocessing pipeline and return a process exit code.

    Args:
        argv: Command-line arguments, excluding the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` once the CSV and manifest have been written.

    Raises:
        SystemExit: With an explanatory message when no raw archives are
            found for the requested symbol/interval under ``--raw-dir``
            (argparse also raises it on invalid arguments).
    """
    parser = argparse.ArgumentParser(
        description="Preprocess raw Binance archives into a canonical Hermes input CSV."
    )
    parser.add_argument("--symbol", default="XRPUSDT")
    parser.add_argument("--interval", default="1m")
    parser.add_argument("--raw-dir", default=str(ROOT / "simulation" / "data" / "raw"))
    parser.add_argument(
        "--out",
        default=None,
        help=(
            "Output CSV path. Defaults to "
            "simulation/data/prepared/<SYMBOL>/<interval>/<SYMBOL>-<interval>.csv"
        ),
    )
    parser.add_argument(
        "--manifest-out",
        default=None,
        help="Output manifest JSON path. Defaults beside the CSV.",
    )
    args = parser.parse_args(argv)

    raw_dir = Path(args.raw_dir)
    archives = discover_raw_archives(raw_dir, args.symbol, args.interval)
    if not archives:
        raise SystemExit(f"No raw archives found under {raw_dir}")

    candles = load_raw_archives(archives)

    if args.out:
        out_path = Path(args.out)
    else:
        # The symbol is upper-cased only for the default output location; it
        # is passed to the hermes_sim helpers exactly as given.
        symbol_dir = args.symbol.upper()
        out_path = (
            ROOT
            / "simulation"
            / "data"
            / "prepared"
            / symbol_dir
            / args.interval
            / f"{symbol_dir}-{args.interval}.csv"
        )
    write_prepared_csv(out_path, candles)

    dataset = build_manifest(
        symbol=args.symbol,
        interval=args.interval,
        source_kind="bulk",
        source_files=archives,
        candles=candles,
        output_csv=out_path,
    )

    if args.manifest_out:
        manifest_path = Path(args.manifest_out)
    else:
        # Swap the CSV's final suffix, e.g. ``X-1m.csv`` -> ``X-1m.manifest.json``.
        manifest_path = out_path.with_suffix(".manifest.json")
    write_manifest(manifest_path, dataset)

    # Machine-readable summary on stdout.
    summary = {
        "candles": len(candles),
        "csv": str(out_path),
        "manifest": str(manifest_path),
    }
    print(json.dumps(summary, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())