""" One-time download of all defeatbeta parquet files + company_tickers.json. Run this once; after that use offline.py for zero-network Ticker() calls. uv run python download_data.py uv run python download_data.py --out data/parquet # custom directory """ import argparse import sys from pathlib import Path import requests BASE = "https://huggingface.co/datasets/defeatbeta/yahoo-finance-data/resolve/main" PARQUET_TABLES = [ "stock_profile", "stock_officers", "stock_tailing_eps", "stock_earning_calendar", "stock_statement", "stock_prices", "stock_dividend_events", "stock_split_events", "exchange_rate", "daily_treasury_yield", "stock_earning_call_transcripts", "stock_news", "stock_revenue_breakdown", "stock_shares_outstanding", "stock_sec_filing", ] EXTRA_FILES = [ ("data/company_tickers.json", "company_tickers.json"), ] def download(url: str, dest: Path, label: str) -> None: if dest.exists(): print(f" skip {label} ({dest.stat().st_size / 1e6:.1f} MB on disk)") return print(f" fetch {label} ...", end="", flush=True) with requests.get(url, stream=True, timeout=60) as r: r.raise_for_status() tmp = dest.with_suffix(".tmp") with open(tmp, "wb") as f: for chunk in r.iter_content(chunk_size=8 * 1024 * 1024): f.write(chunk) tmp.rename(dest) print(f" {dest.stat().st_size / 1e6:.1f} MB") def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--out", default="data/parquet", help="local output directory") args = parser.parse_args() out = Path(args.out) out.mkdir(parents=True, exist_ok=True) print(f"Saving to: {out.resolve()}\n") for table in PARQUET_TABLES: url = f"{BASE}/data/{table}.parquet" download(url, out / f"{table}.parquet", table) for remote_path, local_name in EXTRA_FILES: url = f"{BASE}/{remote_path}" download(url, out / local_name, local_name) print(f"\nDone. {sum(1 for _ in out.iterdir())} files in {out.resolve()}") if __name__ == "__main__": main()