Files
learn-trading/download_data.py
tomatocream b5bf689e72 docs: add API references, mapping corrections, and verification script
- Add yfinance.org and defeatbeta-api.org reference docs
- Fix defeatbeta_mapping.org: deprecated yfinance property names
  (quarterly_financials→quarterly_income_stmt, financials→income_stmt),
  longName vs longBusinessSummary conceptual mismatch, cashflow note typo
- Add Mapping Limitations section with live verification results (AAPL):
  DuckDB 1.4.3 incompatibility, format differences, coverage gaps
- Add docs/test_mapping.py as runnable mapping verification script
- Add offline.py, persistent_cache.py, download_data.py, warmup_cache.py
  for offline/cached defeatbeta usage
- Add aapl_yfinance.py exploration script and quant.py scaffold
- Add .envrc (uv layout) and update pyproject.toml + uv.lock

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-26 15:33:21 +08:00

76 lines
2.1 KiB
Python

"""
One-time download of all defeatbeta parquet files + company_tickers.json.
Run this once; after that use offline.py for zero-network Ticker() calls.
Usage:
    uv run python download_data.py
    uv run python download_data.py --out data/parquet  # custom directory
"""
import argparse
import sys
from pathlib import Path
import requests
# Root URL that every remote path below is appended to.
BASE = (
    "https://huggingface.co/datasets/defeatbeta/yahoo-finance-data"
    "/resolve/main"
)

# Table names, fetched in this order from f"{BASE}/data/<name>.parquet".
# NOTE(review): "stock_tailing_eps" is kept exactly as written; it presumably
# mirrors the upstream file name -- confirm before "correcting" the spelling.
PARQUET_TABLES = [
    "stock_profile",
    "stock_officers",
    "stock_tailing_eps",
    "stock_earning_calendar",
    "stock_statement",
    "stock_prices",
    "stock_dividend_events",
    "stock_split_events",
    "exchange_rate",
    "daily_treasury_yield",
    "stock_earning_call_transcripts",
    "stock_news",
    "stock_revenue_breakdown",
    "stock_shares_outstanding",
    "stock_sec_filing",
]

# Non-parquet downloads as (path relative to BASE, local file name) pairs.
EXTRA_FILES = [
    ("data/company_tickers.json", "company_tickers.json"),
]
def download(url: str, dest: Path, label: str) -> None:
    """Fetch *url* into *dest*, skipping the request if *dest* already exists.

    The response body is streamed into a sibling ``<name>.tmp`` file and moved
    onto *dest* only after the whole body has been written, so an interrupted
    run never leaves a truncated file under the final name. If anything fails
    along the way the temporary file is deleted and the error is re-raised.

    Args:
        url: Remote file to download.
        dest: Final local path; its parent directory must already exist.
        label: Short name shown in the progress output.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On other request failures (e.g. timeout).
        OSError: If the local file cannot be written or moved into place.
    """
    if dest.exists():
        print(f" skip {label} ({dest.stat().st_size / 1e6:.1f} MB on disk)")
        return

    print(f" fetch {label} ...", end="", flush=True)
    # Append ".tmp" to the full name rather than swapping the extension, so
    # "x.parquet" and "x.json" can never end up sharing one temp file.
    tmp = dest.with_name(dest.name + ".tmp")
    try:
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in r.iter_content(chunk_size=8 * 1024 * 1024):
                    f.write(chunk)
        # replace() also succeeds on Windows when dest has appeared in the
        # meantime, where rename() would raise FileExistsError.
        tmp.replace(dest)
    except BaseException:
        # Covers Ctrl-C as well: never leave a partial download behind.
        tmp.unlink(missing_ok=True)
        raise
    print(f" {dest.stat().st_size / 1e6:.1f} MB")
def main() -> None:
    """Parse ``--out`` and download every parquet table and extra file into it.

    The output directory is created if missing. Files are fetched one at a
    time, parquet tables first, then the entries of ``EXTRA_FILES``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--out", default="data/parquet", help="local output directory")
    target_dir = Path(cli.parse_args().out)
    target_dir.mkdir(parents=True, exist_ok=True)
    print(f"Saving to: {target_dir.resolve()}\n")

    # Collect (url, destination, label) triples first, then fetch in order.
    jobs = [
        (f"{BASE}/data/{name}.parquet", target_dir / f"{name}.parquet", name)
        for name in PARQUET_TABLES
    ]
    jobs += [
        (f"{BASE}/{remote_path}", target_dir / local_name, local_name)
        for remote_path, local_name in EXTRA_FILES
    ]
    for url, dest, label in jobs:
        download(url, dest, label)

    # Counts every entry in the directory, not only what this run fetched.
    entry_count = len(list(target_dir.iterdir()))
    print(f"\nDone. {entry_count} files in {target_dir.resolve()}")
# Run the downloader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()