b5bf689e72
- Add yfinance.org and defeatbeta-api.org reference docs - Fix defeatbeta_mapping.org: deprecated yfinance property names (quarterly_financials→quarterly_income_stmt, financials→income_stmt), longName vs longBusinessSummary conceptual mismatch, cashflow note typo - Add Mapping Limitations section with live verification results (AAPL): DuckDB 1.4.3 incompatibility, format differences, coverage gaps - Add docs/test_mapping.py as runnable mapping verification script - Add offline.py, persistent_cache.py, download_data.py, warmup_cache.py for offline/cached defeatbeta usage - Add aapl_yfinance.py exploration script and quant.py scaffold - Add .envrc (uv layout) and update pyproject.toml + uv.lock Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
76 lines
2.1 KiB
Python
76 lines
2.1 KiB
Python
"""
|
|
One-time download of all defeatbeta parquet files + company_tickers.json.
|
|
Run this once; after that use offline.py for zero-network Ticker() calls.
|
|
|
|
uv run python download_data.py
|
|
uv run python download_data.py --out data/parquet # custom directory
|
|
"""
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
# Root URL that main() prefixes to every remote path it downloads.
BASE = "https://huggingface.co/datasets/defeatbeta/yahoo-finance-data/resolve/main"
|
|
|
|
# Table names to mirror; main() fetches each one from
# "{BASE}/data/<name>.parquet" and stores it as "<name>.parquet".
PARQUET_TABLES = [
    "stock_profile",
    "stock_officers",
    "stock_tailing_eps",
    "stock_earning_calendar",
    "stock_statement",
    "stock_prices",
    "stock_dividend_events",
    "stock_split_events",
    "exchange_rate",
    "daily_treasury_yield",
    "stock_earning_call_transcripts",
    "stock_news",
    "stock_revenue_breakdown",
    "stock_shares_outstanding",
    "stock_sec_filing",
]
|
|
|
|
# Non-parquet downloads as (remote_path, local_name) pairs: remote_path is
# appended to BASE, local_name is the file name used in the output directory.
EXTRA_FILES = [
    ("data/company_tickers.json", "company_tickers.json"),
]
|
|
|
|
|
|
def download(url: str, dest: Path, label: str) -> None:
    """Stream *url* into *dest*, doing nothing if *dest* already exists.

    The response body is written to a sibling ``<dest name>.tmp`` file and
    moved onto *dest* only after the whole body has been received, so the
    final name never refers to a truncated file (which the existence check
    would treat as complete on the next run). If anything goes wrong, the
    temporary file is removed before the exception propagates.

    Args:
        url: Remote location to fetch.
        dest: Final local path of the downloaded file; its parent directory
            must already exist.
        label: Short name shown in the progress output.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status (via ``raise_for_status``).
        OSError: If the temporary file cannot be written or moved.
    """
    if dest.exists():
        print(f" skip {label} ({dest.stat().st_size / 1e6:.1f} MB on disk)")
        return

    print(f" fetch {label} ...", end="", flush=True)
    # Append ".tmp" to the full name instead of swapping the suffix, so two
    # destinations sharing a stem (x.parquet / x.json) never share a temp file.
    tmp = dest.with_name(dest.name + ".tmp")
    try:
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in r.iter_content(chunk_size=8 * 1024 * 1024):
                    f.write(chunk)
        # replace() overwrites atomically where the platform allows it and,
        # unlike rename(), does not fail on Windows if dest appeared meanwhile.
        tmp.replace(dest)
    finally:
        # After a successful replace() the temp file is gone and this is a
        # no-op; after a failure it removes the partial download.
        tmp.unlink(missing_ok=True)
    print(f" {dest.stat().st_size / 1e6:.1f} MB")
|
|
|
|
|
|
def main() -> None:
    """Download every parquet table and extra file into the ``--out`` directory.

    The output directory (default ``data/parquet``) is created if needed.
    Parquet tables are fetched first, then the extra files, and a summary of
    the number of entries in the output directory is printed at the end.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--out", default="data/parquet", help="local output directory")
    target_dir = Path(cli.parse_args().out)

    target_dir.mkdir(parents=True, exist_ok=True)
    print(f"Saving to: {target_dir.resolve()}\n")

    # Collect (url, local file name, progress label) jobs up front so a
    # single loop performs all the downloads, tables before extras.
    jobs = [
        (f"{BASE}/data/{name}.parquet", f"{name}.parquet", name)
        for name in PARQUET_TABLES
    ]
    jobs += [
        (f"{BASE}/{remote}", local, local)
        for remote, local in EXTRA_FILES
    ]
    for url, filename, label in jobs:
        download(url, target_dir / filename, label)

    entry_count = len(list(target_dir.iterdir()))
    print(f"\nDone. {entry_count} files in {target_dir.resolve()}")
|
|
|
|
|
|
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|