docs: add API references, mapping corrections, and verification script
- Add yfinance.org and defeatbeta-api.org reference docs
- Fix defeatbeta_mapping.org: deprecated yfinance property names (quarterly_financials→quarterly_income_stmt, financials→income_stmt), longName vs longBusinessSummary conceptual mismatch, cashflow note typo
- Add Mapping Limitations section with live verification results (AAPL): DuckDB 1.4.3 incompatibility, format differences, coverage gaps
- Add docs/test_mapping.py as runnable mapping verification script
- Add offline.py, persistent_cache.py, download_data.py, warmup_cache.py for offline/cached defeatbeta usage
- Add aapl_yfinance.py exploration script and quant.py scaffold
- Add .envrc (uv layout) and update pyproject.toml + uv.lock

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,75 @@
|
||||
"""
|
||||
One-time download of all defeatbeta parquet files + company_tickers.json.
|
||||
Run this once; after that use offline.py for zero-network Ticker() calls.
|
||||
|
||||
uv run python download_data.py
|
||||
uv run python download_data.py --out data/parquet # custom directory
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
# Root URL under which every remote file path below is resolved.
BASE = "https://huggingface.co/datasets/defeatbeta/yahoo-finance-data/resolve/main"

# Table names; main() fetches each one from f"{BASE}/data/<name>.parquet".
PARQUET_TABLES = [
    # company-level tables
    "stock_profile",
    "stock_officers",
    "stock_tailing_eps",
    "stock_earning_calendar",
    "stock_statement",
    # prices and corporate actions
    "stock_prices",
    "stock_dividend_events",
    "stock_split_events",
    # market-wide reference series
    "exchange_rate",
    "daily_treasury_yield",
    # text-heavy and supplementary tables
    "stock_earning_call_transcripts",
    "stock_news",
    "stock_revenue_breakdown",
    "stock_shares_outstanding",
    "stock_sec_filing",
]

# Non-parquet extras as (path relative to BASE, local file name) pairs.
EXTRA_FILES = [("data/company_tickers.json", "company_tickers.json")]
|
||||
|
||||
|
||||
def download(url: str, dest: Path, label: str) -> None:
    """Stream *url* into *dest*, skipping the transfer if *dest* already exists.

    The body is written to a sibling ``<dest name>.tmp`` file and moved onto
    *dest* only after it has been received completely. Because an existing
    *dest* is never re-fetched, a truncated file must never reach that path;
    any failure therefore removes the temporary file and re-raises.

    Args:
        url: Remote location to fetch.
        dest: Final path of the downloaded file.
        label: Short name shown in the progress output.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status (via ``raise_for_status``).
        OSError: If fewer or more bytes arrive than the server announced in
            ``Content-Length``, or if writing to disk fails.
    """
    if dest.exists():
        print(f" skip {label} ({dest.stat().st_size / 1e6:.1f} MB on disk)")
        return

    print(f" fetch {label} ...", end="", flush=True)
    # Append ".tmp" rather than swapping the suffix so the temp name stays
    # unique per destination (x.parquet and x.json must not share x.tmp).
    tmp = dest.with_name(dest.name + ".tmp")
    try:
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            written = 0
            with open(tmp, "wb") as f:
                for chunk in r.iter_content(chunk_size=8 * 1024 * 1024):
                    written += f.write(chunk)
            # Content-Length counts bytes on the wire, so it is only
            # comparable when the body was not transparently decompressed.
            expected = r.headers.get("Content-Length", "")
            if (
                "Content-Encoding" not in r.headers
                and expected.isdigit()
                and written != int(expected)
            ):
                raise OSError(
                    f"incomplete download of {label}: "
                    f"got {written} of {expected} bytes"
                )
        # replace() is atomic on the same filesystem and, unlike rename(),
        # also succeeds on Windows when the target already exists.
        tmp.replace(dest)
    except BaseException:
        # Includes KeyboardInterrupt: Ctrl-C during a large transfer must not
        # leave a partial file lying around in the output directory.
        tmp.unlink(missing_ok=True)
        raise
    print(f" {dest.stat().st_size / 1e6:.1f} MB")
|
||||
|
||||
|
||||
def main() -> None:
    """Download every parquet table and extra file into the ``--out`` directory.

    The output directory (default ``data/parquet``) is created if missing.
    Tables listed in ``PARQUET_TABLES`` are fetched first, then the entries of
    ``EXTRA_FILES``; finally the number of entries in the directory is printed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--out", default="data/parquet", help="local output directory")
    out_dir = Path(cli.parse_args().out)

    out_dir.mkdir(parents=True, exist_ok=True)
    print(f"Saving to: {out_dir.resolve()}\n")

    # One ordered work list of (url, local file name, progress label).
    jobs = [
        (f"{BASE}/data/{name}.parquet", f"{name}.parquet", name)
        for name in PARQUET_TABLES
    ]
    jobs += [(f"{BASE}/{remote}", local, local) for remote, local in EXTRA_FILES]

    for url, filename, label in jobs:
        download(url, out_dir / filename, label)

    entry_count = len(list(out_dir.iterdir()))
    print(f"\nDone. {entry_count} files in {out_dir.resolve()}")
|
||||
|
||||
|
||||
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user