chore: initial project scaffold for quant trading learning

Sets up project structure with yfinance-based OHLCV fetcher for top 100 S&P companies, Jupyter notebook scaffold, and uv-managed deps. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-25 00:58:02 +08:00
commit 60fa0c113d
7 changed files with 5406 additions and 0 deletions
@@ -0,0 +1 @@
+3.12
@@ -0,0 +1,67 @@
+import pandas as pd
+import yfinance as yf
+import pyarrow.parquet as pq
+from pathlib import Path
+import time
+
+
+# Fetch universe: 100 large-cap US tickers passed one by one to yfinance.
+# Entries must be unique — a repeated symbol is downloaded twice and its rows
+# end up duplicated in the combined Parquet file.
+# NOTE(review): the "WFC" slot previously held a second "ADI"; WFC is a
+# placeholder large-cap — confirm the intended constituent.
+TOP100_TICKERS = [
+    "AAPL", "MSFT", "NVDA", "AMZN", "META", "GOOGL", "GOOG", "TSLA", "BRK-B", "JPM",
+    "LLY", "V", "UNH", "XOM", "MA", "AVGO", "PG", "HD", "COST", "JNJ",
+    "ABBV", "MRK", "PEP", "KO", "CVX", "NFLX", "ADBE", "CRM", "AMD", "ACN",
+    "MCD", "WMT", "BAC", "CSCO", "TMO", "ABT", "DHR", "CMCSA", "UPS", "LIN",
+    "NEE", "PM", "TXN", "HON", "ORCL", "QCOM", "RTX", "INTU", "AMGN", "LOW",
+    "AMAT", "BMY", "UNP", "AXP", "CAT", "BKNG", "SPGI", "VRTX", "DE", "GILD",
+    "ELV", "ISRG", "ADP", "MDT", "LMT", "CVS", "CI", "ADI", "SYK", "SBUX",
+    "MMC", "C", "TJX", "GS", "BLK", "SCHW", "ZTS", "ETN", "PLD", "BDX",
+    "REGN", "WFC", "CL", "CME", "CB", "SO", "DUK", "NOC", "FIS", "ITW",
+    "APD", "NSC", "BSX", "GD", "FI", "MMM", "PGR", "WM", "AFL", "SLB",
+]
+
+
+def fetch_top100_ohlcv(output_path: str, period: str = "max") -> None:
+    """Download OHLCV history for every ticker in TOP100_TICKERS into one Parquet file.
+
+    Each ticker is fetched individually with ``yf.Ticker(...).history`` using
+    unadjusted prices (``auto_adjust=False``). The per-ticker frames are
+    concatenated in long format with a ``ticker`` column and written to
+    ``<output_path>/top100_companies.parquet``. Progress and a short summary
+    are printed to stdout.
+
+    The download is best-effort: a ticker that raises or returns no rows is
+    recorded in the error summary and skipped, so one bad symbol does not
+    abort the whole run.
+
+    Args:
+        output_path: Directory to write into; created (with parents) if missing.
+        period: History window forwarded to yfinance, e.g. ``"max"``.
+
+    Returns:
+        None. Nothing is written when no ticker returned data.
+    """
+    out_dir = Path(output_path)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    # De-duplicate while preserving order: a symbol listed twice would be
+    # downloaded twice and its rows duplicated in the output.
+    tickers = list(dict.fromkeys(TOP100_TICKERS))
+
+    print(f"Fetching OHLCV data for top 100 companies (period={period})...")
+    records = []
+    errors = []
+
+    for i, ticker in enumerate(tickers, 1):
+        try:
+            print(f"[{i}/{len(tickers)}] Downloading {ticker}...", end=" ")
+            hist = yf.Ticker(ticker).history(period=period, auto_adjust=False)
+            if hist.empty:
+                print("no data, skipping")
+                errors.append((ticker, "no data"))
+            else:
+                hist = hist.reset_index()
+                # Dates are stored as strings, so the Parquet column is text,
+                # not a timestamp type.
+                hist["Date"] = hist["Date"].astype(str)
+                hist["ticker"] = ticker
+                records.append(hist)
+                print(f"{len(hist)} rows")
+        except Exception as e:  # deliberate catch-all: keep going on a bad ticker
+            print(f"error: {e}")
+            errors.append((ticker, str(e)))
+        finally:
+            # Throttle after every request, including empty and failed ones,
+            # so a run of failures does not hammer the API back-to-back.
+            time.sleep(0.1)
+
+    if not records:
+        print("No data fetched.")
+        return
+
+    df = pd.concat(records, ignore_index=True)
+    # Keep a fixed column order, tolerating columns a response did not include.
+    wanted = ["ticker", "Date", "Open", "High", "Low", "Close", "Volume", "Dividends", "Stock Splits"]
+    df = df[[c for c in wanted if c in df.columns]]
+
+    parquet_path = out_dir / "top100_companies.parquet"
+    df.to_parquet(parquet_path, index=False)
+    print(f"\nSaved {len(df)} rows to {parquet_path}")
+    print(f"Date range: {df['Date'].min()} -> {df['Date'].max()}")
+    print(f"Tickers fetched: {df['ticker'].nunique()}")
+    if errors:
+        print(f"Errors ({len(errors)}): {errors[:10]}")
+
+
+# Script entry point: fetch the default ("max") history for the whole universe
+# into data/top100_ohlcv/, resolved relative to the current working directory.
+if __name__ == "__main__":
+    fetch_top100_ohlcv("data/top100_ohlcv")
@@ -0,0 +1,6 @@
+def main():
+    """Print the project's placeholder greeting to standard output."""
+    greeting = "Hello from learn-trading!"
+    print(greeting)
+
+
+# Run the greeting only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,51 @@
+[project]
+name = "learn-trading"
+version = "0.0.1"
+description = "Combined trading research environment"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "marimo>=0.23.2",
+    "arch>=8.0.0",
+    "backtesting>=0.6.5",
+    "backtrader>=1.9.78.123",
+    "ccxt>=4.5.50",
+    "empyrical-reloaded>=0.5.12",
+    "finta>=1.3",
+    "hmmlearn>=0.3.3",
+    "jupyterlab>=4.5.6",
+    "lightgbm>=4.6.0",
+    "lxml>=6.1.0",
+    "matplotlib>=3.10.8",
+    "mplfinance>=0.12.10b0",
+    "numpy>=2.2.6,<2.3",
+    "pandas>=2.3.3",
+    "pandas-ta>=0.4.71b0",
+    "plotly>=6.7.0",
+    "polars>=1.40.1",
+    "pyarrow>=24.0.0",
+    "pyportfolioopt>=1.6.0",
+    "qlib>=0.0.2.dev20",
+    "quantstats>=0.0.81",
+    "riskfolio-lib>=7.2.1",
+    "scikit-learn>=1.7.2",
+    "scipy>=1.17.1",
+    "sktime>=0.40.1",
+    "statsmodels>=0.14.6",
+    "ta>=0.11.0",
+    "ta-lib>=0.6.8",
+    "torch>=2.0.0",
+    "transformers>=4.0.0",
+    "tscv>=0.1.3",
+    "vectorbt>=1.0.0",
+    "xgboost>=3.2.0",
+    "yfinance>=1.3.0",
+]
+
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[tool.uv.sources]
+torch = { index = "pytorch-cpu" }
@@ -0,0 +1,636 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0229f5ae",
+   "metadata": {},
+   "source": [
+    "# Quant Trading Scaffold\n",
+    "## Data Ingestion → Indicators → Walk-Forward ML → Backtesting → Tearsheet\n",
+    "\n",
+    "Pipeline:\n",
+    "1. **Ingest** OHLCV data via yfinance\n",
+    "2. **Engineer features** — momentum, trend, volatility, volume indicators\n",
+    "3. **Label** — binary classification (next-N-day return > 0)\n",
+    "4. **Walk-forward split** with purging (no leakage)\n",
+    "5. **Train** XGBoost classifier per fold\n",
+    "6. **Evaluate** with quantstats tearsheet"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "43b4d162",
+   "metadata": {},
+   "source": [
+    "## 1. Config & Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "aab1cebb",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'numpy'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mModuleNotFoundError\u001b[39m                       Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m      3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mwarnings\u001b[39;00m\n\u001b[32m      4\u001b[39m warnings.filterwarnings(\u001b[33m\"\u001b[39m\u001b[33mignore\u001b[39m\u001b[33m\"\u001b[39m, category=\u001b[38;5;167;01mFutureWarning\u001b[39;00m)\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnp\u001b[39;00m\n\u001b[32m      7\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpd\u001b[39;00m\n\u001b[32m      8\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas_ta\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mta\u001b[39;00m\n",
+      "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'numpy'"
+     ]
+    }
+   ],
+   "source": [
+    "from __future__ import annotations\n",
+    "\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import pandas_ta as ta\n",
+    "import yfinance as yf\n",
+    "import plotly.graph_objects as go\n",
+    "from plotly.subplots import make_subplots\n",
+    "from sklearn.model_selection import TimeSeriesSplit\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "from xgboost import XGBClassifier\n",
+    "import quantstats as qs\n",
+    "\n",
+    "# ── Config ──────────────────────────────────────────────────────\n",
+    "TICKER = \"SPY\"\n",
+    "START = \"2015-01-01\"\n",
+    "END = \"2025-12-31\"\n",
+    "HORIZON = 5          # predict N-day forward return\n",
+    "PURGE_GAP = 5        # gap between train/test to prevent leakage\n",
+    "N_SPLITS = 5         # walk-forward folds\n",
+    "TRAIN_MIN = 504      # ~2 years minimum training window\n",
+    "\n",
+    "print(f\"Config: {TICKER} | {START}→{END} | horizon={HORIZON}d | {N_SPLITS} folds\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "28af2cae",
+   "metadata": {},
+   "source": [
+    "## 2. Data Ingestion"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b4d755da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw = yf.download(TICKER, start=START, end=END, auto_adjust=True)\n",
+    "# yfinance may return MultiIndex columns for single ticker — flatten\n",
+    "if isinstance(raw.columns, pd.MultiIndex):\n",
+    "    raw.columns = raw.columns.droplevel(\"Ticker\")\n",
+    "raw.index = pd.DatetimeIndex(raw.index)\n",
+    "df = raw.copy()\n",
+    "print(f\"Downloaded {len(df)} bars: {df.index[0].date()} → {df.index[-1].date()}\")\n",
+    "df.tail(3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e9b1bad5",
+   "metadata": {},
+   "source": [
+    "## 3. Feature Engineering — Technical Indicators\n",
+    "\n",
+    "We compute features across 4 categories:\n",
+    "- **Momentum**: RSI, MACD, Stochastic, Williams %R, ROC\n",
+    "- **Trend**: SMA/EMA crossovers, ADX, Ichimoku\n",
+    "- **Volatility**: Bollinger Bands, ATR, Keltner Channels\n",
+    "- **Volume**: OBV, MFI, Accumulation/Distribution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a83bf612",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── Momentum ────────────────────────────────────────────────────\n",
+    "df[\"rsi_14\"] = ta.rsi(df[\"Close\"], length=14)\n",
+    "df[\"rsi_7\"] = ta.rsi(df[\"Close\"], length=7)\n",
+    "\n",
+    "# pandas_ta orders the MACD frame as MACD, MACDh (histogram), MACDs (signal),\n",
+    "# so select columns by name prefix rather than by position.\n",
+    "macd = ta.macd(df[\"Close\"], fast=12, slow=26, signal=9)\n",
+    "macd_cols = {c.split(\"_\")[0]: c for c in macd.columns}\n",
+    "df[\"macd\"] = macd[macd_cols[\"MACD\"]]          # MACD line\n",
+    "df[\"macd_signal\"] = macd[macd_cols[\"MACDs\"]]  # signal line\n",
+    "df[\"macd_hist\"] = macd[macd_cols[\"MACDh\"]]    # histogram\n",
+    "\n",
+    "stoch = ta.stoch(df[\"High\"], df[\"Low\"], df[\"Close\"])\n",
+    "df[\"stoch_k\"] = stoch.iloc[:, 0]\n",
+    "df[\"stoch_d\"] = stoch.iloc[:, 1]\n",
+    "\n",
+    "df[\"willr_14\"] = ta.willr(df[\"High\"], df[\"Low\"], df[\"Close\"], length=14)\n",
+    "df[\"roc_10\"] = ta.roc(df[\"Close\"], length=10)\n",
+    "df[\"roc_21\"] = ta.roc(df[\"Close\"], length=21)\n",
+    "df[\"mom_10\"] = ta.mom(df[\"Close\"], length=10)\n",
+    "\n",
+    "# ── Trend ───────────────────────────────────────────────────────\n",
+    "df[\"sma_20\"] = ta.sma(df[\"Close\"], length=20)\n",
+    "df[\"sma_50\"] = ta.sma(df[\"Close\"], length=50)\n",
+    "df[\"sma_200\"] = ta.sma(df[\"Close\"], length=200)\n",
+    "df[\"ema_12\"] = ta.ema(df[\"Close\"], length=12)\n",
+    "df[\"ema_26\"] = ta.ema(df[\"Close\"], length=26)\n",
+    "\n",
+    "# crossover features (price relative to MAs)\n",
+    "df[\"close_over_sma20\"] = (df[\"Close\"] / df[\"sma_20\"]) - 1\n",
+    "df[\"close_over_sma50\"] = (df[\"Close\"] / df[\"sma_50\"]) - 1\n",
+    "df[\"close_over_sma200\"] = (df[\"Close\"] / df[\"sma_200\"]) - 1\n",
+    "df[\"sma20_over_sma50\"] = (df[\"sma_20\"] / df[\"sma_50\"]) - 1\n",
+    "df[\"sma50_over_sma200\"] = (df[\"sma_50\"] / df[\"sma_200\"]) - 1\n",
+    "\n",
+    "adx = ta.adx(df[\"High\"], df[\"Low\"], df[\"Close\"], length=14)\n",
+    "df[\"adx\"] = adx.iloc[:, 0]\n",
+    "df[\"di_plus\"] = adx.iloc[:, 1]\n",
+    "df[\"di_minus\"] = adx.iloc[:, 2]\n",
+    "\n",
+    "# ── Volatility ──────────────────────────────────────────────────\n",
+    "# pandas_ta orders the Bollinger frame as BBL (lower), BBM (mid), BBU (upper),\n",
+    "# BBB (bandwidth), BBP (%B) — select by name prefix so upper/lower are not swapped.\n",
+    "bbands = ta.bbands(df[\"Close\"], length=20, std=2)\n",
+    "bb_cols = {c.split(\"_\")[0]: c for c in bbands.columns}\n",
+    "df[\"bb_upper\"] = bbands[bb_cols[\"BBU\"]]\n",
+    "df[\"bb_mid\"] = bbands[bb_cols[\"BBM\"]]\n",
+    "df[\"bb_lower\"] = bbands[bb_cols[\"BBL\"]]\n",
+    "df[\"bb_width\"] = bbands[bb_cols[\"BBB\"]]\n",
+    "df[\"bb_pctb\"] = bbands[bb_cols[\"BBP\"]]   # %B: where price is within bands\n",
+    "\n",
+    "df[\"atr_14\"] = ta.atr(df[\"High\"], df[\"Low\"], df[\"Close\"], length=14)\n",
+    "df[\"atr_pct\"] = df[\"atr_14\"] / df[\"Close\"]  # normalized ATR\n",
+    "\n",
+    "# pandas_ta orders the Keltner frame as lower (KCL*), basis (KCB*), upper (KCU*);\n",
+    "# the suffix depends on the moving-average mode, so match on the prefix.\n",
+    "kc = ta.kc(df[\"High\"], df[\"Low\"], df[\"Close\"], length=20)\n",
+    "kc_upper_col = next(c for c in kc.columns if c.startswith(\"KCU\"))\n",
+    "kc_lower_col = next(c for c in kc.columns if c.startswith(\"KCL\"))\n",
+    "df[\"kc_upper\"] = kc[kc_upper_col]\n",
+    "df[\"kc_lower\"] = kc[kc_lower_col]\n",
+    "\n",
+    "# volatility: rolling std of returns\n",
+    "df[\"vol_10\"] = df[\"Close\"].pct_change().rolling(10).std()\n",
+    "df[\"vol_21\"] = df[\"Close\"].pct_change().rolling(21).std()\n",
+    "\n",
+    "# ── Volume ──────────────────────────────────────────────────────\n",
+    "df[\"obv\"] = ta.obv(df[\"Close\"], df[\"Volume\"])\n",
+    "df[\"obv_sma20\"] = ta.sma(df[\"obv\"], length=20)\n",
+    "df[\"mfi_14\"] = ta.mfi(df[\"High\"], df[\"Low\"], df[\"Close\"], df[\"Volume\"], length=14)\n",
+    "ad = ta.ad(df[\"High\"], df[\"Low\"], df[\"Close\"], df[\"Volume\"])\n",
+    "df[\"ad_line\"] = ad\n",
+    "\n",
+    "# volume relative to average\n",
+    "df[\"vol_ratio_20\"] = df[\"Volume\"] / df[\"Volume\"].rolling(20).mean()\n",
+    "\n",
+    "# ── Returns features ────────────────────────────────────────────\n",
+    "df[\"ret_1d\"] = df[\"Close\"].pct_change(1)\n",
+    "df[\"ret_5d\"] = df[\"Close\"].pct_change(5)\n",
+    "df[\"ret_10d\"] = df[\"Close\"].pct_change(10)\n",
+    "df[\"ret_21d\"] = df[\"Close\"].pct_change(21)\n",
+    "\n",
+    "print(f\"Total columns after feature engineering: {len(df.columns)}\")\n",
+    "df.tail(3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "907e377c",
+   "metadata": {},
+   "source": [
+    "## 4. Labeling — Forward Return Classification\n",
+    "\n",
+    "Target: is the N-day forward return positive? (buy signal = 1, sell/hold signal = 0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "81daaa5f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# forward return (what we're predicting)\n",
+    "df[\"fwd_ret\"] = df[\"Close\"].pct_change(HORIZON).shift(-HORIZON)\n",
+    "df[\"label\"] = (df[\"fwd_ret\"] > 0).astype(int)\n",
+    "\n",
+    "# ── Define feature columns (exclude raw OHLCV, target, and non-stationary cols)\n",
+    "EXCLUDE = {\n",
+    "    \"Open\", \"High\", \"Low\", \"Close\", \"Volume\",\n",
+    "    \"fwd_ret\", \"label\",\n",
+    "    \"sma_20\", \"sma_50\", \"sma_200\", \"ema_12\", \"ema_26\",  # non-stationary\n",
+    "    \"bb_upper\", \"bb_mid\", \"bb_lower\",                     # non-stationary\n",
+    "    \"kc_upper\", \"kc_lower\",                                # non-stationary\n",
+    "    \"obv\", \"obv_sma20\", \"ad_line\",                         # non-stationary\n",
+    "}\n",
+    "FEATURES = [c for c in df.columns if c not in EXCLUDE]\n",
+    "\n",
+    "# drop rows with NaN (from indicator warm-up + forward label)\n",
+    "model_df = df[FEATURES + [\"label\", \"fwd_ret\"]].dropna()\n",
+    "\n",
+    "print(f\"Features: {len(FEATURES)}\")\n",
+    "print(f\"Usable rows: {len(model_df)} ({model_df.index[0].date()} → {model_df.index[-1].date()})\")\n",
+    "print(f\"Label balance: {model_df['label'].value_counts(normalize=True).to_dict()}\")\n",
+    "print(f\"\\nFeature list:\\n{FEATURES}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "28769141",
+   "metadata": {},
+   "source": [
+    "## 5. Walk-Forward Split with Purge Gap\n",
+    "\n",
+    "Time series data **cannot** use random k-fold — future data would leak into training.\n",
+    "\n",
+    "We use **expanding-window walk-forward** with a **purge gap** between train/test:\n",
+    "\n",
+    "```\n",
+    "Fold 1: [====TRAIN====]--gap--[TEST]\n",
+    "Fold 2: [========TRAIN========]--gap--[TEST]\n",
+    "Fold 3: [============TRAIN============]--gap--[TEST]\n",
+    "```\n",
+    "\n",
+    "The gap prevents label leakage from overlapping forward-return windows."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "60594682",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def walk_forward_splits(n_samples: int, n_splits: int, test_size: int = 126,\n",
+    "                        purge_gap: int = 5, min_train: int = 504):\n",
+    "    \"\"\"\n",
+    "    Expanding-window walk-forward with purge gap.\n",
+    "    \n",
+    "    Yields (train_idx, test_idx) index arrays.\n",
+    "    test_size: ~6 months of trading days\n",
+    "    min_train: ~2 years of trading days\n",
+    "    purge_gap: days between train end and test start\n",
+    "    \"\"\"\n",
+    "    total_test = n_splits * test_size\n",
+    "    if min_train + total_test + n_splits * purge_gap > n_samples:\n",
+    "        raise ValueError(f\"Not enough data for {n_splits} splits. \"\n",
+    "                         f\"Need {min_train + total_test + n_splits * purge_gap}, have {n_samples}\")\n",
+    "    \n",
+    "    for i in range(n_splits):\n",
+    "        test_end = n_samples - (n_splits - 1 - i) * test_size\n",
+    "        test_start = test_end - test_size\n",
+    "        train_end = test_start - purge_gap\n",
+    "        train_start = 0  # expanding window (use max(0, train_end - fixed_window) for sliding)\n",
+    "        \n",
+    "        train_idx = np.arange(train_start, train_end)\n",
+    "        test_idx = np.arange(test_start, test_end)\n",
+    "        yield train_idx, test_idx\n",
+    "\n",
+    "\n",
+    "# ── Visualize the splits ────────────────────────────────────────\n",
+    "X = model_df[FEATURES].values\n",
+    "y = model_df[\"label\"].values\n",
+    "dates = model_df.index\n",
+    "\n",
+    "fig = go.Figure()\n",
+    "for fold, (tr_idx, te_idx) in enumerate(walk_forward_splits(len(X), N_SPLITS, purge_gap=PURGE_GAP, min_train=TRAIN_MIN)):\n",
+    "    fig.add_trace(go.Scatter(\n",
+    "        x=[dates[tr_idx[0]], dates[tr_idx[-1]]], y=[fold, fold],\n",
+    "        mode=\"lines\", line=dict(color=\"steelblue\", width=8),\n",
+    "        name=f\"Train {fold}\" if fold == 0 else None, showlegend=(fold == 0),\n",
+    "    ))\n",
+    "    fig.add_trace(go.Scatter(\n",
+    "        x=[dates[te_idx[0]], dates[te_idx[-1]]], y=[fold, fold],\n",
+    "        mode=\"lines\", line=dict(color=\"coral\", width=8),\n",
+    "        name=f\"Test {fold}\" if fold == 0 else None, showlegend=(fold == 0),\n",
+    "    ))\n",
+    "    print(f\"Fold {fold}: train {dates[tr_idx[0]].date()}→{dates[tr_idx[-1]].date()} \"\n",
+    "          f\"({len(tr_idx)}d) | test {dates[te_idx[0]].date()}→{dates[te_idx[-1]].date()} ({len(te_idx)}d)\")\n",
+    "\n",
+    "fig.update_layout(title=\"Walk-Forward Splits\", yaxis_title=\"Fold\", height=300)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a80d23c9",
+   "metadata": {},
+   "source": [
+    "## 6. Train XGBoost per Fold — Walk-Forward\n",
+    "\n",
+    "Train on expanding window, predict test fold, collect out-of-sample predictions."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca9b91e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "oos_preds = []   # out-of-sample predictions\n",
+    "oos_proba = []   # predicted probabilities\n",
+    "oos_labels = []\n",
+    "oos_dates = []\n",
+    "oos_fwd_ret = []\n",
+    "fold_metrics = []\n",
+    "\n",
+    "for fold, (tr_idx, te_idx) in enumerate(walk_forward_splits(len(X), N_SPLITS, purge_gap=PURGE_GAP, min_train=TRAIN_MIN)):\n",
+    "    X_test, y_test = X[te_idx], y[te_idx]\n",
+    "\n",
+    "    # Early stopping needs a validation set, but it must not be the test fold:\n",
+    "    # choosing the number of trees on (X_test, y_test) leaks test information into\n",
+    "    # the model and inflates the reported out-of-sample accuracy. Hold out the most\n",
+    "    # recent 10% of the training window instead, with a purge gap before it so\n",
+    "    # overlapping forward-return labels cannot straddle the fit/validation boundary.\n",
+    "    n_val = max(1, len(tr_idx) // 10)\n",
+    "    val_idx = tr_idx[-n_val:]\n",
+    "    fit_idx = tr_idx[:-(n_val + PURGE_GAP)]\n",
+    "    X_fit, y_fit = X[fit_idx], y[fit_idx]\n",
+    "    X_val, y_val = X[val_idx], y[val_idx]\n",
+    "    \n",
+    "    model = XGBClassifier(\n",
+    "        n_estimators=300,\n",
+    "        max_depth=4,\n",
+    "        learning_rate=0.05,\n",
+    "        subsample=0.8,\n",
+    "        colsample_bytree=0.8,\n",
+    "        reg_alpha=0.1,\n",
+    "        reg_lambda=1.0,\n",
+    "        random_state=42,\n",
+    "        eval_metric=\"logloss\",\n",
+    "        early_stopping_rounds=30,\n",
+    "    )\n",
+    "    model.fit(\n",
+    "        X_fit, y_fit,\n",
+    "        eval_set=[(X_val, y_val)],\n",
+    "        verbose=False,\n",
+    "    )\n",
+    "    \n",
+    "    preds = model.predict(X_test)\n",
+    "    proba = model.predict_proba(X_test)[:, 1]\n",
+    "    acc = accuracy_score(y_test, preds)\n",
+    "    \n",
+    "    oos_preds.extend(preds)\n",
+    "    oos_proba.extend(proba)\n",
+    "    oos_labels.extend(y_test)\n",
+    "    oos_dates.extend(dates[te_idx])\n",
+    "    oos_fwd_ret.extend(model_df[\"fwd_ret\"].values[te_idx])\n",
+    "    \n",
+    "    fold_metrics.append({\"fold\": fold, \"accuracy\": acc, \"train_size\": len(tr_idx), \"test_size\": len(te_idx)})\n",
+    "    print(f\"Fold {fold}: acc={acc:.3f} | train={len(tr_idx)} | test={len(te_idx)}\")\n",
+    "\n",
+    "print(f\"\\nOverall OOS accuracy: {accuracy_score(oos_labels, oos_preds):.3f}\")\n",
+    "print(classification_report(oos_labels, oos_preds, target_names=[\"SELL/HOLD\", \"BUY\"]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea7d30fb",
+   "metadata": {},
+   "source": [
+    "## 7. Feature Importance (Last Fold)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06f941b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imp = pd.Series(model.feature_importances_, index=FEATURES).sort_values(ascending=True)\n",
+    "fig = go.Figure(go.Bar(x=imp.tail(20), y=imp.tail(20).index, orientation=\"h\"))\n",
+    "fig.update_layout(title=\"Top 20 Feature Importances (last fold)\", height=500, margin=dict(l=150))\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1112fdda",
+   "metadata": {},
+   "source": [
+    "## 8. Strategy Simulation — Signal → Returns\n",
+    "\n",
+    "Convert model predictions to a strategy equity curve:\n",
+    "- **Signal = 1 (BUY)**: go long (earn the market return)\n",
+    "- **Signal = 0 (SELL/HOLD)**: stay in cash (earn 0)\n",
+    "\n",
+    "Compare against buy-and-hold benchmark."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0893ddb0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build strategy returns series from OOS predictions\n",
+    "strat = pd.DataFrame({\n",
+    "    \"date\": oos_dates,\n",
+    "    \"signal\": oos_preds,\n",
+    "    \"proba\": oos_proba,\n",
+    "    \"fwd_ret\": oos_fwd_ret,\n",
+    "}).set_index(\"date\")\n",
+    "\n",
+    "# daily close-to-close returns, aligned to the OOS prediction dates\n",
+    "daily_ret = df[\"Close\"].pct_change().reindex(strat.index)\n",
+    "\n",
+    "# The signal for day t is computed from features that include day t's close, so it\n",
+    "# can only be acted on from the next bar. Lag it by one day before applying it to\n",
+    "# returns; multiplying by the same-day return would be look-ahead bias.\n",
+    "position = strat[\"signal\"].shift(1).fillna(0)\n",
+    "\n",
+    "# strategy return: market return when in position, 0 when in cash\n",
+    "strat[\"strat_ret\"] = daily_ret * position\n",
+    "strat[\"bench_ret\"] = daily_ret\n",
+    "\n",
+    "# cumulative\n",
+    "strat[\"strat_equity\"] = (1 + strat[\"strat_ret\"]).cumprod()\n",
+    "strat[\"bench_equity\"] = (1 + strat[\"bench_ret\"]).cumprod()\n",
+    "\n",
+    "# plot\n",
+    "fig = go.Figure()\n",
+    "fig.add_trace(go.Scatter(x=strat.index, y=strat[\"strat_equity\"], name=\"Strategy\", line=dict(color=\"steelblue\")))\n",
+    "fig.add_trace(go.Scatter(x=strat.index, y=strat[\"bench_equity\"], name=\"Buy & Hold\", line=dict(color=\"gray\", dash=\"dot\")))\n",
+    "\n",
+    "# shade buy signals\n",
+    "in_market = strat[\"signal\"] == 1\n",
+    "changes = in_market.astype(int).diff().fillna(0)\n",
+    "entries = strat.index[changes == 1]\n",
+    "exits = strat.index[changes == -1]\n",
+    "# align: if first signal is 1, start from beginning\n",
+    "if in_market.iloc[0]:\n",
+    "    entries = entries.insert(0, strat.index[0])\n",
+    "if in_market.iloc[-1]:\n",
+    "    exits = exits.append(pd.DatetimeIndex([strat.index[-1]]))\n",
+    "for ent, ext in zip(entries, exits):\n",
+    "    fig.add_vrect(x0=ent, x1=ext, fillcolor=\"green\", opacity=0.07, line_width=0)\n",
+    "\n",
+    "fig.update_layout(\n",
+    "    title=\"Strategy vs Buy & Hold (OOS)\",\n",
+    "    yaxis_title=\"Equity ($1 start)\", height=450,\n",
+    ")\n",
+    "fig.show()\n",
+    "\n",
+    "print(f\"Strategy final: ${strat['strat_equity'].iloc[-1]:.2f}\")\n",
+    "print(f\"Benchmark final: ${strat['bench_equity'].iloc[-1]:.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d757116a",
+   "metadata": {},
+   "source": [
+    "## 9. QuantStats Tearsheet\n",
+    "\n",
+    "Full performance report: Sharpe, Sortino, max drawdown, rolling metrics, monthly heatmap."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34fdc588",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# quantstats expects a returns series with datetime index\n",
+    "strategy_returns = strat[\"strat_ret\"].copy()\n",
+    "strategy_returns.index = pd.DatetimeIndex(strategy_returns.index)\n",
+    "benchmark_returns = strat[\"bench_ret\"].copy()\n",
+    "benchmark_returns.index = pd.DatetimeIndex(benchmark_returns.index)\n",
+    "\n",
+    "qs.extend_pandas()\n",
+    "\n",
+    "# key metrics\n",
+    "print(\"=\" * 50)\n",
+    "print(\"STRATEGY METRICS (out-of-sample)\")\n",
+    "print(\"=\" * 50)\n",
+    "print(f\"Sharpe:       {qs.stats.sharpe(strategy_returns):.2f}\")\n",
+    "print(f\"Sortino:      {qs.stats.sortino(strategy_returns):.2f}\")\n",
+    "print(f\"Max Drawdown: {qs.stats.max_drawdown(strategy_returns):.2%}\")\n",
+    "print(f\"CAGR:         {qs.stats.cagr(strategy_returns):.2%}\")\n",
+    "print(f\"Calmar:       {qs.stats.calmar(strategy_returns):.2f}\")\n",
+    "print(f\"Win Rate:     {qs.stats.win_rate(strategy_returns):.2%}\")\n",
+    "print(f\"Volatility:   {qs.stats.volatility(strategy_returns):.2%}\")\n",
+    "print(f\"Avg Win:      {qs.stats.avg_win(strategy_returns):.4f}\")\n",
+    "print(f\"Avg Loss:     {qs.stats.avg_loss(strategy_returns):.4f}\")\n",
+    "print(f\"Profit Factor:{qs.stats.profit_factor(strategy_returns):.2f}\")\n",
+    "print(\"=\" * 50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6799c588",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# full HTML tearsheet — saved to file + displayed inline\n",
+    "qs.reports.html(strategy_returns, benchmark=benchmark_returns,\n",
+    "                title=f\"{TICKER} ML Signal Strategy (OOS Walk-Forward)\",\n",
+    "                output=\"tearsheet.html\")\n",
+    "print(\"Tearsheet saved to tearsheet.html\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4bb838bb",
+   "metadata": {},
+   "source": [
+    "## 10. Signal Dashboard — Price + Indicators + Buy/Sell Signals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "67cae2a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# show last fold's test period with signals overlaid on price\n",
+    "last_test_dates = strat.index[-126:]  # last ~6 months\n",
+    "viz = df.loc[last_test_dates].copy()\n",
+    "sig = strat.loc[last_test_dates]\n",
+    "\n",
+    "fig = make_subplots(\n",
+    "    rows=4, cols=1, shared_xaxes=True,\n",
+    "    row_heights=[0.4, 0.2, 0.2, 0.2],\n",
+    "    vertical_spacing=0.03,\n",
+    "    subplot_titles=[\"Price + Bollinger Bands + Signals\", \"RSI(14)\", \"MACD\", \"Volume\"]\n",
+    ")\n",
+    "\n",
+    "# Row 1: Candlestick + BB + signals\n",
+    "fig.add_trace(go.Candlestick(\n",
+    "    x=viz.index, open=viz[\"Open\"], high=viz[\"High\"], low=viz[\"Low\"], close=viz[\"Close\"],\n",
+    "    name=\"OHLC\", increasing_line_color=\"steelblue\", decreasing_line_color=\"salmon\",\n",
+    "), row=1, col=1)\n",
+    "fig.add_trace(go.Scatter(x=viz.index, y=viz[\"bb_upper\"], line=dict(color=\"gray\", width=1, dash=\"dot\"), name=\"BB Upper\"), row=1, col=1)\n",
+    "fig.add_trace(go.Scatter(x=viz.index, y=viz[\"bb_lower\"], line=dict(color=\"gray\", width=1, dash=\"dot\"), name=\"BB Lower\", fill=\"tonexty\", fillcolor=\"rgba(128,128,128,0.05)\"), row=1, col=1)\n",
+    "fig.add_trace(go.Scatter(x=viz.index, y=viz[\"sma_50\"], line=dict(color=\"orange\", width=1), name=\"SMA 50\"), row=1, col=1)\n",
+    "\n",
+    "# buy/sell markers\n",
+    "buy_mask = sig[\"signal\"] == 1\n",
+    "changes = buy_mask.astype(int).diff()\n",
+    "buy_entries = sig.index[changes == 1]\n",
+    "sell_entries = sig.index[changes == -1]\n",
+    "if len(buy_entries):\n",
+    "    fig.add_trace(go.Scatter(x=buy_entries, y=viz.loc[buy_entries, \"Low\"] * 0.995,\n",
+    "        mode=\"markers\", marker=dict(symbol=\"triangle-up\", size=10, color=\"green\"), name=\"BUY\"), row=1, col=1)\n",
+    "if len(sell_entries):\n",
+    "    fig.add_trace(go.Scatter(x=sell_entries, y=viz.loc[sell_entries, \"High\"] * 1.005,\n",
+    "        mode=\"markers\", marker=dict(symbol=\"triangle-down\", size=10, color=\"red\"), name=\"SELL\"), row=1, col=1)\n",
+    "\n",
+    "# Row 2: RSI\n",
+    "fig.add_trace(go.Scatter(x=viz.index, y=viz[\"rsi_14\"], line=dict(color=\"purple\", width=1.5), name=\"RSI 14\"), row=2, col=1)\n",
+    "fig.add_hline(y=70, line_dash=\"dash\", line_color=\"red\", opacity=0.5, row=2, col=1)\n",
+    "fig.add_hline(y=30, line_dash=\"dash\", line_color=\"green\", opacity=0.5, row=2, col=1)\n",
+    "\n",
+    "# Row 3: MACD\n",
+    "fig.add_trace(go.Scatter(x=viz.index, y=viz[\"macd\"], line=dict(color=\"blue\", width=1.5), name=\"MACD\"), row=3, col=1)\n",
+    "fig.add_trace(go.Scatter(x=viz.index, y=viz[\"macd_signal\"], line=dict(color=\"orange\", width=1), name=\"Signal\"), row=3, col=1)\n",
+    "colors = [\"green\" if v >= 0 else \"red\" for v in viz[\"macd_hist\"]]\n",
+    "fig.add_trace(go.Bar(x=viz.index, y=viz[\"macd_hist\"], marker_color=colors, name=\"Hist\", opacity=0.5), row=3, col=1)\n",
+    "\n",
+    "# Row 4: Volume\n",
+    "fig.add_trace(go.Bar(x=viz.index, y=viz[\"Volume\"], marker_color=\"steelblue\", name=\"Volume\", opacity=0.5), row=4, col=1)\n",
+    "fig.add_trace(go.Scatter(x=viz.index, y=viz[\"Volume\"].rolling(20).mean(), line=dict(color=\"orange\", width=1), name=\"Vol SMA20\"), row=4, col=1)\n",
+    "\n",
+    "fig.update_layout(height=900, title=f\"{TICKER} — Last Test Fold Signal Dashboard\", xaxis_rangeslider_visible=False, showlegend=False)\n",
+    "fig.update_xaxes(rangeslider_visible=False)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5b25b6c4",
+   "metadata": {},
+   "source": [
+    "## Next Steps\n",
+    "\n",
+    "Things to iterate on from here:\n",
+    "\n",
+    "1. **Multi-asset**: swap `TICKER` to BTC-USD, QQQ, GLD, etc. or loop over a universe\n",
+    "2. **Probability threshold**: instead of binary 0/1, use `proba > 0.6` for higher-conviction signals\n",
+    "3. **Position sizing**: Kelly criterion via `PyPortfolioOpt` based on predicted probability\n",
+    "4. **Regime filter**: add ADX/volatility regime detection — only trade in trending regimes\n",
+    "5. **Transaction costs**: subtract realistic slippage (e.g., 5bps per trade) from returns\n",
+    "6. **Alternative splitters you have installed**:\n",
+    "   - `from tscv import GapRollForward` (successor to the deprecated `GapWalkForward`) — sklearn-compatible, handles gap + purge natively\n",
+    "   - `from sktime.split import ExpandingWindowSplitter, SlidingWindowSplitter`\n",
+    "   - `from sklearn.model_selection import TimeSeriesSplit` — basic but solid\n",
+    "7. **LightGBM**: drop-in replacement for XGBoost, often faster on large feature sets\n",
+    "8. **Meta-labeling** (Lopez de Prado): train a secondary model on whether the primary model's signals are correct"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}