chore: initial project scaffold for quant trading learning

Sets up project structure with yfinance-based OHLCV fetcher for top
100 S&P companies, Jupyter notebook scaffold, and uv-managed deps.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-25 00:58:02 +08:00
commit 60fa0c113d
7 changed files with 5406 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
3.12
View File
+67
View File
@@ -0,0 +1,67 @@
import pandas as pd
import yfinance as yf
import pyarrow.parquet as pq
from pathlib import Path
import time
# Ticker universe downloaded by fetch_top100_ohlcv, ten symbols per row.
# NOTE(review): "ADI" appears twice (7th and 9th rows), so the list has 100
# entries but only 99 distinct symbols — confirm which ticker was intended.
TOP100_TICKERS = [
"AAPL", "MSFT", "NVDA", "AMZN", "META", "GOOGL", "GOOG", "TSLA", "BRK-B", "JPM",
"LLY", "V", "UNH", "XOM", "MA", "AVGO", "PG", "HD", "COST", "JNJ",
"ABBV", "MRK", "PEP", "KO", "CVX", "NFLX", "ADBE", "CRM", "AMD", "ACN",
"MCD", "WMT", "BAC", "CSCO", "TMO", "ABT", "DHR", "CMCSA", "UPS", "LIN",
"NEE", "PM", "TXN", "HON", "ORCL", "QCOM", "RTX", "INTU", "AMGN", "LOW",
"AMAT", "BMY", "UNP", "AXP", "CAT", "BKNG", "SPGI", "VRTX", "DE", "GILD",
"ELV", "ISRG", "ADP", "MDT", "LMT", "CVS", "CI", "ADI", "SYK", "SBUX",
"MMC", "C", "TJX", "GS", "BLK", "SCHW", "ZTS", "ETN", "PLD", "BDX",
"REGN", "ADI", "CL", "CME", "CB", "SO", "DUK", "NOC", "FIS", "ITW",
"APD", "NSC", "BSX", "GD", "FI", "MMM", "PGR", "WM", "AFL", "SLB",
]
def fetch_top100_ohlcv(output_path: str | Path, period: str = "max") -> None:
    """Download OHLCV history for every ticker in TOP100_TICKERS into one Parquet file.

    Each ticker is fetched on its own via ``yfinance.Ticker.history`` with
    ``auto_adjust=False``. The per-ticker frames get a ``ticker`` column, are
    concatenated, and are written to ``<output_path>/top100_companies.parquet``.
    The ``Date`` column is stored as strings. Tickers that return no rows or
    raise are collected and reported at the end instead of aborting the run.

    Args:
        output_path: Directory to write into; created (with parents) if missing.
        period: History window passed through to yfinance.

    Returns:
        None. If nothing could be fetched, no file is written.
    """
    out_dir = Path(output_path)
    out_dir.mkdir(parents=True, exist_ok=True)

    # De-duplicate while preserving order: a symbol listed twice would otherwise
    # be downloaded twice and have all of its rows written twice.
    tickers = list(dict.fromkeys(TOP100_TICKERS))
    total = len(tickers)

    print(f"Fetching OHLCV data for top 100 companies (period={period})...")
    frames = []
    errors = []
    for i, ticker in enumerate(tickers, 1):
        print(f"[{i}/{total}] Downloading {ticker}...", end=" ")
        try:
            hist = yf.Ticker(ticker).history(period=period, auto_adjust=False)
            if hist.empty:
                print("no data, skipping")
                errors.append((ticker, "no data"))
                continue
            hist = hist.reset_index()
            hist["Date"] = hist["Date"].astype(str)
            hist["ticker"] = ticker
            frames.append(hist)
            print(f"{len(hist)} rows")
            time.sleep(0.1)  # brief pause between successful downloads
        except Exception as e:
            # Deliberate per-ticker boundary: one failing symbol must not
            # abort the whole batch, so record it and move on.
            print(f"error: {e}")
            errors.append((ticker, str(e)))

    if not frames:
        print("No data fetched.")
        return

    df = pd.concat(frames, ignore_index=True)
    wanted = ["ticker", "Date", "Open", "High", "Low", "Close", "Volume", "Dividends", "Stock Splits"]
    # Keep only the columns that actually came back, in a stable order.
    df = df[[c for c in wanted if c in df.columns]]

    parquet_path = out_dir / "top100_companies.parquet"
    df.to_parquet(parquet_path, index=False)

    print(f"\nSaved {len(df)} rows to {parquet_path}")
    print(f"Date range: {df['Date'].min()} -> {df['Date'].max()}")
    print(f"Tickers fetched: {df['ticker'].nunique()}")
    if errors:
        print(f"Errors ({len(errors)}): {errors[:10]}")


if __name__ == "__main__":
    fetch_top100_ohlcv("data/top100_ohlcv")
+6
View File
@@ -0,0 +1,6 @@
def main():
    """Print the project greeting to standard output."""
    greeting = "Hello from learn-trading!"
    print(greeting)


if __name__ == "__main__":
    main()
+51
View File
@@ -0,0 +1,51 @@
[project]
name = "learn-trading"
version = "0.0.1"
description = "Combined trading research environment"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"marimo>=0.23.2",
"arch>=8.0.0",
"backtesting>=0.6.5",
"backtrader>=1.9.78.123",
"ccxt>=4.5.50",
"empyrical-reloaded>=0.5.12",
"finta>=1.3",
"hmmlearn>=0.3.3",
"jupyterlab>=4.5.6",
"lightgbm>=4.6.0",
"lxml>=6.1.0",
"matplotlib>=3.10.8",
"mplfinance>=0.12.10b0",
"numpy>=2.2.6,<2.3",
"pandas>=2.3.3",
"pandas-ta>=0.4.71b0",
"plotly>=6.7.0",
"polars>=1.40.1",
"pyarrow>=24.0.0",
"pyportfolioopt>=1.6.0",
"qlib>=0.0.2.dev20",
"quantstats>=0.0.81",
"riskfolio-lib>=7.2.1",
"scikit-learn>=1.7.2",
"scipy>=1.17.1",
"sktime>=0.40.1",
"statsmodels>=0.14.6",
"ta>=0.11.0",
"ta-lib>=0.6.8",
"torch>=2.0.0",
"transformers>=4.0.0",
"tscv>=0.1.3",
"vectorbt>=1.0.0",
"xgboost>=3.2.0",
"yfinance>=1.3.0",
]
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true
[tool.uv.sources]
torch = { index = "pytorch-cpu" }
+636
View File
@@ -0,0 +1,636 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0229f5ae",
"metadata": {},
"source": [
"# Quant Trading Scaffold\n",
"## Data Ingestion → Indicators → Walk-Forward ML → Backtesting → Tearsheet\n",
"\n",
"Pipeline:\n",
"1. **Ingest** OHLCV data via yfinance\n",
"2. **Engineer features** — momentum, trend, volatility, volume indicators\n",
"3. **Label** — binary classification (next-N-day return > 0)\n",
"4. **Walk-forward split** with purging (no leakage)\n",
"5. **Train** XGBoost classifier per fold\n",
"6. **Evaluate** with quantstats tearsheet"
]
},
{
"cell_type": "markdown",
"id": "43b4d162",
"metadata": {},
"source": [
"## 1. Config & Imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "aab1cebb",
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'numpy'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mwarnings\u001b[39;00m\n\u001b[32m 4\u001b[39m warnings.filterwarnings(\u001b[33m\"\u001b[39m\u001b[33mignore\u001b[39m\u001b[33m\"\u001b[39m, category=\u001b[38;5;167;01mFutureWarning\u001b[39;00m)\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnp\u001b[39;00m\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpd\u001b[39;00m\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas_ta\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mta\u001b[39;00m\n",
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'numpy'"
]
}
],
"source": [
"from __future__ import annotations\n",
"\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import pandas_ta as ta\n",
"import yfinance as yf\n",
"import plotly.graph_objects as go\n",
"from plotly.subplots import make_subplots\n",
"from sklearn.model_selection import TimeSeriesSplit\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"from xgboost import XGBClassifier\n",
"import quantstats as qs\n",
"\n",
"# ── Config ──────────────────────────────────────────────────────\n",
"TICKER = \"SPY\"\n",
"START = \"2015-01-01\"\n",
"END = \"2025-12-31\"\n",
"HORIZON = 5 # predict N-day forward return\n",
"PURGE_GAP = 5 # gap between train/test to prevent leakage\n",
"N_SPLITS = 5 # walk-forward folds\n",
"TRAIN_MIN = 504 # ~2 years minimum training window\n",
"\n",
"print(f\"Config: {TICKER} | {START}→{END} | horizon={HORIZON}d | {N_SPLITS} folds\")"
]
},
{
"cell_type": "markdown",
"id": "28af2cae",
"metadata": {},
"source": [
"## 2. Data Ingestion"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4d755da",
"metadata": {},
"outputs": [],
"source": [
"raw = yf.download(TICKER, start=START, end=END, auto_adjust=True)\n",
"# yfinance may return MultiIndex columns for single ticker — flatten\n",
"if isinstance(raw.columns, pd.MultiIndex):\n",
" raw.columns = raw.columns.droplevel(\"Ticker\")\n",
"raw.index = pd.DatetimeIndex(raw.index)\n",
"df = raw.copy()\n",
"print(f\"Downloaded {len(df)} bars: {df.index[0].date()} → {df.index[-1].date()}\")\n",
"df.tail(3)"
]
},
{
"cell_type": "markdown",
"id": "e9b1bad5",
"metadata": {},
"source": [
"## 3. Feature Engineering — Technical Indicators\n",
"\n",
"We compute features across 4 categories:\n",
"- **Momentum**: RSI, MACD, Stochastic, Williams %R, ROC\n",
"- **Trend**: SMA/EMA crossovers, ADX, Ichimoku\n",
"- **Volatility**: Bollinger Bands, ATR, Keltner Channels\n",
"- **Volume**: OBV, MFI, Accumulation/Distribution"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a83bf612",
"metadata": {},
"outputs": [],
"source": [
"# ── Momentum ────────────────────────────────────────────────────\n",
"df[\"rsi_14\"] = ta.rsi(df[\"Close\"], length=14)\n",
"df[\"rsi_7\"] = ta.rsi(df[\"Close\"], length=7)\n",
"\n",
"macd = ta.macd(df[\"Close\"], fast=12, slow=26, signal=9)\n",
"# pandas_ta column order: MACD line, MACDh (histogram), MACDs (signal)\n",
"df[\"macd\"] = macd.iloc[:, 0] # MACD line\n",
"df[\"macd_hist\"] = macd.iloc[:, 1] # histogram\n",
"df[\"macd_signal\"] = macd.iloc[:, 2] # signal line\n",
"\n",
"stoch = ta.stoch(df[\"High\"], df[\"Low\"], df[\"Close\"])\n",
"df[\"stoch_k\"] = stoch.iloc[:, 0]\n",
"df[\"stoch_d\"] = stoch.iloc[:, 1]\n",
"\n",
"df[\"willr_14\"] = ta.willr(df[\"High\"], df[\"Low\"], df[\"Close\"], length=14)\n",
"df[\"roc_10\"] = ta.roc(df[\"Close\"], length=10)\n",
"df[\"roc_21\"] = ta.roc(df[\"Close\"], length=21)\n",
"df[\"mom_10\"] = ta.mom(df[\"Close\"], length=10)\n",
"\n",
"# ── Trend ───────────────────────────────────────────────────────\n",
"df[\"sma_20\"] = ta.sma(df[\"Close\"], length=20)\n",
"df[\"sma_50\"] = ta.sma(df[\"Close\"], length=50)\n",
"df[\"sma_200\"] = ta.sma(df[\"Close\"], length=200)\n",
"df[\"ema_12\"] = ta.ema(df[\"Close\"], length=12)\n",
"df[\"ema_26\"] = ta.ema(df[\"Close\"], length=26)\n",
"\n",
"# crossover features (price relative to MAs)\n",
"df[\"close_over_sma20\"] = (df[\"Close\"] / df[\"sma_20\"]) - 1\n",
"df[\"close_over_sma50\"] = (df[\"Close\"] / df[\"sma_50\"]) - 1\n",
"df[\"close_over_sma200\"] = (df[\"Close\"] / df[\"sma_200\"]) - 1\n",
"df[\"sma20_over_sma50\"] = (df[\"sma_20\"] / df[\"sma_50\"]) - 1\n",
"df[\"sma50_over_sma200\"] = (df[\"sma_50\"] / df[\"sma_200\"]) - 1\n",
"\n",
"adx = ta.adx(df[\"High\"], df[\"Low\"], df[\"Close\"], length=14)\n",
"df[\"adx\"] = adx.iloc[:, 0]\n",
"df[\"di_plus\"] = adx.iloc[:, 1]\n",
"df[\"di_minus\"] = adx.iloc[:, 2]\n",
"\n",
"# ── Volatility ──────────────────────────────────────────────────\n",
"bbands = ta.bbands(df[\"Close\"], length=20, std=2)\n",
"# pandas_ta column order: lower, mid, upper, bandwidth, percent\n",
"df[\"bb_lower\"] = bbands.iloc[:, 0]\n",
"df[\"bb_mid\"] = bbands.iloc[:, 1]\n",
"df[\"bb_upper\"] = bbands.iloc[:, 2]\n",
"df[\"bb_width\"] = bbands.iloc[:, 3]\n",
"df[\"bb_pctb\"] = bbands.iloc[:, 4] # %B: where price is within bands\n",
"\n",
"df[\"atr_14\"] = ta.atr(df[\"High\"], df[\"Low\"], df[\"Close\"], length=14)\n",
"df[\"atr_pct\"] = df[\"atr_14\"] / df[\"Close\"] # normalized ATR\n",
"\n",
"kc = ta.kc(df[\"High\"], df[\"Low\"], df[\"Close\"], length=20)\n",
"# pandas_ta column order: lower, basis, upper\n",
"df[\"kc_lower\"] = kc.iloc[:, 0]\n",
"df[\"kc_upper\"] = kc.iloc[:, 2]\n",
"\n",
"# volatility: rolling std of returns\n",
"df[\"vol_10\"] = df[\"Close\"].pct_change().rolling(10).std()\n",
"df[\"vol_21\"] = df[\"Close\"].pct_change().rolling(21).std()\n",
"\n",
"# ── Volume ──────────────────────────────────────────────────────\n",
"df[\"obv\"] = ta.obv(df[\"Close\"], df[\"Volume\"])\n",
"df[\"obv_sma20\"] = ta.sma(df[\"obv\"], length=20)\n",
"df[\"mfi_14\"] = ta.mfi(df[\"High\"], df[\"Low\"], df[\"Close\"], df[\"Volume\"], length=14)\n",
"ad = ta.ad(df[\"High\"], df[\"Low\"], df[\"Close\"], df[\"Volume\"])\n",
"df[\"ad_line\"] = ad\n",
"\n",
"# volume relative to average\n",
"df[\"vol_ratio_20\"] = df[\"Volume\"] / df[\"Volume\"].rolling(20).mean()\n",
"\n",
"# ── Returns features ────────────────────────────────────────────\n",
"df[\"ret_1d\"] = df[\"Close\"].pct_change(1)\n",
"df[\"ret_5d\"] = df[\"Close\"].pct_change(5)\n",
"df[\"ret_10d\"] = df[\"Close\"].pct_change(10)\n",
"df[\"ret_21d\"] = df[\"Close\"].pct_change(21)\n",
"\n",
"print(f\"Total columns after feature engineering: {len(df.columns)}\")\n",
"df.tail(3)"
]
},
{
"cell_type": "markdown",
"id": "907e377c",
"metadata": {},
"source": [
"## 4. Labeling — Forward Return Classification\n",
"\n",
"Target: is the N-day forward return positive? (buy signal = 1, sell/hold signal = 0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "81daaa5f",
"metadata": {},
"outputs": [],
"source": [
"# forward return (what we're predicting)\n",
"df[\"fwd_ret\"] = df[\"Close\"].pct_change(HORIZON).shift(-HORIZON)\n",
"df[\"label\"] = (df[\"fwd_ret\"] > 0).astype(int)\n",
"\n",
"# ── Define feature columns (exclude raw OHLCV, target, and non-stationary cols)\n",
"EXCLUDE = {\n",
" \"Open\", \"High\", \"Low\", \"Close\", \"Volume\",\n",
" \"fwd_ret\", \"label\",\n",
" \"sma_20\", \"sma_50\", \"sma_200\", \"ema_12\", \"ema_26\", # non-stationary\n",
" \"bb_upper\", \"bb_mid\", \"bb_lower\", # non-stationary\n",
" \"kc_upper\", \"kc_lower\", # non-stationary\n",
" \"obv\", \"obv_sma20\", \"ad_line\", # non-stationary\n",
"}\n",
"FEATURES = [c for c in df.columns if c not in EXCLUDE]\n",
"\n",
"# drop rows with NaN (from indicator warm-up + forward label)\n",
"model_df = df[FEATURES + [\"label\", \"fwd_ret\"]].dropna()\n",
"\n",
"print(f\"Features: {len(FEATURES)}\")\n",
"print(f\"Usable rows: {len(model_df)} ({model_df.index[0].date()} → {model_df.index[-1].date()})\")\n",
"print(f\"Label balance: {model_df['label'].value_counts(normalize=True).to_dict()}\")\n",
"print(f\"\\nFeature list:\\n{FEATURES}\")"
]
},
{
"cell_type": "markdown",
"id": "28769141",
"metadata": {},
"source": [
"## 5. Walk-Forward Split with Purge Gap\n",
"\n",
"Time series data **cannot** use random k-fold — future data would leak into training.\n",
"\n",
"We use **expanding-window walk-forward** with a **purge gap** between train/test:\n",
"\n",
"```\n",
"Fold 1: [====TRAIN====]--gap--[TEST]\n",
"Fold 2: [========TRAIN========]--gap--[TEST]\n",
"Fold 3: [============TRAIN============]--gap--[TEST]\n",
"```\n",
"\n",
"The gap prevents label leakage from overlapping forward-return windows."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "60594682",
"metadata": {},
"outputs": [],
"source": [
"def walk_forward_splits(n_samples: int, n_splits: int, test_size: int = 126,\n",
" purge_gap: int = 5, min_train: int = 504):\n",
" \"\"\"\n",
" Expanding-window walk-forward with purge gap.\n",
" \n",
" Yields (train_idx, test_idx) index arrays.\n",
" test_size: ~6 months of trading days\n",
" min_train: ~2 years of trading days\n",
" purge_gap: days between train end and test start\n",
" \"\"\"\n",
" total_test = n_splits * test_size\n",
" if min_train + total_test + n_splits * purge_gap > n_samples:\n",
" raise ValueError(f\"Not enough data for {n_splits} splits. \"\n",
" f\"Need {min_train + total_test + n_splits * purge_gap}, have {n_samples}\")\n",
" \n",
" for i in range(n_splits):\n",
" test_end = n_samples - (n_splits - 1 - i) * test_size\n",
" test_start = test_end - test_size\n",
" train_end = test_start - purge_gap\n",
" train_start = 0 # expanding window (use max(0, train_end - fixed_window) for sliding)\n",
" \n",
" train_idx = np.arange(train_start, train_end)\n",
" test_idx = np.arange(test_start, test_end)\n",
" yield train_idx, test_idx\n",
"\n",
"\n",
"# ── Visualize the splits ────────────────────────────────────────\n",
"X = model_df[FEATURES].values\n",
"y = model_df[\"label\"].values\n",
"dates = model_df.index\n",
"\n",
"fig = go.Figure()\n",
"for fold, (tr_idx, te_idx) in enumerate(walk_forward_splits(len(X), N_SPLITS, purge_gap=PURGE_GAP, min_train=TRAIN_MIN)):\n",
" fig.add_trace(go.Scatter(\n",
" x=[dates[tr_idx[0]], dates[tr_idx[-1]]], y=[fold, fold],\n",
" mode=\"lines\", line=dict(color=\"steelblue\", width=8),\n",
" name=f\"Train {fold}\" if fold == 0 else None, showlegend=(fold == 0),\n",
" ))\n",
" fig.add_trace(go.Scatter(\n",
" x=[dates[te_idx[0]], dates[te_idx[-1]]], y=[fold, fold],\n",
" mode=\"lines\", line=dict(color=\"coral\", width=8),\n",
" name=f\"Test {fold}\" if fold == 0 else None, showlegend=(fold == 0),\n",
" ))\n",
" print(f\"Fold {fold}: train {dates[tr_idx[0]].date()}→{dates[tr_idx[-1]].date()} \"\n",
" f\"({len(tr_idx)}d) | test {dates[te_idx[0]].date()}→{dates[te_idx[-1]].date()} ({len(te_idx)}d)\")\n",
"\n",
"fig.update_layout(title=\"Walk-Forward Splits\", yaxis_title=\"Fold\", height=300)\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"id": "a80d23c9",
"metadata": {},
"source": [
"## 6. Train XGBoost per Fold — Walk-Forward\n",
"\n",
"Train on expanding window, predict test fold, collect out-of-sample predictions."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca9b91e6",
"metadata": {},
"outputs": [],
"source": [
"oos_preds = [] # out-of-sample predictions\n",
"oos_proba = [] # predicted probabilities\n",
"oos_labels = []\n",
"oos_dates = []\n",
"oos_fwd_ret = []\n",
"fold_metrics = []\n",
"\n",
"for fold, (tr_idx, te_idx) in enumerate(walk_forward_splits(len(X), N_SPLITS, purge_gap=PURGE_GAP, min_train=TRAIN_MIN)):\n",
"    # Early stopping needs a validation set. Using the test fold for it would pick\n",
"    # the tree count on the very data the OOS metrics are reported on, so hold out\n",
"    # the tail of the training window instead, purged so labels do not overlap.\n",
"    n_val = max(1, len(tr_idx) // 5)\n",
"    fit_idx = tr_idx[:-(n_val + PURGE_GAP)]\n",
"    val_idx = tr_idx[-n_val:]\n",
"    X_train, y_train = X[fit_idx], y[fit_idx]\n",
"    X_val, y_val = X[val_idx], y[val_idx]\n",
"    X_test, y_test = X[te_idx], y[te_idx]\n",
"\n",
"    model = XGBClassifier(\n",
"        n_estimators=300,\n",
"        max_depth=4,\n",
"        learning_rate=0.05,\n",
"        subsample=0.8,\n",
"        colsample_bytree=0.8,\n",
"        reg_alpha=0.1,\n",
"        reg_lambda=1.0,\n",
"        random_state=42,\n",
"        eval_metric=\"logloss\",\n",
"        early_stopping_rounds=30,\n",
"    )\n",
"    model.fit(\n",
"        X_train, y_train,\n",
"        eval_set=[(X_val, y_val)],\n",
"        verbose=False,\n",
"    )\n",
"\n",
"    preds = model.predict(X_test)\n",
"    proba = model.predict_proba(X_test)[:, 1]\n",
"    acc = accuracy_score(y_test, preds)\n",
"\n",
"    oos_preds.extend(preds)\n",
"    oos_proba.extend(proba)\n",
"    oos_labels.extend(y_test)\n",
"    oos_dates.extend(dates[te_idx])\n",
"    oos_fwd_ret.extend(model_df[\"fwd_ret\"].values[te_idx])\n",
"\n",
"    fold_metrics.append({\"fold\": fold, \"accuracy\": acc, \"train_size\": len(fit_idx), \"test_size\": len(te_idx)})\n",
"    print(f\"Fold {fold}: acc={acc:.3f} | train={len(fit_idx)} | test={len(te_idx)}\")\n",
"\n",
"print(f\"\\nOverall OOS accuracy: {accuracy_score(oos_labels, oos_preds):.3f}\")\n",
"print(classification_report(oos_labels, oos_preds, target_names=[\"SELL/HOLD\", \"BUY\"]))"
]
},
{
"cell_type": "markdown",
"id": "ea7d30fb",
"metadata": {},
"source": [
"## 7. Feature Importance (Last Fold)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "06f941b8",
"metadata": {},
"outputs": [],
"source": [
"imp = pd.Series(model.feature_importances_, index=FEATURES).sort_values(ascending=True)\n",
"fig = go.Figure(go.Bar(x=imp.tail(20), y=imp.tail(20).index, orientation=\"h\"))\n",
"fig.update_layout(title=\"Top 20 Feature Importances (last fold)\", height=500, margin=dict(l=150))\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"id": "1112fdda",
"metadata": {},
"source": [
"## 8. Strategy Simulation — Signal → Returns\n",
"\n",
"Convert model predictions to a strategy equity curve:\n",
"- **Signal = 1 (BUY)**: go long (earn the market return)\n",
"- **Signal = 0 (SELL/HOLD)**: stay in cash (earn 0)\n",
"\n",
"Compare against buy-and-hold benchmark."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0893ddb0",
"metadata": {},
"outputs": [],
"source": [
"# Build strategy returns series from OOS predictions\n",
"strat = pd.DataFrame({\n",
"    \"date\": oos_dates,\n",
"    \"signal\": oos_preds,\n",
"    \"proba\": oos_proba,\n",
"    \"fwd_ret\": oos_fwd_ret,\n",
"}).set_index(\"date\")\n",
"\n",
"# daily returns: we use daily close-to-close returns, masked by signal\n",
"# align with actual daily returns (not forward returns) for proper equity curve\n",
"daily_ret = df[\"Close\"].pct_change().reindex(strat.index)\n",
"\n",
"# The signal for day t is built from data through the close of day t, so it can only\n",
"# be traded from the next bar. Lag it one day; multiplying it by the same day's\n",
"# return would be look-ahead. The first OOS day has no prior signal -> stay in cash.\n",
"position = strat[\"signal\"].shift(1).fillna(0)\n",
"\n",
"# strategy return: market return when holding a position, 0 when in cash\n",
"strat[\"strat_ret\"] = daily_ret * position\n",
"strat[\"bench_ret\"] = daily_ret\n",
"\n",
"# cumulative\n",
"strat[\"strat_equity\"] = (1 + strat[\"strat_ret\"]).cumprod()\n",
"strat[\"bench_equity\"] = (1 + strat[\"bench_ret\"]).cumprod()\n",
"\n",
"# plot\n",
"fig = go.Figure()\n",
"fig.add_trace(go.Scatter(x=strat.index, y=strat[\"strat_equity\"], name=\"Strategy\", line=dict(color=\"steelblue\")))\n",
"fig.add_trace(go.Scatter(x=strat.index, y=strat[\"bench_equity\"], name=\"Buy & Hold\", line=dict(color=\"gray\", dash=\"dot\")))\n",
"\n",
"# shade buy signals\n",
"in_market = strat[\"signal\"] == 1\n",
"changes = in_market.astype(int).diff().fillna(0)\n",
"entries = strat.index[changes == 1]\n",
"exits = strat.index[changes == -1]\n",
"# align: if first signal is 1, start from beginning\n",
"if in_market.iloc[0]:\n",
"    entries = entries.insert(0, strat.index[0])\n",
"if in_market.iloc[-1]:\n",
"    exits = exits.append(pd.DatetimeIndex([strat.index[-1]]))\n",
"for ent, ext in zip(entries, exits):\n",
"    fig.add_vrect(x0=ent, x1=ext, fillcolor=\"green\", opacity=0.07, line_width=0)\n",
"\n",
"fig.update_layout(\n",
"    title=\"Strategy vs Buy & Hold (OOS)\",\n",
"    yaxis_title=\"Equity ($1 start)\", height=450,\n",
")\n",
"fig.show()\n",
"\n",
"print(f\"Strategy final: ${strat['strat_equity'].iloc[-1]:.2f}\")\n",
"print(f\"Benchmark final: ${strat['bench_equity'].iloc[-1]:.2f}\")"
]
},
{
"cell_type": "markdown",
"id": "d757116a",
"metadata": {},
"source": [
"## 9. QuantStats Tearsheet\n",
"\n",
"Full performance report: Sharpe, Sortino, max drawdown, rolling metrics, monthly heatmap."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "34fdc588",
"metadata": {},
"outputs": [],
"source": [
"# quantstats expects a returns series with datetime index\n",
"strategy_returns = strat[\"strat_ret\"].copy()\n",
"strategy_returns.index = pd.DatetimeIndex(strategy_returns.index)\n",
"benchmark_returns = strat[\"bench_ret\"].copy()\n",
"benchmark_returns.index = pd.DatetimeIndex(benchmark_returns.index)\n",
"\n",
"qs.extend_pandas()\n",
"\n",
"# key metrics\n",
"print(\"=\" * 50)\n",
"print(\"STRATEGY METRICS (out-of-sample)\")\n",
"print(\"=\" * 50)\n",
"print(f\"Sharpe: {qs.stats.sharpe(strategy_returns):.2f}\")\n",
"print(f\"Sortino: {qs.stats.sortino(strategy_returns):.2f}\")\n",
"print(f\"Max Drawdown: {qs.stats.max_drawdown(strategy_returns):.2%}\")\n",
"print(f\"CAGR: {qs.stats.cagr(strategy_returns):.2%}\")\n",
"print(f\"Calmar: {qs.stats.calmar(strategy_returns):.2f}\")\n",
"print(f\"Win Rate: {qs.stats.win_rate(strategy_returns):.2%}\")\n",
"print(f\"Volatility: {qs.stats.volatility(strategy_returns):.2%}\")\n",
"print(f\"Avg Win: {qs.stats.avg_win(strategy_returns):.4f}\")\n",
"print(f\"Avg Loss: {qs.stats.avg_loss(strategy_returns):.4f}\")\n",
"print(f\"Profit Factor:{qs.stats.profit_factor(strategy_returns):.2f}\")\n",
"print(\"=\" * 50)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6799c588",
"metadata": {},
"outputs": [],
"source": [
"# full HTML tearsheet — saved to file + displayed inline\n",
"qs.reports.html(strategy_returns, benchmark=benchmark_returns,\n",
" title=f\"{TICKER} ML Signal Strategy (OOS Walk-Forward)\",\n",
" output=\"tearsheet.html\")\n",
"print(\"Tearsheet saved to tearsheet.html\")"
]
},
{
"cell_type": "markdown",
"id": "4bb838bb",
"metadata": {},
"source": [
"## 10. Signal Dashboard — Price + Indicators + Buy/Sell Signals"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "67cae2a4",
"metadata": {},
"outputs": [],
"source": [
"# show last fold's test period with signals overlaid on price\n",
"last_test_dates = strat.index[-126:] # last ~6 months\n",
"viz = df.loc[last_test_dates].copy()\n",
"sig = strat.loc[last_test_dates]\n",
"\n",
"fig = make_subplots(\n",
" rows=4, cols=1, shared_xaxes=True,\n",
" row_heights=[0.4, 0.2, 0.2, 0.2],\n",
" vertical_spacing=0.03,\n",
" subplot_titles=[\"Price + Bollinger Bands + Signals\", \"RSI(14)\", \"MACD\", \"Volume\"]\n",
")\n",
"\n",
"# Row 1: Candlestick + BB + signals\n",
"fig.add_trace(go.Candlestick(\n",
" x=viz.index, open=viz[\"Open\"], high=viz[\"High\"], low=viz[\"Low\"], close=viz[\"Close\"],\n",
" name=\"OHLC\", increasing_line_color=\"steelblue\", decreasing_line_color=\"salmon\",\n",
"), row=1, col=1)\n",
"fig.add_trace(go.Scatter(x=viz.index, y=viz[\"bb_upper\"], line=dict(color=\"gray\", width=1, dash=\"dot\"), name=\"BB Upper\"), row=1, col=1)\n",
"fig.add_trace(go.Scatter(x=viz.index, y=viz[\"bb_lower\"], line=dict(color=\"gray\", width=1, dash=\"dot\"), name=\"BB Lower\", fill=\"tonexty\", fillcolor=\"rgba(128,128,128,0.05)\"), row=1, col=1)\n",
"fig.add_trace(go.Scatter(x=viz.index, y=viz[\"sma_50\"], line=dict(color=\"orange\", width=1), name=\"SMA 50\"), row=1, col=1)\n",
"\n",
"# buy/sell markers\n",
"buy_mask = sig[\"signal\"] == 1\n",
"changes = buy_mask.astype(int).diff()\n",
"buy_entries = sig.index[changes == 1]\n",
"sell_entries = sig.index[changes == -1]\n",
"if len(buy_entries):\n",
" fig.add_trace(go.Scatter(x=buy_entries, y=viz.loc[buy_entries, \"Low\"] * 0.995,\n",
" mode=\"markers\", marker=dict(symbol=\"triangle-up\", size=10, color=\"green\"), name=\"BUY\"), row=1, col=1)\n",
"if len(sell_entries):\n",
" fig.add_trace(go.Scatter(x=sell_entries, y=viz.loc[sell_entries, \"High\"] * 1.005,\n",
" mode=\"markers\", marker=dict(symbol=\"triangle-down\", size=10, color=\"red\"), name=\"SELL\"), row=1, col=1)\n",
"\n",
"# Row 2: RSI\n",
"fig.add_trace(go.Scatter(x=viz.index, y=viz[\"rsi_14\"], line=dict(color=\"purple\", width=1.5), name=\"RSI 14\"), row=2, col=1)\n",
"fig.add_hline(y=70, line_dash=\"dash\", line_color=\"red\", opacity=0.5, row=2, col=1)\n",
"fig.add_hline(y=30, line_dash=\"dash\", line_color=\"green\", opacity=0.5, row=2, col=1)\n",
"\n",
"# Row 3: MACD\n",
"fig.add_trace(go.Scatter(x=viz.index, y=viz[\"macd\"], line=dict(color=\"blue\", width=1.5), name=\"MACD\"), row=3, col=1)\n",
"fig.add_trace(go.Scatter(x=viz.index, y=viz[\"macd_signal\"], line=dict(color=\"orange\", width=1), name=\"Signal\"), row=3, col=1)\n",
"colors = [\"green\" if v >= 0 else \"red\" for v in viz[\"macd_hist\"]]\n",
"fig.add_trace(go.Bar(x=viz.index, y=viz[\"macd_hist\"], marker_color=colors, name=\"Hist\", opacity=0.5), row=3, col=1)\n",
"\n",
"# Row 4: Volume\n",
"fig.add_trace(go.Bar(x=viz.index, y=viz[\"Volume\"], marker_color=\"steelblue\", name=\"Volume\", opacity=0.5), row=4, col=1)\n",
"fig.add_trace(go.Scatter(x=viz.index, y=viz[\"Volume\"].rolling(20).mean(), line=dict(color=\"orange\", width=1), name=\"Vol SMA20\"), row=4, col=1)\n",
"\n",
"fig.update_layout(height=900, title=f\"{TICKER} — Last Test Fold Signal Dashboard\", xaxis_rangeslider_visible=False, showlegend=False)\n",
"fig.update_xaxes(rangeslider_visible=False)\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"id": "5b25b6c4",
"metadata": {},
"source": [
"## Next Steps\n",
"\n",
"Things to iterate on from here:\n",
"\n",
"1. **Multi-asset**: swap `TICKER` to BTC-USD, QQQ, GLD, etc. or loop over a universe\n",
"2. **Probability threshold**: instead of binary 0/1, use `proba > 0.6` for higher-conviction signals\n",
"3. **Position sizing**: Kelly criterion via `PyPortfolioOpt` based on predicted probability\n",
"4. **Regime filter**: add ADX/volatility regime detection — only trade in trending regimes\n",
"5. **Transaction costs**: subtract realistic slippage (e.g., 5bps per trade) from returns\n",
"6. **Alternative splitters you have installed**:\n",
"   - `from tscv import GapRollForward` — sklearn-compatible, handles gap + purge natively\n",
"   - `from sktime.split import ExpandingWindowSplitter, SlidingWindowSplitter`\n",
"   - `from sklearn.model_selection import TimeSeriesSplit` — basic but solid\n",
"7. **LightGBM**: drop-in replacement for XGBoost, often faster on large feature sets\n",
"8. **Meta-labeling** (Lopez de Prado): train a secondary model on whether the primary model's signals are correct"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Generated
+4645
View File
File diff suppressed because it is too large Load Diff