Source code for volkit.datasets.loader_spxw

# volkit/datasets/loader_spxw.py
from typing import Optional
import os
import pandas as pd


[docs] def spxw( min_volume: int = 0, D: Optional[int] = None, *, full: bool = False, ) -> pd.DataFrame: """ Load a sample SPXW options slice and apply simple filters. Parameters ---------- min_volume : int, default 0 Minimum per-option trade volume required **for both** the call and the put at a given strike. Rows are kept only if ``C_vol >= min_volume`` **and** ``P_vol >= min_volume``. Use ``0`` to disable the volume filter. D : int or None, default None If provided, keep only rows whose computed calendar days-to-expiry equals this value. For example, ``D=7`` keeps only the 7-calendar-day slice. Use ``None`` to keep all expiries. full : bool, default False Select between a compact, most-relevant schema (False) and a full schema (True). When ``full=False`` (default), return: - ``["K", "D", "T"]`` — strike and days/time to expiry - ``["F_bid", "F_ask"]`` — underlying future quotes - ``["C_bid", "C_ask", "P_bid", "P_ask"]`` — call/put quotes - ``["C_vol", "P_vol"]`` — call/put traded volume (day) When ``full=True``, additionally include: - ``["quote_date", "expiration_date"]`` — actual datetimes - ``["C_bid_size", "C_ask_size", "P_bid_size", "P_ask_size"]`` — order book sizes - ``["C_oi", "P_oi"]`` — open interest Returns ------- pandas.DataFrame One row per strike with at least: ``K``, ``D``, ``T`` and the selected extra columns. Notes ----- - ``T`` uses **252** trading days/year to align with options practice. If you prefer ACT/365 or ACT/ACT, convert after loading. - Date columns (``quote_date``, ``expiration_date``) are parsed as datetimes. Examples -------- Load defaults (compact schema): >>> df = spxw() Keep only options with at least 100 contracts traded on both sides: >>> df = spxw(min_volume=100) Work with a single tenor (e.g., 7 days to expiry): >>> df_7 = spxw(D=7) Get the full column set: >>> df_full = spxw(full=True) """ # columns to pivot and how they should be renamed rename_map = { "bid_size_1545": "bid_size", "bid_1545": "bid", "ask_size_1545": "ask_size", "ask_1545": "ask", "trade_volume": "vol", "open_interest": "oi", } idx = ["quote_date", "expiration", "strike"] # Locate CSV here = os.path.dirname(__file__) data_path = os.path.join(here, "data", "spxw20190626.csv") df = pd.read_csv(data_path, parse_dates=["quote_date", "expiration"]) # Wide pivot: values × option_type -> single level columns like C_bid, P_ask_size pt = df.pivot_table( index=idx, columns="option_type", values=list(rename_map.keys()), aggfunc="first", # if duplicates exist, take first ).sort_index(axis=1) # flatten MultiIndex columns: ('bid_1545','C') -> 'C_bid' pt.columns = [ f"{opt}_{rename_map[val]}" for (val, opt) in pt.columns.to_flat_index() ] pt = pt.reset_index() # Keep the non-pivot columns (take the first if duplicated across C/P rows) nonpivot = [ "quote_date", "expiration", "strike", "underlying_bid_1545", "underlying_ask_1545", ] base = df[nonpivot].drop_duplicates(subset=idx).groupby(idx, as_index=False).first() # Merge out = base.merge(pt, on=idx, how="left") # Rename to final schema out = out.rename( columns={ "strike": "K", "underlying_bid_1545": "F_bid", "underlying_ask_1545": "F_ask", "expiration": "expiration_date", } ) # Days and time to expiry out["D"] = (out["expiration_date"] - out["quote_date"]).dt.days out["T"] = out["D"] / 252.0 # Row filters if min_volume > 0: out = out[(out["C_vol"] >= min_volume) & (out["P_vol"] >= min_volume)] if D is not None: out = out[out["D"] == D] # Column selection --------------------------------------------------------- cols = [ "K", "D", "T", "F_bid", "F_ask", "C_bid", "C_ask", "P_bid", "P_ask", "C_vol", "P_vol", ] if full: cols += [ "quote_date", "expiration_date", "C_bid_size", "C_ask_size", "P_bid_size", "P_ask_size", "C_oi", "P_oi", ] # Be forgiving if a column is missing in a custom CSV cols = [c for c in cols if c in out.columns] # Stable ordering for reproducibility (these columns always exist in `out`) out = out.sort_values(["quote_date", "expiration_date", "K"]).reset_index(drop=True) return out[cols]