Walk Forward Validation (WFV) involves a training window that moves forward in time, training the model on historical data and then validating it on future, unseen data points. Unlike traditional cross-validation where data is randomly split, WFV respects the sequence of time, making it ideal for datasets with time-dependent features like stock prices, weather patterns, or sales figures.
Kaggle is hosting the Jane Street Real-Time Market Data Forecast. The dataset contains 79 features from multiple symbols over date/time and was partitioned into ten parts.
There are multiple ways to implement WFV on this dataset. I leveraged the existing partitioning of the dataset and created a loop over all partitions, training an LGBM model on one partition and evaluating it on the next one.
The following class is the final implementation of the WFV concept. The core logic is in the run method. You can also see the full notebook at the link below:
class WalkForwardingValidation():
    """Walk Forward Validation (WFV) over pre-partitioned parquet files.

    For each fold i, trains an LGBM model on partition i and validates it
    on partition i + 1, sliding forward until the last partition has been
    used for validation. Partitions are assumed to be in chronological order.
    """

    def __init__(self, files: List[str], x_column: List[str], y_column: str, lgb_params: dict):
        # Ordered list of parquet partition paths (chronological order assumed).
        self.files = files
        # Feature column names.
        self.x_column = x_column
        # Target column name.
        self.y_column = y_column
        # Parameters forwarded verbatim to lgb.train.
        self.lgb_params = lgb_params

    def _create_folder(self, folder_name) -> str:
        """Create an empty folder under /kaggle/working/model and return its path.

        Any folder left over from a previous run is wiped first so the saved
        model file is always fresh.
        """
        folder_path = f'/kaggle/working/model/{folder_name}'
        if os.path.exists(folder_path):
            # Stale folder from a previous run: delete it and all contents.
            shutil.rmtree(folder_path)
        os.makedirs(folder_path)
        return folder_path

    def _calculate_r2(self, y_true, y_pred, weights):
        """
        Calculate the sample-weighted zero-mean R-squared score (R2).

        Parameters:
        - y_true (pd.Series or np.array): Ground truth values.
        - y_pred (pd.Series or np.array): Predicted values.
        - weights (pd.Series or np.array): Sample weights.

        Returns:
        - float: R2 score. NOTE(review): no guard against a zero
          denominator — all-zero targets or weights yield inf/nan.
        """
        numerator = np.sum(weights * (y_true - y_pred) ** 2)
        # Zero-mean variant: the baseline is 0, not the weighted mean of y_true.
        denominator = np.sum(weights * (y_true ** 2))
        r2_score = 1 - (numerator / denominator)
        return r2_score

    def run(self) -> "Tuple[List[float], List[float]]":
        """Run walk-forward validation across all partitions.

        Returns:
            A pair (wfv_results, r2_score), each of length len(files) - 1:
            - wfv_results: best validation RMSE per fold.
            - r2_score: sample-weighted zero-mean R2 per fold.
        """
        n_files = len(self.files)
        wfv_results = []
        r2_score = []
        # Fold i: train on partition i, validate on partition i + 1.
        for i in range(n_files - 1):
            print(f"Initializing file {self.files[i]}")
            folder_path = self._create_folder(f'booster_{i}')
            df_train = pd.read_parquet(self.files[i])
            df_valid = pd.read_parquet(self.files[i + 1])
            ds_train = lgb.Dataset(
                data=df_train[self.x_column],
                label=df_train[self.y_column]
            )
            ds_valid = lgb.Dataset(
                data=df_valid[self.x_column],
                label=df_valid[self.y_column]
            )
            # The Dataset holds what it needs; free the raw training frame early
            # to keep peak memory down (Kaggle partitions are large).
            del df_train
            gc.collect()
            # Log hyper-parameters to W&B only on the first fold to avoid
            # overriding them on every fold.
            log_params = (i == 0)
            booster = lgb.train(
                params=self.lgb_params,
                train_set=ds_train,
                valid_sets=[ds_valid],
                callbacks=[
                    lgb.log_evaluation(20),
                    lgb.early_stopping(20),
                    _WandbCallback(log_params=log_params)
                ]
            )
            # Assumes 'rmse' is the configured metric — TODO confirm via lgb_params.
            wfv_results.append(booster.best_score['valid_0']['rmse'])
            booster.save_model(os.path.join(folder_path, "model.txt"),
                               num_iteration=booster.best_iteration)
            # Sample-weighted zero-mean R2 on the validation partition;
            # presumably 'weight' is the competition's sample-weight column.
            y_valid_pred = booster.predict(df_valid[self.x_column])
            r2_score.append(
                self._calculate_r2(df_valid[self.y_column], y_valid_pred, df_valid['weight'])
            )
            # Free the validation frame before loading the next partition.
            del df_valid
            gc.collect()
        return wfv_results, r2_score