|
11 | 11 |
|
12 | 12 | def prepreprocess():
|
13 | 13 | # Load the training data
|
14 |
| - train_df = pd.read_csv("/kaggle/input/optiver-realized-volatility-prediction/train.csv") |
| 14 | + train_df = pd.read_csv("/kaggle/input/train.csv").head(1000) |
15 | 15 |
|
16 | 16 | # Load book and trade data
|
17 |
| - book_train = pd.read_parquet("/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet") |
18 |
| - trade_train = pd.read_parquet("/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet") |
| 17 | + book_train = pd.read_parquet("/kaggle/input/book_train.parquet").head(1000) |
| 18 | + trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet").head(1000) |
19 | 19 |
|
20 | 20 | # Merge book and trade data with train_df
|
21 | 21 | merged_df = pd.merge(train_df, book_train, on=["stock_id", "time_id"], how="left")
|
22 | 22 | merged_df = pd.merge(merged_df, trade_train, on=["stock_id", "time_id"], how="left")
|
23 | 23 |
|
| 24 | + print(merged_df.head()) |
| 25 | + |
24 | 26 | # Split the data
|
25 | 27 | X = merged_df.drop(["target"], axis=1)
|
26 | 28 | y = merged_df["target"]
|
@@ -83,8 +85,19 @@ def preprocess_script():
|
83 | 85 | X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols)
|
84 | 86 |
|
85 | 87 | submission_df = pd.read_csv("/kaggle/input/test.csv")
|
86 |
| - ids = submission_df["id"] |
87 |
| - submission_df = submission_df.drop(["id"], axis=1) |
| 88 | + |
| 89 | + ids = submission_df["row_id"] |
| 90 | + submission_df = submission_df.drop(["row_id"], axis=1) |
| 91 | + |
| 92 | + # Add missing columns to submission_df |
| 93 | + for col in X_train.columns: |
| 94 | + if col not in submission_df.columns: |
| 95 | + submission_df[col] = 0 # Fill with 0 or another appropriate value |
| 96 | + |
88 | 97 | X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols)
|
89 | 98 |
|
| 99 | + # Handle missing values |
| 100 | + for df in [X_train, X_valid, X_test]: |
| 101 | + df.fillna(df.mean(), inplace=True) |
| 102 | + |
90 | 103 | return X_train, X_valid, y_train, y_valid, X_test, ids
|
0 commit comments