import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from fea_share_preprocess import preprocess_script
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_log_error

# Set random seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
DIRNAME = Path(__file__).resolve().parent


def compute_rmsle(y_true, y_pred):
    """Compute the Root Mean Squared Logarithmic Error (RMSLE) for regression."""
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


def import_module_from_path(module_name, module_path):
    """Dynamically load a Python module from a file path."""
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# 1) Preprocess the data
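# preprocess_script (from fea_share_preprocess) is expected to return the
# train/valid/test splits plus the ForecastId values used for the submission.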
X_train, X_valid, y_train, y_valid, X_test, forecast_ids = preprocess_script()

# 2) Automated feature engineering
X_train_l, X_valid_l, X_test_l = [], [], []

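# Each feature/feat*.py module is assumed to expose a feature_engineering_cls
# with sklearn-style fit/transform methods, e.g.:
#
#     class feature_engineering_cls:
#         def fit(self, X): ...
#         def transform(self, X): ...  # returns a DataFrame of features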
for f in DIRNAME.glob("feature/feat*.py"):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    # Keep only feature sets whose column counts match across all three splits
    if X_train_f.shape[-1] == X_valid_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)

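# Concatenating with keys gives each feature module its own top-level column
# group (a pandas MultiIndex), so identically named columns from different
# modules do not collide.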
X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])

print(X_train.shape, X_valid.shape, X_test.shape)

# Handle inf and -inf values
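# (engineered features such as ratios can produce infinities; converting them
# to NaN lets the imputer below fill them in)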
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

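# Mean-impute remaining NaNs; the imputer is fit on the training split only
# and reused for the validation and test splits to avoid leakage.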
imputer = SimpleImputer(strategy="mean")

# Preserve the row indices so features stay aligned with the targets
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns, index=X_valid.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Remove duplicate columns
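# (keeps the first occurrence of each column; duplicates can appear when
# feature modules emit overlapping column names)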
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
X_test = X_test.loc[:, ~X_test.columns.duplicated()]

# 3) Train the model
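# Each model/model*.py is assumed to pair with a sibling select*.py
# (e.g. a hypothetical model_nn.py with select_nn.py) and to expose:
#
#     def fit(X_train, y_train, X_valid, y_valid): ...  # returns a fitted model
#     def predict(model, X): ...  # returns ConfirmedCases/Fatalities predictions
#
# while select*.py exposes select(X) for feature selection.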
model_l = []  # list[tuple[model, predict_func, select_module]]
for f in DIRNAME.glob("model/model*.py"):
    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
    select_m = import_module_from_path(select_python_path.stem, select_python_path)
    X_train_selected = select_m.select(X_train.copy())
    X_valid_selected = select_m.select(X_valid.copy())

    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))


# 4) Evaluate each model on the validation set
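# The score is the RMSLE of each target (ConfirmedCases, Fatalities) averaged
# over the two, mirroring the Kaggle COVID-19 forecasting metric.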
metrics_all = []
for model, predict_func, select_m in model_l:
    X_valid_selected = select_m.select(X_valid.copy())
    y_valid_pred = predict_func(model, X_valid_selected)

    # Clip targets and predictions to a small positive floor, since RMSLE
    # is undefined for negative values
    epsilon = 1e-8
    y_valid_cases = np.maximum(y_valid["ConfirmedCases"], epsilon)
    y_pred_cases = np.maximum(y_valid_pred["ConfirmedCases"], epsilon)

    rmsle_cases = compute_rmsle(y_valid_cases, y_pred_cases)
    rmsle_fatalities = compute_rmsle(
        np.maximum(y_valid["Fatalities"], epsilon), np.maximum(y_valid_pred["Fatalities"], epsilon)
    )
    rmsle_avg = (rmsle_cases + rmsle_fatalities) / 2
    print(f"Average RMSLE on valid set: {rmsle_avg}")
    metrics_all.append(rmsle_avg)

# 5) Save the best validation score (lowest RMSLE)
min_index = np.argmin(metrics_all)
pd.Series(data=[metrics_all[min_index]], index=["RMSLE"]).to_csv("submission_score.csv")

# 6) Make predictions on the test set with the best model
X_test_selected = model_l[min_index][2].select(X_test.copy())
y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected)

# 7) Submit predictions for the test set
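# One row per ForecastId with the two predicted targets, matching the
# competition's expected submission format.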
submission_result = pd.DataFrame(
    {
        "ForecastId": forecast_ids,
        "ConfirmedCases": y_test_pred["ConfirmedCases"],
        "Fatalities": y_test_pred["Fatalities"],
    }
)
submission_result.to_csv("submission.csv", index=False)