Skip to content

Commit a1b63db

Browse files
xisen-wTPLin22WinstonLiyt
authored
feat: Supporting COVID-19 competition (#374)
* Uploading the initially runnable template * CI Fixes * edit params * change the template * fix a bug in rag --------- Co-authored-by: TPLin22 <[email protected]> Co-authored-by: WinstonLiye <[email protected]>
1 parent e958a34 commit a1b63db

File tree

7 files changed

+260
-9
lines changed

7 files changed

+260
-9
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import os
2+
3+
import numpy as np
4+
import pandas as pd
5+
from sklearn.model_selection import train_test_split
6+
from sklearn.preprocessing import LabelEncoder
7+
8+
9+
def prepreprocess():
    """Load the raw competition CSVs and build train/validation/test sets.

    Reads ``train.csv`` and ``test.csv`` from ``/kaggle/input``, derives
    ``Day``/``Month``/``Year`` from ``Date``, label-encodes the two location
    columns over the combined rows, and makes a random 80/20
    train/validation split (``random_state=42``).

    Returns:
        tuple: ``(X_train, X_valid, y_train, y_valid, X_test, forecast_ids)``.
        The ``y_*`` frames hold the ``ConfirmedCases`` and ``Fatalities``
        columns; ``forecast_ids`` is the ``ForecastId`` column of the test
        rows as integers.
    """
    # Load the data
    train = pd.read_csv("/kaggle/input/train.csv")
    test = pd.read_csv("/kaggle/input/test.csv")

    # Encode train and test together so both share one category -> code mapping.
    all_data = pd.concat([train, test], sort=False)

    # Calendar features derived from the date column.
    all_data["Date"] = pd.to_datetime(all_data["Date"])
    all_data["Day"] = all_data["Date"].dt.day
    all_data["Month"] = all_data["Date"].dt.month
    all_data["Year"] = all_data["Date"].dt.year

    # Encode categorical variables; a missing province becomes its own "None" class.
    le = LabelEncoder()
    all_data["Country/Region"] = le.fit_transform(all_data["Country/Region"])
    all_data["Province/State"] = le.fit_transform(all_data["Province/State"].fillna("None"))

    # Split back into train and test. This relies on ForecastId being populated
    # only for rows that came from test.csv.
    is_test_row = all_data["ForecastId"].notna()
    train = all_data[~is_test_row]
    test = all_data[is_test_row]

    # Prepare features and targets
    features = ["Country/Region", "Province/State", "Day", "Month", "Year"]
    X = train[features]
    y = train[["ConfirmedCases", "Fatalities"]]

    # Split into train and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    # The concat above upcast ForecastId to float because the train rows hold
    # NaN there; restore integers so a submission does not carry "1.0"-style ids.
    # NOTE(review): assumes ForecastId is numeric in test.csv -- confirm.
    forecast_ids = test["ForecastId"].astype("int64")

    return X_train, X_valid, y_train, y_valid, test[features], forecast_ids
43+
44+
45+
def preprocess_script():
    """Return the preprocessed datasets, reusing the pickle cache when complete.

    The six artifacts are cached as ``/kaggle/input/<name>.pkl``. The cache is
    used only when every file is present; otherwise ``prepreprocess()`` is run
    and all six pickles are (re)written.

    Returns:
        tuple: ``(X_train, X_valid, y_train, y_valid, X_test, forecast_ids)``.
    """
    artifact_names = ("X_train", "X_valid", "y_train", "y_valid", "X_test", "forecast_ids")
    cache_paths = [f"/kaggle/input/{name}.pkl" for name in artifact_names]

    # Checking a single file is not enough: an interrupted earlier run can leave
    # a partial cache behind, and reading it would fail on the first missing file.
    if all(os.path.exists(path) for path in cache_paths):
        return tuple(pd.read_pickle(path) for path in cache_paths)

    datasets = prepreprocess()

    # Save preprocessed data
    for dataset, path in zip(datasets, cache_paths):
        dataset.to_pickle(path)

    return datasets
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import pandas as pd
2+
3+
"""
4+
Here is the feature engineering code for each task, with a class that has a fit and transform method.
5+
Remember
6+
"""
7+
8+
9+
class IdentityFeature:
    """Feature step that learns nothing and hands its input back unchanged."""

    def fit(self, train_df: pd.DataFrame):
        """
        Accept the training data; there is nothing to learn for this step.
        """
        return None

    def transform(self, X: pd.DataFrame):
        """
        Return the very same frame that was passed in (no copy is made).
        """
        passthrough = X
        return passthrough


# Name under which this module exposes its feature-engineering class.
feature_engineering_cls = IdentityFeature
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import pandas as pd
2+
import xgboost as xgb
3+
4+
5+
def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Train one XGBoost regressor per target column.

    Returns a dict mapping ``"ConfirmedCases"`` and ``"Fatalities"`` to the
    booster trained on that column of ``y_train``, with ``y_valid`` supplying
    the evaluation set used for early stopping.
    """
    booster_params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "nthread": -1,
        "tree_method": "gpu_hist",
        "device": "cuda",
    }
    boosting_rounds = 1000

    def train_single(target_name):
        # Build the matrices for this target and train with a train/eval watchlist.
        train_matrix = xgb.DMatrix(X_train, label=y_train[target_name])
        valid_matrix = xgb.DMatrix(X_valid, label=y_valid[target_name])
        watchlist = [(train_matrix, "train"), (valid_matrix, "eval")]
        # Each booster gets its own copy of the parameter dict.
        return xgb.train(
            dict(booster_params),
            train_matrix,
            boosting_rounds,
            watchlist,
            early_stopping_rounds=50,
        )

    return {target_name: train_single(target_name) for target_name in ("ConfirmedCases", "Fatalities")}
25+
26+
27+
def predict(models, X):
    """Run every trained booster on ``X`` and collect the outputs.

    ``models`` maps a target name to its booster; the result is a DataFrame
    with one column per target, in the mapping's iteration order.
    """
    feature_matrix = xgb.DMatrix(X)
    per_target = {target: booster.predict(feature_matrix) for target, booster in models.items()}
    return pd.DataFrame(per_target)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import pandas as pd
2+
3+
4+
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.

    All columns are currently kept. A frame with single-level columns is
    returned untouched. A frame with multi-level columns has them flattened
    in place to one string per column, built by joining the levels with
    ``"_"`` (e.g. ``("feature_0", "Day")`` -> ``"feature_0_Day"``).
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X

    # Join the individual levels. Calling str() on the whole tuple and joining
    # that would interleave "_" between every character of its repr.
    X.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
    return X
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
import importlib.util
2+
import random
3+
from pathlib import Path
4+
5+
import numpy as np
6+
import pandas as pd
7+
from fea_share_preprocess import preprocess_script
8+
from sklearn.metrics import mean_squared_log_error
9+
10+
# Fix the seed of both the stdlib and NumPy generators so runs are repeatable.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Folder that holds this script; feature/ and model/ files are globbed from it.
DIRNAME = Path(__file__).absolute().resolve().parent
15+
16+
17+
def compute_rmsle(y_true, y_pred):
    """Return the square root of the mean squared logarithmic error."""
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)
20+
21+
22+
def import_module_from_path(module_name, module_path):
    """Load and execute a Python source file as a module.

    Args:
        module_name: Name given to the resulting module object.
        module_path: Location of the file to load.

    Returns:
        The executed module. It is not registered in ``sys.modules``.

    Raises:
        ImportError: If no import spec/loader can be built for ``module_path``
            (for example, an unsupported file suffix).
    """
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    # spec_from_file_location returns None rather than raising when it cannot
    # pick a loader; fail here with a message that names the offending path.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
27+
28+
29+
# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, forecast_ids = preprocess_script()

# 2) Auto feature engineering
train_parts, valid_parts, test_parts = [], [], []

for feature_file in DIRNAME.glob("feature/feat*.py"):
    engineer = import_module_from_path(feature_file.stem, feature_file).feature_engineering_cls()
    engineer.fit(X_train)
    train_feat, valid_feat, test_feat = (engineer.transform(frame) for frame in (X_train, X_valid, X_test))

    # Drop a feature set whose width differs between the three splits.
    width = train_feat.shape[-1]
    if width != valid_feat.shape[-1] or width != test_feat.shape[-1]:
        continue
    train_parts.append(train_feat)
    valid_parts.append(valid_feat)
    test_parts.append(test_feat)

# Put the feature sets side by side; the outer column level names the source set.
X_train, X_valid, X_test = (
    pd.concat(parts, axis=1, keys=[f"feature_{i}" for i, _ in enumerate(parts)])
    for parts in (train_parts, valid_parts, test_parts)
)

print(X_train.shape, X_valid.shape, X_test.shape)

# Turn +/-inf into NaN so the imputer below treats them as missing.
infinite_values = [np.inf, -np.inf]
X_train = X_train.replace(infinite_values, np.nan)
X_valid = X_valid.replace(infinite_values, np.nan)
X_test = X_test.replace(infinite_values, np.nan)

from sklearn.impute import SimpleImputer

# Column means are learned on the training split only and reused for the others.
imputer = SimpleImputer(strategy="mean")
imputer.fit(X_train)

X_train = pd.DataFrame(imputer.transform(X_train), columns=X_train.columns)
X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Keep only the first occurrence of each column label.
train_keep = ~X_train.columns.duplicated()
valid_keep = ~X_valid.columns.duplicated()
test_keep = ~X_test.columns.duplicated()
X_train = X_train.loc[:, train_keep]
X_valid = X_valid.loc[:, valid_keep]
X_test = X_test.loc[:, test_keep]
71+
72+
# 3) Train the model
model_l = []  # list[tuple[trained models, predict function, select module]]
for model_file in DIRNAME.glob("model/model*.py"):
    # Each model file is paired with a select file: model*.py -> select*.py.
    select_file_name = model_file.stem.replace("model", "select") + model_file.suffix
    select_file = model_file.with_name(select_file_name)
    selector = import_module_from_path(select_file.stem, select_file)

    # Work on copies so the shared frames are not modified by select().
    train_selected = selector.select(X_train.copy())
    valid_selected = selector.select(X_valid.copy())

    model_module = import_module_from_path(model_file.stem, model_file)
    trained = model_module.fit(train_selected, y_train, valid_selected, y_valid)
    model_l.append((trained, model_module.predict, selector))
82+
83+
84+
# 4) Evaluate the model on the validation set
# Floor applied to truths and predictions so the log-based metric never sees
# zero or negative values.
epsilon = 1e-8
metrics_all = []
for trained, predict_fn, selector in model_l:
    valid_pred = predict_fn(trained, selector.select(X_valid.copy()))

    rmsle_cases, rmsle_fatalities = (
        compute_rmsle(np.maximum(y_valid[target], epsilon), np.maximum(valid_pred[target], epsilon))
        for target in ("ConfirmedCases", "Fatalities")
    )

    rmsle_avg = (rmsle_cases + rmsle_fatalities) / 2
    print(f"Average RMSLE on valid set: {rmsle_avg}")
    metrics_all.append(rmsle_avg)

# 5) Save the best (lowest) validation RMSLE
min_index = np.argmin(metrics_all)
best_score = metrics_all[min_index]
score_record = pd.Series(data=[best_score], index=["RMSLE"])
score_record.to_csv("submission_score.csv")
106+
107+
# 6) Make predictions on the test set with the best-scoring model and save them
best_model, best_predict, best_select = model_l[min_index]
X_test_selected = best_select.select(X_test.copy())
y_test_pred = best_predict(best_model, X_test_selected)

# 7) Submit predictions for the test set
# Validation floors predictions before scoring, so the submission has to be
# floored too: a squared-error regressor can emit negative counts.
# Everything is attached positionally (plain arrays) so index labels on
# forecast_ids cannot misalign rows against the predictions.
# NOTE(review): assumes ForecastId is numeric with no missing values -- confirm.
submission_result = pd.DataFrame(
    {
        "ForecastId": np.asarray(forecast_ids).astype("int64"),
        "ConfirmedCases": np.clip(np.asarray(y_test_pred["ConfirmedCases"]), 0, None),
        "Fatalities": np.clip(np.asarray(y_test_pred["Fatalities"]), 0, None),
    }
)
submission_result.to_csv("submission.csv", index=False)

rdagent/scenarios/kaggle/knowledge_management/vector_base.py

+9-8
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import pandas as pd
66
from _pytest.cacheprovider import json
7+
from tqdm import tqdm
78

89
from rdagent.components.knowledge_management.vector_base import Document, PDVectorBase
910
from rdagent.log import rdagent_logger as logger
@@ -189,20 +190,20 @@ def add_experience_to_vector_base(self, experiment_feedback=None):
189190
extracted_knowledge = extract_knowledge_from_feedback(experiment_feedback)
190191

191192
document = KGKnowledgeDocument(
192-
content=experiment_feedback.get("hypothesis_text", ""),
193-
label="Experiment Feedback",
194-
competition_name="Experiment Result",
195-
task_category=experiment_feedback.get("tasks_factors", "General Task"),
196-
field="Research Feedback",
197-
ranking=None,
198-
score=experiment_feedback.get("current_result", None),
193+
content=extracted_knowledge.get("content", ""),
194+
title=extracted_knowledge.get("title", "Experiment Feedback"),
195+
competition_name=extracted_knowledge.get("competition_name", "Unknown Competition"),
196+
task_category=extracted_knowledge.get("task_category", "General Task"),
197+
field=extracted_knowledge.get("field", None),
198+
ranking=extracted_knowledge.get("ranking", None),
199+
score=extracted_knowledge.get("score", None),
199200
)
200201
document.create_embedding()
201202
self.add(document)
202203
return
203204

204205
# Process Kaggle experience data
205-
for experience in self.kaggle_experience_data:
206+
for experience in tqdm(self.kaggle_experience_data):
206207
content = experience.get("content", "")
207208
label = experience.get("title", "Kaggle Experience")
208209
competition_name = experience.get("competition_name", "Unknown Competition")

rdagent/scenarios/kaggle/prompts.yaml

-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ KG_hypothesis_gen_RAG: |-
2828
hypothesis_and_feedback: |-
2929
{% for hypothesis, experiment, feedback in trace.hist %}
3030
Hypothesis {{ loop.index }}: {{ hypothesis }}
31-
Corresponding Code (that leads to the difference in performance): {{experiment.sub_workspace_list[0].code_dict.get("model.py")}}
3231
Observation on the result with the hypothesis: {{ feedback.observations }}
3332
Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }}
3433
New Feedback for Context (For your reference): {{ feedback.new_hypothesis }}

0 commit comments

Comments
 (0)