feat: Supporting COVID-19 competition (#374)

xisen-w · TPLin22 · WinstonLiyt · web-flow · commit a1b63db79600 · 2024-09-28T05:33:44.000+08:00
* Uploading the initially runnable template

* CI Fixes

* edit params

* change the template

* fix a bug in rag

---------

Co-authored-by: TPLin22 <tplin2@163.com>
Co-authored-by: WinstonLiye <1957922024@qq.com>
diff --git a/rdagent/scenarios/kaggle/experiment/covid19-global-forecasting-week-1_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/covid19-global-forecasting-week-1_template/fea_share_preprocess.py
@@ -0,0 +1,64 @@
+import os
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+
+
+def prepreprocess():
+    """Load the raw CSVs and build train/validation/test feature tables.
+
+    Returns (X_train, X_valid, y_train, y_valid, X_test, forecast_ids); the
+    targets are the ``ConfirmedCases`` and ``Fatalities`` columns.
+    """
+    train = pd.read_csv("/kaggle/input/train.csv")
+    test = pd.read_csv("/kaggle/input/test.csv")
+
+    # Stack both splits so dates and categories are processed consistently.
+    all_data = pd.concat([train, test], sort=False)
+    all_data["Date"] = pd.to_datetime(all_data["Date"])
+
+    # Calendar features derived from the date.
+    all_data["Day"] = all_data["Date"].dt.day
+    all_data["Month"] = all_data["Date"].dt.month
+    all_data["Year"] = all_data["Date"].dt.year
+
+    # Integer-encode the categorical columns (the encoder is refit per column).
+    le = LabelEncoder()
+    all_data["Country/Region"] = le.fit_transform(all_data["Country/Region"])
+    all_data["Province/State"] = le.fit_transform(all_data["Province/State"].fillna("None"))
+
+    # Rows without a ForecastId are treated as training rows, the rest as test rows.
+    train = all_data[all_data["ForecastId"].isna()]
+    test = all_data[all_data["ForecastId"].notna()]
+
+    features = ["Country/Region", "Province/State", "Day", "Month", "Year"]
+    X = train[features]
+    y = train[["ConfirmedCases", "Fatalities"]]
+    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
+
+    # The NaNs added for the training rows turn ForecastId into floats; hand back integer ids.
+    return X_train, X_valid, y_train, y_valid, test[features], test["ForecastId"].astype(int)
+
+
+def preprocess_script():
+    """Return the preprocessed datasets, reading the pickle cache when it exists.
+
+    Only ``X_train.pkl`` is probed; on a miss ``prepreprocess`` runs and its outputs are pickled.
+    """
+    cache_files = (
+        "/kaggle/input/X_train.pkl",
+        "/kaggle/input/X_valid.pkl",
+        "/kaggle/input/y_train.pkl",
+        "/kaggle/input/y_valid.pkl",
+        "/kaggle/input/X_test.pkl",
+        "/kaggle/input/forecast_ids.pkl",
+    )
+    if os.path.exists(cache_files[0]):
+        return tuple(pd.read_pickle(path) for path in cache_files)
+
+    datasets = prepreprocess()
+    for dataset, path in zip(datasets, cache_files):
+        dataset.to_pickle(path)
+    return datasets
diff --git a/rdagent/scenarios/kaggle/experiment/covid19-global-forecasting-week-1_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/covid19-global-forecasting-week-1_template/feature/feature.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+"""
+Here is the feature engineering code for each task, with a class that has a fit and transform method.
+Remember to expose the class through the module-level `feature_engineering_cls` variable.
+"""
+
+
+class IdentityFeature:
+    """Pass-through feature step: nothing is fitted and the data is returned as-is."""
+
+    def fit(self, train_df: pd.DataFrame):
+        """Accept the training data without learning anything from it."""
+        return None
+
+    def transform(self, X: pd.DataFrame):
+        """Return ``X`` itself; no copy is made and no column is changed."""
+        return X
+
+
+# Module-level handle on the feature class defined above;
+# it refers to the class itself, not to an instance.
+feature_engineering_cls = IdentityFeature
diff --git a/rdagent/scenarios/kaggle/experiment/covid19-global-forecasting-week-1_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/covid19-global-forecasting-week-1_template/model/model_xgboost.py
@@ -0,0 +1,33 @@
+import pandas as pd
+import xgboost as xgb
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
+    """Train one XGBoost booster per target column, with early stopping on the validation data.
+
+    Returns a dict that maps "ConfirmedCases" and "Fatalities" to their trained boosters.
+    """
+    # Same settings for both targets; "hist" + device="cuda" replaces the deprecated "gpu_hist" (XGBoost >= 2.0).
+    params = {
+        "objective": "reg:squarederror",
+        "eval_metric": "rmse",
+        "nthread": -1,
+        "tree_method": "hist",
+        "device": "cuda",
+    }
+    models = {}
+    for target in ["ConfirmedCases", "Fatalities"]:
+        dtrain = xgb.DMatrix(X_train, label=y_train[target])
+        dvalid = xgb.DMatrix(X_valid, label=y_valid[target])
+        evallist = [(dtrain, "train"), (dvalid, "eval")]  # the last entry drives early stopping
+        models[target] = xgb.train(params, dtrain, 1000, evallist, early_stopping_rounds=50)
+    return models
+
+
+def predict(models, X):
+    """Return a DataFrame with one prediction column per target in ``models``."""
+    # A single DMatrix is shared by all boosters.
+    matrix = xgb.DMatrix(X)
+    return pd.DataFrame(
+        {name: booster.predict(matrix) for name, booster in models.items()}
+    )
diff --git a/rdagent/scenarios/kaggle/experiment/covid19-global-forecasting-week-1_template/model/select_xgboost.py b/rdagent/scenarios/kaggle/experiment/covid19-global-forecasting-week-1_template/model/select_xgboost.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select relevant features (currently all of them). To be used in fit & predict function.
+
+    Multi-level column labels are flattened in place, e.g. ("feature_0", "Day") becomes "feature_0_Day".
+    """
+    if X.columns.nlevels > 1:
+        X.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
+    return X
diff --git a/rdagent/scenarios/kaggle/experiment/covid19-global-forecasting-week-1_template/train.py b/rdagent/scenarios/kaggle/experiment/covid19-global-forecasting-week-1_template/train.py
@@ -0,0 +1,119 @@
+import importlib.util
+import random
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from fea_share_preprocess import preprocess_script
+from sklearn.metrics import mean_squared_log_error
+
+# Fix the seeds of Python's and NumPy's global RNGs so runs are repeatable
+SEED = 42
+np.random.seed(SEED)
+random.seed(SEED)
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+
+def compute_rmsle(y_true, y_pred):
+    """RMSLE: the square root of the mean squared logarithmic error of the predictions."""
+    return np.sqrt(mean_squared_log_error(y_pred=y_pred, y_true=y_true))
+
+
+def import_module_from_path(module_name, module_path):
+    """Run the Python file at ``module_path`` and return it as a module named ``module_name``."""
+    file_spec = importlib.util.spec_from_file_location(module_name, module_path)
+    file_spec.loader.exec_module(loaded := importlib.util.module_from_spec(file_spec))
+    return loaded
+
+
+# 1) Preprocess the data
+X_train, X_valid, y_train, y_valid, X_test, forecast_ids = preprocess_script()
+
+# 2) Auto feature engineering: run every feature/feat*.py and collect the outputs side by side
+X_train_l, X_valid_l = [], []
+X_test_l = []
+
+for f in DIRNAME.glob("feature/feat*.py"):
+    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+    cls.fit(X_train)
+    X_train_f = cls.transform(X_train)
+    X_valid_f = cls.transform(X_valid)
+    X_test_f = cls.transform(X_test)
+    # Keep a feature set only when all three splits ended up with the same number of columns
+    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
+        X_train_l.append(X_train_f)
+        X_valid_l.append(X_valid_f)
+        X_test_l.append(X_test_f)
+
+X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
+X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
+X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])
+
+print(X_train.shape, X_valid.shape, X_test.shape)
+
+# Turn +/-inf into NaN so the imputer below can fill them
+X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
+X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
+X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
+
+from sklearn.impute import SimpleImputer
+
+imputer = SimpleImputer(strategy="mean")
+
+X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
+X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
+X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)
+
+# Remove duplicate columns
+X_train = X_train.loc[:, ~X_train.columns.duplicated()]
+X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
+X_test = X_test.loc[:, ~X_test.columns.duplicated()]
+
+# 3) Train the model: each model/model*.py is paired with the matching model/select*.py
+model_l = []  # list[tuple[fitted models, predict function, select module]]
+for f in DIRNAME.glob("model/model*.py"):
+    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
+    select_m = import_module_from_path(select_python_path.stem, select_python_path)
+    X_train_selected = select_m.select(X_train.copy())
+    X_valid_selected = select_m.select(X_valid.copy())
+
+    m = import_module_from_path(f.stem, f)
+    model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))
+
+
+# 4) Evaluate the model on the validation set
+metrics_all = []
+for model, predict_func, select_m in model_l:
+    X_valid_selected = select_m.select(X_valid.copy())
+    y_valid_pred = predict_func(model, X_valid_selected)
+
+    # Clip to a tiny positive value: the log-based metric cannot take negative inputs
+    epsilon = 1e-8
+    y_valid_cases = np.maximum(y_valid["ConfirmedCases"], epsilon)
+    y_pred_cases = np.maximum(y_valid_pred["ConfirmedCases"], epsilon)
+
+    rmsle_cases = compute_rmsle(y_valid_cases, y_pred_cases)
+    rmsle_fatalities = compute_rmsle(
+        np.maximum(y_valid["Fatalities"], epsilon), np.maximum(y_valid_pred["Fatalities"], epsilon)
+    )
+    rmsle_avg = (rmsle_cases + rmsle_fatalities) / 2
+    print(f"Average RMSLE on valid set: {rmsle_avg}")
+    metrics_all.append(rmsle_avg)
+
+# 5) Save the best (lowest) validation RMSLE
+min_index = np.argmin(metrics_all)
+pd.Series(data=[metrics_all[min_index]], index=["RMSLE"]).to_csv("submission_score.csv")
+
+# 6) Make predictions on the test set with the best model
+X_test_selected = model_l[min_index][2].select(X_test.copy())
+y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected)
+
+# 7) Write the submission: ids cast to int (they can arrive as floats), counts clipped at zero
+submission_result = pd.DataFrame(
+    {
+        "ForecastId": forecast_ids.to_numpy().astype(int),
+        "ConfirmedCases": y_test_pred["ConfirmedCases"].clip(lower=0),
+        "Fatalities": y_test_pred["Fatalities"].clip(lower=0),
+    }
+)
+submission_result.to_csv("submission.csv", index=False)
diff --git a/rdagent/scenarios/kaggle/knowledge_management/vector_base.py b/rdagent/scenarios/kaggle/knowledge_management/vector_base.py
@@ -4,6 +4,7 @@
 
 import pandas as pd
 from _pytest.cacheprovider import json
+from tqdm import tqdm
 
 from rdagent.components.knowledge_management.vector_base import Document, PDVectorBase
 from rdagent.log import rdagent_logger as logger
@@ -189,20 +190,20 @@ def add_experience_to_vector_base(self, experiment_feedback=None):
             extracted_knowledge = extract_knowledge_from_feedback(experiment_feedback)
 
             document = KGKnowledgeDocument(
-                content=experiment_feedback.get("hypothesis_text", ""),
-                label="Experiment Feedback",
-                competition_name="Experiment Result",
-                task_category=experiment_feedback.get("tasks_factors", "General Task"),
-                field="Research Feedback",
-                ranking=None,
-                score=experiment_feedback.get("current_result", None),
+                content=extracted_knowledge.get("content", ""),
+                title=extracted_knowledge.get("title", "Experiment Feedback"),
+                competition_name=extracted_knowledge.get("competition_name", "Unknown Competition"),
+                task_category=extracted_knowledge.get("task_category", "General Task"),
+                field=extracted_knowledge.get("field", None),
+                ranking=extracted_knowledge.get("ranking", None),
+                score=extracted_knowledge.get("score", None),
             )
             document.create_embedding()
             self.add(document)
             return
 
         # Process Kaggle experience data
-        for experience in self.kaggle_experience_data:
+        for experience in tqdm(self.kaggle_experience_data):
             content = experience.get("content", "")
             label = experience.get("title", "Kaggle Experience")
             competition_name = experience.get("competition_name", "Unknown Competition")
diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml
@@ -28,7 +28,6 @@ KG_hypothesis_gen_RAG: |-
 hypothesis_and_feedback: |-
   {% for hypothesis, experiment, feedback in trace.hist %}
   Hypothesis {{ loop.index }}: {{ hypothesis }}
-  Corresponding Code (that leads to the difference in performance): {{experiment.sub_workspace_list[0].code_dict.get("model.py")}}
   Observation on the result with the hypothesis: {{ feedback.observations }}
   Feedback on the original hypothesis:  {{ feedback.hypothesis_evaluation }}
   New Feedback for Context (For your reference):  {{ feedback.new_hypothesis }}