fix: improve_execution_time_in_kaggle_loop (#279)

peteryang1 · web-flow · commit 4c8f998c76f1 · 2024-09-20T12:07:13.000+08:00
* improve_execution_time_in_kaggle_loop

* fix CI

* fix CI

* fix CI
diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py
@@ -7,7 +7,7 @@
 from rdagent.components.workflow.conf import BasePropSetting
 from rdagent.components.workflow.rd_loop import RDLoop
 from rdagent.core.developer import Developer
-from rdagent.core.exception import ModelEmptyError
+from rdagent.core.exception import FactorEmptyError, ModelEmptyError
 from rdagent.core.proposal import (
     Hypothesis2Experiment,
     HypothesisExperiment2Feedback,
@@ -71,7 +71,7 @@ def running(self, prev_out: dict[str, Any]):
             logger.log_object(exp, tag="runner result")
         return exp
 
-    skip_loop_error = (ModelEmptyError,)
+    skip_loop_error = (ModelEmptyError, FactorEmptyError)
 
 
 def main(path=None, step_n=None, competition=None):
diff --git a/rdagent/app/qlib_rd_loop/factor_from_report.py b/rdagent/app/qlib_rd_loop/factor_from_report.py
@@ -11,8 +11,6 @@
     extract_first_page_screenshot_from_pdf,
     load_and_process_pdfs_by_langchain,
 )
-from rdagent.components.workflow.rd_loop import RDLoop
-from rdagent.core.exception import FactorEmptyError
 from rdagent.core.prompts import Prompts
 from rdagent.core.proposal import Hypothesis
 from rdagent.log import rdagent_logger as logger
diff --git a/rdagent/components/coder/factor_coder/factor.py b/rdagent/components/coder/factor_coder/factor.py
@@ -87,19 +87,6 @@ def __init__(
         self.executed_factor_value_dataframe = executed_factor_value_dataframe
         self.raise_exception = raise_exception
 
-    @staticmethod
-    def link_data_to_workspace(data_path: Path, workspace_path: Path):
-        data_path = Path(data_path).absolute()  # in case of relative path that will be invalid when we change cwd.
-        workspace_path = Path(workspace_path)
-        for data_file_path in data_path.iterdir():
-            workspace_data_file_path = workspace_path / data_file_path.name
-            if workspace_data_file_path.exists():
-                workspace_data_file_path.unlink()
-            subprocess.run(
-                ["ln", "-s", data_file_path, workspace_data_file_path],
-                check=False,
-            )
-
     def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple[str, pd.DataFrame]:
         """
         execute the implementation and get the factor value by the following steps:
@@ -154,7 +141,7 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
             source_data_path.mkdir(exist_ok=True, parents=True)
             code_path = self.workspace_path / f"factor.py"
 
-            self.link_data_to_workspace(source_data_path, self.workspace_path)
+            self.link_all_files_in_folder_to_workspace(source_data_path, self.workspace_path)
 
             execution_feedback = self.FB_EXECUTION_SUCCEEDED
             execution_success = False
diff --git a/rdagent/components/coder/factor_coder/factor_execution_template.txt b/rdagent/components/coder/factor_coder/factor_execution_template.txt
@@ -4,8 +4,8 @@ import numpy as np
 import pandas as pd
 from factor import feature_engineering_cls
 
-if os.path.exists("valid.pkl"):
-    valid_df = pd.read_pickle("valid.pkl")
+if os.path.exists("X_valid.pkl"):
+    valid_df = pd.read_pickle("X_valid.pkl").head(1000)
 else:
     raise FileNotFoundError("No valid data found.")
 
diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+import os
 import shutil
 import uuid
 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from copy import deepcopy
 from pathlib import Path
-from typing import Any, Generic, Sequence, TypeVar
+from typing import Any, Generic, TypeVar
 
 from rdagent.core.conf import RD_AGENT_SETTINGS
 
@@ -111,6 +113,16 @@ def prepare(self) -> None:
         """
         self.workspace_path.mkdir(parents=True, exist_ok=True)
 
+    @staticmethod
+    def link_all_files_in_folder_to_workspace(data_path: Path, workspace_path: Path) -> None:
+        """Symlink each entry of `data_path` into `workspace_path`, replacing any same-named entry there."""
+        data_path = Path(data_path).absolute()  # in case of relative path that will be invalid when we change cwd.
+        for data_file_path in data_path.iterdir():
+            workspace_data_file_path = Path(workspace_path) / data_file_path.name
+            # exists() follows symlinks and misses dangling links; unlink(missing_ok=True) clears those too.
+            workspace_data_file_path.unlink(missing_ok=True)
+            os.symlink(data_file_path, workspace_data_file_path)
+
     def inject_code(self, **files: str) -> None:
         """
         Inject the code into the folder.
diff --git a/rdagent/core/prompts.py b/rdagent/core/prompts.py
@@ -1,12 +1,11 @@
 from pathlib import Path  # noqa: I001
-from typing import Dict
 
 import yaml
 
 from rdagent.core.utils import SingletonBaseClass
 
 
-class Prompts(SingletonBaseClass, Dict[str, str]):
+class Prompts(SingletonBaseClass, dict[str, str]):
     def __init__(self, file_path: Path) -> None:
         super().__init__()
         with file_path.open(encoding="utf8") as file:
diff --git a/rdagent/core/utils.py b/rdagent/core/utils.py
@@ -30,7 +30,7 @@ def __new__(cls, *args: Any, **kwargs: Any) -> Any:
             raise RDAgentException(exception_message)
         class_name = [(-1, f"{cls.__module__}.{cls.__name__}")]
         args_l = [(i, args[i]) for i in args]
-        kwargs_l = list(sorted(kwargs.items()))
+        kwargs_l = sorted(kwargs.items())
         all_args = class_name + args_l + kwargs_l
         kwargs_hash = hash(tuple(all_args))
         if kwargs_hash not in cls._instance_dict:
diff --git a/rdagent/scenarios/data_mining/proposal/model_proposal.py b/rdagent/scenarios/data_mining/proposal/model_proposal.py
@@ -35,14 +35,18 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]:
         super().__init__(scen)
 
     def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
-        hypothesis_feedback = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["hypothesis_and_feedback"])
-            .render(trace=trace)
+        hypothesis_and_feedback = (
+            (
+                Environment(undefined=StrictUndefined)
+                .from_string(prompt_dict["hypothesis_and_feedback"])
+                .render(trace=trace)
+            )
+            if len(trace.hist) > 0
+            else "No previous hypothesis and feedback available since it's the first round."
         )
         context_dict = {
-            "hypothesis_and_feedback": hypothesis_feedback,
-            "RAG": "",
+            "hypothesis_and_feedback": hypothesis_and_feedback,
+            "RAG": None,
             "hypothesis_output_format": prompt_dict["hypothesis_output_format"],
             "hypothesis_specification": prompt_dict["model_hypothesis_specification"],
         }
@@ -67,9 +71,13 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
         experiment_output_format = prompt_dict["model_experiment_output_format"]
 
         hypothesis_and_feedback = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["hypothesis_and_feedback"])
-            .render(trace=trace)
+            (
+                Environment(undefined=StrictUndefined)
+                .from_string(prompt_dict["hypothesis_and_feedback"])
+                .render(trace=trace)
+            )
+            if len(trace.hist) > 0
+            else "No previous hypothesis and feedback available since it's the first round."
         )
 
         experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist]
@@ -84,7 +92,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
             "hypothesis_and_feedback": hypothesis_and_feedback,
             "experiment_output_format": experiment_output_format,
             "target_list": model_list,
-            "RAG": ...,
+            "RAG": None,
         }, True
 
     def convert_response(self, response: str, trace: Trace) -> ModelExperiment:
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/cross_validation_tpl.py
@@ -2,10 +2,8 @@
 
 import numpy as np
 import pandas as pd
-import xgboost as xgb
-from sklearn.metrics import accuracy_score, matthews_corrcoef
 from sklearn.model_selection import KFold
-from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from sklearn.preprocessing import LabelEncoder
 
 from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess
 
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py
@@ -1,3 +1,5 @@
+import os
+
 import pandas as pd
 from sklearn.compose import ColumnTransformer
 from sklearn.impute import SimpleImputer
@@ -82,6 +84,15 @@ def preprocess_script():
     """
     This method applies the preprocessing steps to the training, validation, and test datasets.
     """
+    if os.path.exists("X_train.pkl"):
+        X_train = pd.read_pickle("X_train.pkl")
+        X_valid = pd.read_pickle("X_valid.pkl")
+        y_train = pd.read_pickle("y_train.pkl")
+        y_valid = pd.read_pickle("y_valid.pkl")
+        X_test = pd.read_pickle("X_test.pkl")
+        passenger_ids = pd.read_pickle("passenger_ids.pkl")
+
+        return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
     X_train, X_valid, y_train, y_valid = prepreprocess()
 
     # Fit the preprocessor on the training data
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_rf.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_rf.py
@@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
     Define and train the Random Forest model. Merge feature selection into the pipeline.
     """
     # Initialize the Random Forest model
-    model = RandomForestClassifier(n_estimators=100, random_state=32)
+    model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)
 
     # Select features (if any feature selection is needed)
     X_train_selected = select(X_train)
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_xgb.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_xgb.py
@@ -6,21 +6,23 @@
 import xgboost as xgb
 
 
-def select(X):
-    """
-    Select relevant features. To be used in fit & predict function
-    """
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    # No feature selection is applied here: X is returned unchanged.
     return X
 
 
 def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
     """Define and train the model. Merge feature_select"""
+    X_train = select(X_train)
+    X_valid = select(X_valid)
     dtrain = xgb.DMatrix(X_train, label=y_train)
     dvalid = xgb.DMatrix(X_valid, label=y_valid)
 
     # TODO: for quick running....
-    params = {}
-    num_round = 50
+    params = {
+        "nthread": -1,  # xgboost's thread-count key is spelled "nthread"
+    }
+    num_round = 200
 
     evallist = [(dtrain, "train"), (dvalid, "eval")]
     bst = xgb.train(params, dtrain, num_round, evallist)
@@ -32,6 +34,7 @@ def predict(model, X):
     """
     Keep feature select's consistency.
     """
+    X = select(X)
     dtest = xgb.DMatrix(X)
     y_pred_prob = model.predict(dtest)
     return y_pred_prob
diff --git a/rdagent/scenarios/kaggle/experiment/prompts.yaml b/rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -10,6 +10,7 @@ kg_description_template:
       "Target Description": "A description of the target variable to be predicted",
       "Competition Features": "A dict of relevant features used in the competition and their descriptions (if available)", # if you are not sure about the meaning of the feature, please add a (guess) before the description. Importantly, your feature name should be exactly the same as the feature name in the dataset!
     }
+    Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together.
 
 
   user: |-
@@ -144,7 +145,7 @@ kg_model_interface: |-
   from xgboost import DMatrix
 
 
-  def select(self, X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
+  def select(X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
 
 
   def fit(
@@ -178,7 +179,7 @@ kg_model_interface: |-
   from sklearn.metrics import accuracy_score
 
 
-  def select(self, X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
+  def select(X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
 
 
   def fit(
@@ -207,7 +208,7 @@ kg_model_interface: |-
   from lightgbm import LGBMClassifier, LGBMRegressor
 
 
-  def select(self, X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
+  def select(X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
 
 
   def fit(
@@ -247,7 +248,7 @@ kg_model_interface: |-
           return x
 
 
-  def select(self, X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
+  def select(X: pd.DataFrame) -> pd.DataFrame: ...  # Implement feature selection logic
 
 
   def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame) -> torch.nn.Module:
diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -1,4 +1,6 @@
+import io
 import json
+import pickle
 from pathlib import Path
 
 import pandas as pd
@@ -93,9 +95,12 @@ def background(self) -> str:
     def source_data(self) -> str:
         data_folder = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / self.competition
 
-        if (data_folder / "valid.pkl").exists():
-            X_valid = pd.read_pickle(data_folder / "valid.pkl")
-            return X_valid.head()
+        if (data_folder / "X_valid.pkl").exists():
+            X_valid = pd.read_pickle(data_folder / "X_valid.pkl")
+            buffer = io.StringIO()
+            X_valid.info(verbose=True, buf=buffer, show_counts=True)
+            data_info = buffer.getvalue()
+            return data_info
 
         preprocess_experiment = KGFactorExperiment([])
         (
@@ -108,8 +113,17 @@ def source_data(self) -> str:
         ) = preprocess_experiment.experiment_workspace.generate_preprocess_data()
 
         data_folder.mkdir(exist_ok=True, parents=True)
-        X_valid.to_pickle(data_folder / "valid.pkl")
-        return X_valid.head()
+        names = ("X_train", "X_valid", "y_train", "y_valid", "X_test", "passenger_ids")
+        objs = (X_train, X_valid, y_train, y_valid, X_test, passenger_ids)
+        for name, obj in zip(names, objs):
+            # `with` closes each handle deterministically instead of leaving it to the GC.
+            with (data_folder / f"{name}.pkl").open("wb") as f:
+                pickle.dump(obj, f)
+
+        buffer = io.StringIO()
+        X_valid.info(verbose=True, buf=buffer, show_counts=True)
+        data_info = buffer.getvalue()
+        return data_info
 
     @property
     def output_format(self) -> str:
diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py
@@ -5,6 +5,7 @@
 import pandas as pd
 
 from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
+from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
 from rdagent.core.experiment import FBWorkspace
 from rdagent.log import rdagent_logger as logger
 from rdagent.utils.env import KGDockerEnv
@@ -58,6 +59,11 @@ def generate_preprocess_data(
 
     def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
         logger.info(f"Running the experiment in {self.workspace_path}")
+
+        # link the data to the workspace to speed up the preprocessing
+        source_data_path = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / KAGGLE_IMPLEMENT_SETTING.competition
+        self.link_all_files_in_folder_to_workspace(source_data_path, self.workspace_path)
+
         kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
         kgde.prepare()
 
diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py
diff --git a/rdagent/scenarios/qlib/proposal/factor_proposal.py b/rdagent/scenarios/qlib/proposal/factor_proposal.py
diff --git a/rdagent/scenarios/qlib/proposal/model_proposal.py b/rdagent/scenarios/qlib/proposal/model_proposal.py

Original file line number	Diff line number	Diff line change
`@@ -11,8 +11,6 @@`
`11`	`11`	`extract_first_page_screenshot_from_pdf,`
`12`	`12`	`load_and_process_pdfs_by_langchain,`
`13`	`13`	`)`
`14`		`-from rdagent.components.workflow.rd_loop import RDLoop`
`15`		`-from rdagent.core.exception import FactorEmptyError`
`16`	`14`	`from rdagent.core.prompts import Prompts`
`17`	`15`	`from rdagent.core.proposal import Hypothesis`
`18`	`16`	`from rdagent.log import rdagent_logger as logger`