microsoft
diff --git a/‎rdagent/app/kaggle/conf.py
+3-2 b/‎rdagent/app/kaggle/conf.py
+3-2
diff --git a/‎rdagent/app/kaggle/loop.py
+7 b/‎rdagent/app/kaggle/loop.py
+7
diff --git a/‎rdagent/components/coder/model_coder/CoSTEER/evolving_strategy.py
+3-8 b/‎rdagent/components/coder/model_coder/CoSTEER/evolving_strategy.py
+3-8
diff --git a/‎rdagent/components/coder/model_coder/model_execute_template_v2.txt
+1-1 b/‎rdagent/components/coder/model_coder/model_execute_template_v2.txt
+1-1
diff --git a/‎rdagent/components/proposal/prompts.yaml
+1-1 b/‎rdagent/components/proposal/prompts.yaml
+1-1
diff --git a/‎rdagent/scenarios/kaggle/developer/coder.py
+67 b/‎rdagent/scenarios/kaggle/developer/coder.py
+67
diff --git a/‎rdagent/scenarios/kaggle/developer/feedback.py
+5-4 b/‎rdagent/scenarios/kaggle/developer/feedback.py
+5-4
diff --git a/‎rdagent/scenarios/kaggle/developer/runner.py
+9-91 b/‎rdagent/scenarios/kaggle/developer/runner.py
+9-91
diff --git a/‎rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
-1 b/‎rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py
-1
diff --git a/‎rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py
+2-16 b/‎rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py
+2-16
@@ -32,6 +32,9 @@ class Config:
     feature_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER"
     """Feature Coder class"""
 
+    model_feature_selection_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelFeatureSelectionCoder"
+    """Model Feature Selection Coder class"""
+
     model_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelCoSTEER"
     """Model Coder class"""
 
@@ -57,8 +60,6 @@ class Config:
 
     if_action_choosing_based_on_UCB: bool = False
 
-    if_using_feature_selection: bool = False
-
     if_using_graph_rag: bool = False
 
     if_using_vector_rag: bool = False
 
@@ -23,6 +23,7 @@
 from rdagent.scenarios.kaggle.proposal.proposal import (
     KG_ACTION_FEATURE_ENGINEERING,
     KG_ACTION_FEATURE_PROCESSING,
+    KG_ACTION_MODEL_FEATURE_SELECTION,
     KGTrace,
 )
 
@@ -49,6 +50,10 @@ def __init__(self, PROP_SETTING: BasePropSetting):
 
             self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen)
             logger.log_object(self.feature_coder, tag="feature coder")
+            self.model_feature_selection_coder: Developer = import_class(PROP_SETTING.model_feature_selection_coder)(
+                scen
+            )
+            logger.log_object(self.model_feature_selection_coder, tag="model feature selection coder")
             self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen)
             logger.log_object(self.model_coder, tag="model coder")
 
@@ -67,6 +72,8 @@ def coding(self, prev_out: dict[str, Any]):
         with logger.tag("d"):  # develop
             if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]:
                 exp = self.feature_coder.develop(prev_out["exp_gen"])
+            elif prev_out["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION:
+                exp = self.model_feature_selection_coder.develop(prev_out["exp_gen"])
             else:
                 exp = self.model_coder.develop(prev_out["exp_gen"])
             logger.log_object(exp.sub_workspace_list, tag="coder result")
 
@@ -21,6 +21,7 @@
 from rdagent.core.prompts import Prompts
 from rdagent.core.utils import multiprocessing_wrapper
 from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.kaggle.experiment.kaggle_experiment import KG_MODEL_MAPPING
 
 coder_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
 
@@ -41,14 +42,8 @@ def implement_one_model(
             current_code = ""
             sota_exp_code_dict = current_exp.based_experiments[-1].experiment_workspace.code_dict
             if target_task.version == 2:
-                model_file_mapping = {
-                    "XGBoost": "model/model_xgboost.py",
-                    "RandomForest": "model/model_randomforest.py",
-                    "LightGBM": "model/model_lightgbm.py",
-                    "NN": "model/model_nn.py",
-                }
-                if model_type in model_file_mapping:
-                    current_code = sota_exp_code_dict.get(model_file_mapping[model_type], None)
+                if model_type in KG_MODEL_MAPPING:
+                    current_code = sota_exp_code_dict.get(KG_MODEL_MAPPING[model_type], None)
                 elif "model.py" in sota_exp_code_dict:
                     current_code = sota_exp_code_dict["model.py"]
                 else:
 
@@ -4,7 +4,7 @@ import pickle
 import numpy as np
 import pandas as pd
 import torch
-from model import fit, predict, select
+from model import fit, predict
 
 train_X = pd.DataFrame(np.random.randn(8, 30), columns=[f"{i}" for i in range(30)])
 train_y = pd.Series(np.random.randint(0, 2, 8))
 
@@ -12,7 +12,7 @@ hypothesis_gen:
     {{ hypothesis_output_format }}
 
   user_prompt: |-
-    {% if hypothesis_and_feedback|length == 0 %}    It is the first round of hypothesis generation. The user has no hypothesis on this scenario yet.
+    {% if hypothesis_and_feedback|length == 0 %}It is the first round of hypothesis generation. The user has no hypothesis on this scenario yet.
     {% else %}It is not the first round, the user has made several hypothesis on this scenario and did several evaluation on them.
     The former hypothesis and the corresponding feedbacks are as follows (focus on the last one & the new hypothesis that it provides and reasoning to see if you agree):
     {{ hypothesis_and_feedback }}
 
@@ -1,5 +1,72 @@
+import json
+from pathlib import Path
+
+from jinja2 import Environment, StrictUndefined
+
 from rdagent.components.coder.factor_coder.CoSTEER import FactorCoSTEER
 from rdagent.components.coder.model_coder.CoSTEER import ModelCoSTEER
+from rdagent.core.developer import Developer
+from rdagent.core.prompts import Prompts
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.kaggle.experiment.kaggle_experiment import (
+    KG_SELECT_MAPPING,
+    KGModelExperiment,
+)
 
 KGModelCoSTEER = ModelCoSTEER
 KGFactorCoSTEER = FactorCoSTEER
+
+prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
+
+# Jinja2 template for the generated ``select`` module that KGModelFeatureSelectionCoder
+# renders and injects into the experiment workspace.
+# ``feature_index_list`` is either ``None`` (keep every feature group) or a list of 0-based
+# positions into the first level of the column MultiIndex.
+# NOTE(review): ``X.columns.levels[0]`` is indexed positionally; this presumably matches the
+# order of ``data_description`` -- confirm, since level order need not follow column order.
+DEFAULT_SELECTION_CODE = """
+import pandas as pd
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    \"""
+    Select relevant features. To be used in fit & predict function.
+    \"""
+    if X.columns.nlevels == 1:
+        return X
+    {% if feature_index_list is not none %}
+    X = X.loc[:, X.columns.levels[0][{{feature_index_list}}].tolist()]
+    {% endif %}
+    # Flatten the MultiIndex: join the levels of every column tuple (not the characters of its repr).
+    X.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
+    return X
+"""
+
+
+class KGModelFeatureSelectionCoder(Developer[KGModelExperiment]):
+    """Developer that writes the feature-selection (``select``) code for one model type.
+
+    With a single feature group there is nothing to choose, so the pass-through template is
+    rendered directly; otherwise the LLM is asked which feature groups the model should use.
+    """
+
+    def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
+        """Render the selection code and inject it into ``exp.experiment_workspace``.
+
+        The model type is read from ``exp.sub_tasks[0]`` and must be a key of
+        ``KG_SELECT_MAPPING``, which gives the workspace file the code is written to.
+        Returns the same ``exp`` instance after the injection.
+        """
+        target_model_type = exp.sub_tasks[0].model_type
+        assert (
+            target_model_type in KG_SELECT_MAPPING
+        ), f"No feature selection file is registered for model type {target_model_type!r}"
+
+        env = Environment(undefined=StrictUndefined)
+        data_description = exp.experiment_workspace.data_description
+        group_count = len(data_description)
+
+        if group_count == 1:
+            feature_index_list = None
+        else:
+            system_prompt = env.from_string(prompt_dict["model_feature_selection"]["system"]).render(
+                scenario=self.scen.get_scenario_all_desc(), model_type=target_model_type
+            )
+            user_prompt = env.from_string(prompt_dict["model_feature_selection"]["user"]).render(
+                feature_groups=[desc[0] for desc in data_description]
+            )
+
+            response = json.loads(
+                APIBackend().build_messages_and_create_chat_completion(
+                    user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
+                )
+            )
+            chosen_index = response.get("Selected Group Index")
+            if not isinstance(chosen_index, list):
+                # Missing or malformed answer: behave as if every group was selected.
+                chosen_index = list(range(1, group_count + 1))
+
+            # The answer is 1-based and comes from an LLM. Keep only genuine in-range integers so
+            # that 0 (which would turn into -1, i.e. the last group), strings or out-of-range
+            # values never reach the generated code; dict.fromkeys drops duplicates in order.
+            feature_index_list = list(
+                dict.fromkeys(
+                    i - 1
+                    for i in chosen_index
+                    if isinstance(i, int) and not isinstance(i, bool) and 1 <= i <= group_count
+                )
+            )
+            if not feature_index_list:
+                # Nothing usable was selected: fall back to all feature groups.
+                feature_index_list = list(range(group_count))
+
+        code = env.from_string(DEFAULT_SELECTION_CODE).render(feature_index_list=feature_index_list)
+        exp.experiment_workspace.inject_code(**{KG_SELECT_MAPPING[target_model_type]: code})
+        return exp
@@ -117,9 +117,9 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
             "last_hypothesis": trace.hist[-1][0] if trace.hist else None,
             "last_task_and_code": last_task_and_code,
             "last_result": trace.hist[-1][1].result if trace.hist else None,
-            "sota_task_and_code": exp.based_experiments[-1].experiment_workspace.data_description
-            if exp.based_experiments
-            else None,
+            "sota_task_and_code": (
+                exp.based_experiments[-1].experiment_workspace.data_description if exp.based_experiments else None
+            ),
             "sota_result": exp.based_experiments[-1].result if exp.based_experiments else None,
             "hypothesis": hypothesis,
             "exp": exp,
@@ -150,6 +150,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
         decision = convert2bool(response_json.get("Replace Best Result", "no"))
 
         experiment_feedback = {
+            "current_competition": self.scen.get_competition_full_desc(),
             "hypothesis_text": hypothesis_text,
             "current_result": current_result,
             "model_code": model_code,
@@ -163,7 +164,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
             self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
             self.scen.vector_base.save()
         elif self.scen.if_using_graph_rag:
-            trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
+            trace.knowledge_base.add_document(experiment_feedback, self.scen)
 
         return HypothesisFeedback(
             observations=observations,
 
@@ -3,17 +3,12 @@
 import shutil
 from pathlib import Path
 
-from jinja2 import Environment, StrictUndefined
-
-from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
-from rdagent.components.coder.factor_coder.factor import FactorTask
-from rdagent.components.coder.model_coder.model import ModelTask
 from rdagent.components.runner import CachedRunner
 from rdagent.components.runner.conf import RUNNER_SETTINGS
 from rdagent.core.exception import CoderError, FactorEmptyError, ModelEmptyError
 from rdagent.core.experiment import ASpecificExp
 from rdagent.core.prompts import Prompts
-from rdagent.oai.llm_utils import APIBackend, md5_hash
+from rdagent.oai.llm_utils import md5_hash
 from rdagent.scenarios.kaggle.experiment.kaggle_experiment import (
     KGFactorExperiment,
     KGModelExperiment,
@@ -32,48 +27,6 @@ def get_cache_key(self, exp: ASpecificExp) -> str:
         codes = "\n".join(codes)
         return md5_hash(codes)
 
-    def extract_model_task_from_code(self, code: str) -> str:
-        sys_prompt = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["extract_model_task_from_code"]["system"])
-            .render()
-        )
-
-        user_prompt = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["extract_model_task_from_code"]["user"])
-            .render(file_content=code)
-        )
-
-        model_task_description = APIBackend().build_messages_and_create_chat_completion(
-            user_prompt=user_prompt,
-            system_prompt=sys_prompt,
-            json_mode=True,
-        )
-
-        try:
-            response_json_analysis = json.loads(model_task_description)
-            task_desc = f"""name: {response_json_analysis['name']}
-        description: {response_json_analysis['description']}
-        """
-            task_desc += (
-                f"formulation: {response_json_analysis['formulation']}\n"
-                if response_json_analysis.get("formulation")
-                else ""
-            )
-            task_desc += f"architecture: {response_json_analysis['architecture']}\n"
-            task_desc += (
-                f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
-                if response_json_analysis.get("variables")
-                else ""
-            )
-            task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
-            task_desc += f"model_type: {response_json_analysis['model_type']}\n"
-        except json.JSONDecodeError:
-            task_desc = "Failed to parse LLM's response as JSON"
-
-        return task_desc
-
     def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
         """
         For the initial development, the experiment serves as a benchmark for feature engineering.
@@ -89,39 +42,6 @@ def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorE
         result = exp.experiment_workspace.execute(run_env=env_to_use)
 
         exp.result = result
-        sub_task = FactorTask(
-            factor_name="original features", factor_description="here is the original features", factor_formulation=""
-        )
-
-        org_data_path = (
-            Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / KAGGLE_IMPLEMENT_SETTING.competition / "X_valid.pkl"
-        )
-        with open(org_data_path, "rb") as f:
-            org_data = pickle.load(f)
-        feature_shape = org_data.shape[-1]
-        exp.experiment_workspace.data_description.append((sub_task.get_task_information(), feature_shape))
-
-        model_map = {
-            "XGBoost": "model_xgboost.py",
-            "RandomForest": "model_randomforest.py",
-            "LightGBM": "model_lightgbm.py",
-            "NN": "model_nn.py",
-        }
-
-        workspace_path = exp.experiment_workspace.workspace_path / "model"
-
-        for model_name, model_file in model_map.items():
-            model_file_path = workspace_path / model_file
-
-            if model_file_path.exists():
-                model_description = (
-                    self.extract_model_task_from_code(model_file_path.read_text())
-                    + f"""code: {model_file_path.read_text()}"""
-                )
-            else:
-                model_description = ""
-
-            exp.experiment_workspace.model_description[model_name] = model_description
 
         if RUNNER_SETTINGS.cache_result:
             self.dump_cache_result(exp, result)
@@ -135,17 +55,15 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
             exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
 
         sub_ws = exp.sub_workspace_list[0]
-        # TODO: There's a possibility of generating a hybrid model (lightgbm + xgboost), which results in having two items in the model_type list.
-        model_type = sub_ws.target_task.model_type
-
-        if sub_ws.code_dict == {}:
-            raise ModelEmptyError("No model is implemented.")
-        else:
-            model_file_name = f"model/model_{model_type.lower()}.py"
-            exp.experiment_workspace.inject_code(**{model_file_name: sub_ws.code_dict["model.py"]})
+        if sub_ws is not None:
+            # TODO: There's a possibility of generating a hybrid model (lightgbm + xgboost), which results in having two items in the model_type list.
+            model_type = sub_ws.target_task.model_type
 
-            model_description = sub_ws.target_task.get_task_information()
-            exp.experiment_workspace.model_description[model_type] = model_description
+            if sub_ws.code_dict == {}:
+                raise ModelEmptyError("No model is implemented.")
+            else:
+                model_file_name = f"model/model_{model_type.lower()}.py"
+                exp.experiment_workspace.inject_code(**{model_file_name: sub_ws.code_dict["model.py"]})
 
         if RUNNER_SETTINGS.cache_result:
             cache_hit, result = self.get_cache_result(exp)
 
@@ -3,7 +3,6 @@
 
 import numpy as np  # linear algebra
 import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
-from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split
 
 
 
@@ -3,26 +3,15 @@
 from sklearn.ensemble import RandomForestRegressor
 
 
-def select(X: pd.DataFrame) -> pd.DataFrame:
-    """
-    Select relevant features. To be used in fit & predict function.
-    """
-    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
-    return X
-
-
 def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
     """
     Define and train the Random Forest model. Merge feature selection into the pipeline.
     """
     # Initialize the Random Forest model
     model = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1)
 
-    # Select features (if any feature selection is needed)
-    X_train_selected = select(X_train)
-
     # Fit the model
-    model.fit(X_train_selected, y_train)
+    model.fit(X_train, y_train)
 
     return model
 
@@ -31,10 +20,7 @@ def predict(model, X):
     """
     Keep feature selection's consistency and make predictions.
     """
-    # Select features (if any feature selection is needed)
-    X_selected = select(X)
-
     # Predict using the trained model
-    y_pred = model.predict(X_selected)
+    y_pred = model.predict(X)
 
     return y_pred