Skip to content

Commit cf1292d

Browse files
feat: implement isolated model feature selection loop (#370)
* rename meta_tpl * use an isolated coder to deal with model feature selection and refine the structure * fix CI * fix: fix some errors in scenario.py, proposal.py and runner.py and several complex competition scenarios (#365) * fix several bugs in proposal and runner * fix a bug in feedback-prize-english-language-learning * fix some bugs and templates * fix the bug in optiver and nlp problem * delete unnecessary code * remove unnecessary code * complete forest and s4e8 * push * feedback & s4e8 & forest * optiver finished * s3e11 & s3e26 * s4e9 finished * sf-crime finished * the last one finished --------- Co-authored-by: WinstonLiyt <[email protected]> Co-authored-by: WinstonLiyte <[email protected]>
1 parent 2e383b1 commit cf1292d

File tree

72 files changed

+800
-955
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

72 files changed

+800
-955
lines changed

rdagent/app/kaggle/conf.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ class Config:
3232
feature_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER"
3333
"""Feature Coder class"""
3434

35+
model_feature_selection_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelFeatureSelectionCoder"
36+
"""Model Feature Selection Coder class"""
37+
3538
model_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelCoSTEER"
3639
"""Model Coder class"""
3740

@@ -57,8 +60,6 @@ class Config:
5760

5861
if_action_choosing_based_on_UCB: bool = False
5962

60-
if_using_feature_selection: bool = False
61-
6263
if_using_graph_rag: bool = False
6364

6465
if_using_vector_rag: bool = False

rdagent/app/kaggle/loop.py

+7
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from rdagent.scenarios.kaggle.proposal.proposal import (
2424
KG_ACTION_FEATURE_ENGINEERING,
2525
KG_ACTION_FEATURE_PROCESSING,
26+
KG_ACTION_MODEL_FEATURE_SELECTION,
2627
KGTrace,
2728
)
2829

@@ -49,6 +50,10 @@ def __init__(self, PROP_SETTING: BasePropSetting):
4950

5051
self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen)
5152
logger.log_object(self.feature_coder, tag="feature coder")
53+
self.model_feature_selection_coder: Developer = import_class(PROP_SETTING.model_feature_selection_coder)(
54+
scen
55+
)
56+
logger.log_object(self.model_feature_selection_coder, tag="model feature selection coder")
5257
self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen)
5358
logger.log_object(self.model_coder, tag="model coder")
5459

@@ -67,6 +72,8 @@ def coding(self, prev_out: dict[str, Any]):
6772
with logger.tag("d"): # develop
6873
if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]:
6974
exp = self.feature_coder.develop(prev_out["exp_gen"])
75+
elif prev_out["propose"].action == KG_ACTION_MODEL_FEATURE_SELECTION:
76+
exp = self.model_feature_selection_coder.develop(prev_out["exp_gen"])
7077
else:
7178
exp = self.model_coder.develop(prev_out["exp_gen"])
7279
logger.log_object(exp.sub_workspace_list, tag="coder result")

rdagent/components/coder/model_coder/CoSTEER/evolving_strategy.py

+3-8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from rdagent.core.prompts import Prompts
2222
from rdagent.core.utils import multiprocessing_wrapper
2323
from rdagent.oai.llm_utils import APIBackend
24+
from rdagent.scenarios.kaggle.experiment.kaggle_experiment import KG_MODEL_MAPPING
2425

2526
coder_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
2627

@@ -41,14 +42,8 @@ def implement_one_model(
4142
current_code = ""
4243
sota_exp_code_dict = current_exp.based_experiments[-1].experiment_workspace.code_dict
4344
if target_task.version == 2:
44-
model_file_mapping = {
45-
"XGBoost": "model/model_xgboost.py",
46-
"RandomForest": "model/model_randomforest.py",
47-
"LightGBM": "model/model_lightgbm.py",
48-
"NN": "model/model_nn.py",
49-
}
50-
if model_type in model_file_mapping:
51-
current_code = sota_exp_code_dict.get(model_file_mapping[model_type], None)
45+
if model_type in KG_MODEL_MAPPING:
46+
current_code = sota_exp_code_dict.get(KG_MODEL_MAPPING[model_type], None)
5247
elif "model.py" in sota_exp_code_dict:
5348
current_code = sota_exp_code_dict["model.py"]
5449
else:

rdagent/components/coder/model_coder/model_execute_template_v2.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import pickle
44
import numpy as np
55
import pandas as pd
66
import torch
7-
from model import fit, predict, select
7+
from model import fit, predict
88

99
train_X = pd.DataFrame(np.random.randn(8, 30), columns=[f"{i}" for i in range(30)])
1010
train_y = pd.Series(np.random.randint(0, 2, 8))

rdagent/components/proposal/prompts.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ hypothesis_gen:
1212
{{ hypothesis_output_format }}
1313
1414
user_prompt: |-
15-
{% if hypothesis_and_feedback|length == 0 %} It is the first round of hypothesis generation. The user has no hypothesis on this scenario yet.
15+
{% if hypothesis_and_feedback|length == 0 %}It is the first round of hypothesis generation. The user has no hypothesis on this scenario yet.
1616
{% else %}It is not the first round, the user has made several hypothesis on this scenario and did several evaluation on them.
1717
The former hypothesis and the corresponding feedbacks are as follows (focus on the last one & the new hypothesis that it provides and reasoning to see if you agree):
1818
{{ hypothesis_and_feedback }}
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,72 @@
1+
import json
2+
from pathlib import Path
3+
4+
from jinja2 import Environment, StrictUndefined
5+
16
from rdagent.components.coder.factor_coder.CoSTEER import FactorCoSTEER
27
from rdagent.components.coder.model_coder.CoSTEER import ModelCoSTEER
8+
from rdagent.core.developer import Developer
9+
from rdagent.core.prompts import Prompts
10+
from rdagent.oai.llm_utils import APIBackend
11+
from rdagent.scenarios.kaggle.experiment.kaggle_experiment import (
12+
KG_SELECT_MAPPING,
13+
KGModelExperiment,
14+
)
315

416
KGModelCoSTEER = ModelCoSTEER
517
KGFactorCoSTEER = FactorCoSTEER
18+
19+
prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
20+
21+
# Jinja2 template for the `select` function that is injected into a model's
# feature-selection file.
#
# Template variable:
#   feature_index_list -- either none (no group filtering line is rendered) or
#       a list of zero-based positions used to index `X.columns.levels[0]`.
#
# Behaviour of the rendered `select(X)`:
#   * frames whose columns have a single level are returned untouched;
#   * otherwise the chosen top-level groups are kept (when a list was given)
#     and the multi-level column labels are flattened into single strings by
#     joining the levels with "_".
#
# The flattening joins the stringified *levels* of every column tuple.  Joining
# `str(col)` directly would iterate over the characters of the tuple's repr and
# produce names such as "(_'_a_'_,_ _'_x_'_)".
#
# NOTE(review): the positions index `X.columns.levels[0]`; confirm that the
# order of that level matches the order of the feature groups shown to the LLM.
DEFAULT_SELECTION_CODE = """
import pandas as pd
def select(X: pd.DataFrame) -> pd.DataFrame:
    \"""
    Select relevant features. To be used in fit & predict function.
    \"""
    if X.columns.nlevels == 1:
        return X
    {% if feature_index_list is not none %}
    X = X.loc[:, X.columns.levels[0][{{feature_index_list}}].tolist()]
    {% endif %}
    X.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
    return X
"""
35+
36+
37+
class KGModelFeatureSelectionCoder(Developer[KGModelExperiment]):
    """Developer that generates the feature-selection code for one model type.

    The generated code is rendered from ``DEFAULT_SELECTION_CODE`` and injected
    into the experiment workspace under the file given by
    ``KG_SELECT_MAPPING[model_type]``.
    """

    @staticmethod
    def _to_zero_based(raw_indices: object, group_count: int) -> list[int]:
        """Turn the LLM's one-based group indices into safe zero-based positions.

        Entries that are not plain integers, fall outside ``1..group_count`` or
        repeat an earlier entry are dropped.  Without this, an index of ``0``
        would become ``-1`` and silently select the last group, and an
        out-of-range index would only fail later inside the generated code.

        Returns:
            The cleaned positions in the order given, or every position
            (``0..group_count - 1``) when nothing usable remains.
        """
        every_position = list(range(group_count))
        if not isinstance(raw_indices, list):
            return every_position

        positions: list[int] = []
        for value in raw_indices:
            # bool is a subclass of int; True/False are not meaningful indices.
            if isinstance(value, bool) or not isinstance(value, int):
                continue
            position = value - 1
            if 0 <= position < group_count and position not in positions:
                positions.append(position)
        return positions or every_position

    def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
        """Render and inject the ``select`` code for the experiment's model type.

        When the workspace describes exactly one feature group, no group
        filtering is rendered.  Otherwise the LLM is asked which groups to keep
        and its answer (key ``"Selected Group Index"``, one-based) is sanitised
        before being rendered into the template.

        Args:
            exp: Experiment whose first sub task carries the target model type.

        Returns:
            The same experiment, with the selection code injected into its
            workspace.
        """
        target_model_type = exp.sub_tasks[0].model_type
        assert (
            target_model_type in KG_SELECT_MAPPING
        ), f"No feature selection file is mapped for model type {target_model_type!r}"

        feature_groups = exp.experiment_workspace.data_description
        jinja_env = Environment(undefined=StrictUndefined)

        if len(feature_groups) == 1:
            # A single feature group leaves nothing to choose from.
            feature_index_list = None
        else:
            system_prompt = jinja_env.from_string(prompt_dict["model_feature_selection"]["system"]).render(
                scenario=self.scen.get_scenario_all_desc(),
                model_type=target_model_type,
            )
            user_prompt = jinja_env.from_string(prompt_dict["model_feature_selection"]["user"]).render(
                feature_groups=[desc[0] for desc in feature_groups]
            )

            response = json.loads(
                APIBackend().build_messages_and_create_chat_completion(
                    user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
                )
            )
            # A missing key or a non-object reply falls back to "keep every group".
            raw_indices = response.get("Selected Group Index") if isinstance(response, dict) else None
            feature_index_list = self._to_zero_based(raw_indices, len(feature_groups))

        code = jinja_env.from_string(DEFAULT_SELECTION_CODE).render(feature_index_list=feature_index_list)
        exp.experiment_workspace.inject_code(**{KG_SELECT_MAPPING[target_model_type]: code})
        return exp

rdagent/scenarios/kaggle/developer/feedback.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,9 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
117117
"last_hypothesis": trace.hist[-1][0] if trace.hist else None,
118118
"last_task_and_code": last_task_and_code,
119119
"last_result": trace.hist[-1][1].result if trace.hist else None,
120-
"sota_task_and_code": exp.based_experiments[-1].experiment_workspace.data_description
121-
if exp.based_experiments
122-
else None,
120+
"sota_task_and_code": (
121+
exp.based_experiments[-1].experiment_workspace.data_description if exp.based_experiments else None
122+
),
123123
"sota_result": exp.based_experiments[-1].result if exp.based_experiments else None,
124124
"hypothesis": hypothesis,
125125
"exp": exp,
@@ -150,6 +150,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
150150
decision = convert2bool(response_json.get("Replace Best Result", "no"))
151151

152152
experiment_feedback = {
153+
"current_competition": self.scen.get_competition_full_desc(),
153154
"hypothesis_text": hypothesis_text,
154155
"current_result": current_result,
155156
"model_code": model_code,
@@ -163,7 +164,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
163164
self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
164165
self.scen.vector_base.save()
165166
elif self.scen.if_using_graph_rag:
166-
trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
167+
trace.knowledge_base.add_document(experiment_feedback, self.scen)
167168

168169
return HypothesisFeedback(
169170
observations=observations,

rdagent/scenarios/kaggle/developer/runner.py

+9-91
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,12 @@
33
import shutil
44
from pathlib import Path
55

6-
from jinja2 import Environment, StrictUndefined
7-
8-
from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
9-
from rdagent.components.coder.factor_coder.factor import FactorTask
10-
from rdagent.components.coder.model_coder.model import ModelTask
116
from rdagent.components.runner import CachedRunner
127
from rdagent.components.runner.conf import RUNNER_SETTINGS
138
from rdagent.core.exception import CoderError, FactorEmptyError, ModelEmptyError
149
from rdagent.core.experiment import ASpecificExp
1510
from rdagent.core.prompts import Prompts
16-
from rdagent.oai.llm_utils import APIBackend, md5_hash
11+
from rdagent.oai.llm_utils import md5_hash
1712
from rdagent.scenarios.kaggle.experiment.kaggle_experiment import (
1813
KGFactorExperiment,
1914
KGModelExperiment,
@@ -32,48 +27,6 @@ def get_cache_key(self, exp: ASpecificExp) -> str:
3227
codes = "\n".join(codes)
3328
return md5_hash(codes)
3429

35-
def extract_model_task_from_code(self, code: str) -> str:
36-
sys_prompt = (
37-
Environment(undefined=StrictUndefined)
38-
.from_string(prompt_dict["extract_model_task_from_code"]["system"])
39-
.render()
40-
)
41-
42-
user_prompt = (
43-
Environment(undefined=StrictUndefined)
44-
.from_string(prompt_dict["extract_model_task_from_code"]["user"])
45-
.render(file_content=code)
46-
)
47-
48-
model_task_description = APIBackend().build_messages_and_create_chat_completion(
49-
user_prompt=user_prompt,
50-
system_prompt=sys_prompt,
51-
json_mode=True,
52-
)
53-
54-
try:
55-
response_json_analysis = json.loads(model_task_description)
56-
task_desc = f"""name: {response_json_analysis['name']}
57-
description: {response_json_analysis['description']}
58-
"""
59-
task_desc += (
60-
f"formulation: {response_json_analysis['formulation']}\n"
61-
if response_json_analysis.get("formulation")
62-
else ""
63-
)
64-
task_desc += f"architecture: {response_json_analysis['architecture']}\n"
65-
task_desc += (
66-
f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
67-
if response_json_analysis.get("variables")
68-
else ""
69-
)
70-
task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
71-
task_desc += f"model_type: {response_json_analysis['model_type']}\n"
72-
except json.JSONDecodeError:
73-
task_desc = "Failed to parse LLM's response as JSON"
74-
75-
return task_desc
76-
7730
def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
7831
"""
7932
For the initial development, the experiment serves as a benchmark for feature engineering.
@@ -89,39 +42,6 @@ def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorE
8942
result = exp.experiment_workspace.execute(run_env=env_to_use)
9043

9144
exp.result = result
92-
sub_task = FactorTask(
93-
factor_name="original features", factor_description="here is the original features", factor_formulation=""
94-
)
95-
96-
org_data_path = (
97-
Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / KAGGLE_IMPLEMENT_SETTING.competition / "X_valid.pkl"
98-
)
99-
with open(org_data_path, "rb") as f:
100-
org_data = pickle.load(f)
101-
feature_shape = org_data.shape[-1]
102-
exp.experiment_workspace.data_description.append((sub_task.get_task_information(), feature_shape))
103-
104-
model_map = {
105-
"XGBoost": "model_xgboost.py",
106-
"RandomForest": "model_randomforest.py",
107-
"LightGBM": "model_lightgbm.py",
108-
"NN": "model_nn.py",
109-
}
110-
111-
workspace_path = exp.experiment_workspace.workspace_path / "model"
112-
113-
for model_name, model_file in model_map.items():
114-
model_file_path = workspace_path / model_file
115-
116-
if model_file_path.exists():
117-
model_description = (
118-
self.extract_model_task_from_code(model_file_path.read_text())
119-
+ f"""code: {model_file_path.read_text()}"""
120-
)
121-
else:
122-
model_description = ""
123-
124-
exp.experiment_workspace.model_description[model_name] = model_description
12545

12646
if RUNNER_SETTINGS.cache_result:
12747
self.dump_cache_result(exp, result)
@@ -135,17 +55,15 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
13555
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
13656

13757
sub_ws = exp.sub_workspace_list[0]
138-
# TODO: There's a possibility of generating a hybrid model (lightgbm + xgboost), which results in having two items in the model_type list.
139-
model_type = sub_ws.target_task.model_type
140-
141-
if sub_ws.code_dict == {}:
142-
raise ModelEmptyError("No model is implemented.")
143-
else:
144-
model_file_name = f"model/model_{model_type.lower()}.py"
145-
exp.experiment_workspace.inject_code(**{model_file_name: sub_ws.code_dict["model.py"]})
58+
if sub_ws is not None:
59+
# TODO: There's a possibility of generating a hybrid model (lightgbm + xgboost), which results in having two items in the model_type list.
60+
model_type = sub_ws.target_task.model_type
14661

147-
model_description = sub_ws.target_task.get_task_information()
148-
exp.experiment_workspace.model_description[model_type] = model_description
62+
if sub_ws.code_dict == {}:
63+
raise ModelEmptyError("No model is implemented.")
64+
else:
65+
model_file_name = f"model/model_{model_type.lower()}.py"
66+
exp.experiment_workspace.inject_code(**{model_file_name: sub_ws.code_dict["model.py"]})
14967

15068
if RUNNER_SETTINGS.cache_result:
15169
cache_hit, result = self.get_cache_result(exp)

rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/fea_share_preprocess.py

-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
import numpy as np # linear algebra
55
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
6-
from sklearn.feature_extraction.text import TfidfVectorizer
76
from sklearn.model_selection import train_test_split
87

98

rdagent/scenarios/kaggle/experiment/feedback-prize-english-language-learning_template/model/model_randomforest.py

+2-16
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,15 @@
33
from sklearn.ensemble import RandomForestRegressor
44

55

6-
def select(X: pd.DataFrame) -> pd.DataFrame:
7-
"""
8-
Select relevant features. To be used in fit & predict function.
9-
"""
10-
# For now, we assume all features are relevant. This can be expanded to feature selection logic.
11-
return X
12-
13-
146
def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
157
"""
168
Define and train the Random Forest model. Merge feature selection into the pipeline.
179
"""
1810
# Initialize the Random Forest model
1911
model = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1)
2012

21-
# Select features (if any feature selection is needed)
22-
X_train_selected = select(X_train)
23-
2413
# Fit the model
25-
model.fit(X_train_selected, y_train)
14+
model.fit(X_train, y_train)
2615

2716
return model
2817

@@ -31,10 +20,7 @@ def predict(model, X):
3120
"""
3221
Keep feature selection's consistency and make predictions.
3322
"""
34-
# Select features (if any feature selection is needed)
35-
X_selected = select(X)
36-
3723
# Predict using the trained model
38-
y_pred = model.predict(X_selected)
24+
y_pred = model.predict(X)
3925

4026
return y_pred

0 commit comments

Comments
 (0)