microsoft · WinstonLiyt · Sep 29, 2024 · Sep 29, 2024 · Sep 29, 2024
diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py
@@ -19,14 +19,14 @@
 from rdagent.core.utils import import_class
 from rdagent.log import rdagent_logger as logger
 from rdagent.log.time import measure_time
-from rdagent.scenarios.kaggle.experiment.utils import python_files_to_notebook
-from rdagent.scenarios.kaggle.kaggle_crawler import download_data
-from rdagent.scenarios.kaggle.proposal.proposal import (
+from rdagent.scenarios.kaggle.experiment.scenario import (
     KG_ACTION_FEATURE_ENGINEERING,
     KG_ACTION_FEATURE_PROCESSING,
     KG_ACTION_MODEL_FEATURE_SELECTION,
-    KGTrace,
 )
+from rdagent.scenarios.kaggle.experiment.utils import python_files_to_notebook
+from rdagent.scenarios.kaggle.kaggle_crawler import download_data
+from rdagent.scenarios.kaggle.proposal.proposal import KGTrace
 
 
 class KaggleRDLoop(RDLoop):

diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py
@@ -90,6 +90,7 @@ def evolve(
             ],
             n=RD_AGENT_SETTINGS.multi_proc_n,
         )
+        from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace
 
         for index, target_index in enumerate(to_be_finished_task_index):
             if evo.sub_workspace_list[target_index] is None:

diff --git a/rdagent/components/proposal/model_proposal.py b/rdagent/components/proposal/model_proposal.py
@@ -39,7 +39,7 @@ def gen(self, trace: Trace) -> ModelHypothesis:
             Environment(undefined=StrictUndefined)
             .from_string(ModelHypothesisGen.prompts["hypothesis_gen"]["system_prompt"])
             .render(
-                targets="feature engineering and model building",
+                targets="model tuning",
                 scenario=self.scen.get_scenario_all_desc(),
                 hypothesis_output_format=context_dict["hypothesis_output_format"],
                 hypothesis_specification=context_dict["hypothesis_specification"],
@@ -49,7 +49,7 @@ def gen(self, trace: Trace) -> ModelHypothesis:
             Environment(undefined=StrictUndefined)
             .from_string(ModelHypothesisGen.prompts["hypothesis_gen"]["user_prompt"])
             .render(
-                targets="feature engineering and model building",
+                targets="model tuning",
                 RAG=context_dict["RAG"],
             )
         )

diff --git a/rdagent/components/proposal/prompts.yaml b/rdagent/components/proposal/prompts.yaml
@@ -1,19 +1,24 @@
 hypothesis_gen:
   system_prompt: |-
-    The user is trying to generate new hypothesis on the {{targets}} in data-driven research and development.
-    The {{targets}} are used in a certain scenario, the scenario is as follows:
-    {{ scenario }}
-    The user has made several hypothesis on this scenario and did several evaluation on them. The user will provide this information to you. Check if a new hypothesis has already been proposed. If it is already generated and you agree with it, just use it. If you don't agree, generate a better one.
+    The user is working on generating new hypotheses for the {{targets}} in a data-driven research and development process. 
+    The {{targets}} are used in the following scenario:
+    {{scenario}}
+    The user has already proposed several hypotheses and conducted evaluations on them. This information will be provided to you. Your task is to check whether a similar hypothesis has already been generated. 
+    If one exists and you agree with it, feel free to use it. If you disagree, please generate an improved version.
     {% if hypothesis_specification %}
-    To help you generate new hypothesis, the user has prepared some additional information for you. You should use this information to help generate new {{targets}}.
-    Here are the specifications: {{ hypothesis_specification }}
+    To assist you in formulating new hypotheses, the user has provided some additional information: {{hypothesis_specification}}.
+    **Important:** If the hypothesis_specification outlines the next steps you need to follow, ensure you adhere to those instructions.
     {% endif %}
-    Please generate the output following the format and specifications below:
+    Please generate the output using the following format and specifications:
     {{ hypothesis_output_format }}
 
   user_prompt: |-
-    {% if RAG %}To help you generate new {{targets}}, we have prepared the following information for you:
-    {{ RAG }}{% endif %}
+    {% if RAG %}
+    To assist you in generating new {{targets}}, we have provided the following information: {{RAG}}.
+    **Note:** The provided RAG is for reference only. 
+    You must carefully assess whether the RAG aligns with the {{targets}}. 
+    If it does not, it should not be used. Exercise caution and make your own judgment.
+    {% endif %}
     Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than general knowledge.
 
 hypothesis2experiment:
@@ -35,8 +40,4 @@ hypothesis2experiment:
     {{ target_hypothesis }}
     The former hypothesis and the corresponding feedbacks are as follows:
     {{ hypothesis_and_feedback }}
-    The former proposed {{targets}} on similar hypothesis are as follows:
-    {{ target_list }}
-    To help you generate new {{targets}}, we have prepared the following information for you:
-    {{ RAG }}
     Please generate the new {{targets}} based on the information above.
diff --git a/rdagent/core/knowledge_base.py b/rdagent/core/knowledge_base.py
@@ -14,12 +14,14 @@ def load(self) -> None:
         if self.path is not None and self.path.exists():
             with self.path.open("rb") as f:
                 self.__dict__.update(
-                    pickle.load(f),
+                    pickle.load(f).__dict__,
                 )  # TODO: because we need to align with init function, we need a less hacky way to do this
 
     def dump(self) -> None:
+        """Pickle this object to `self.path`, creating parent directories; log a warning if no path is set."""
         if self.path is not None:
             self.path.parent.mkdir(parents=True, exist_ok=True)
-            pickle.dump(self.__dict__, self.path.open("wb"))
+            with self.path.open("wb") as f:  # close the handle so the pickle is fully flushed to disk
+                pickle.dump(self, f)
         else:
             logger.warning("KnowledgeBase path is not set, dump failed.")
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/fea_share_preprocess.py
@@ -23,6 +23,7 @@ def preprocess_script():
 
     # train
     train = pd.read_csv("/kaggle/input/train.csv")
+    train = train.drop(["id"], axis=1)
     train["store_sqft"] = train["store_sqft"].astype("category")
     train["salad"] = (train["salad_bar"] + train["prepared_food"]) / 2
     train["log_cost"] = np.log1p(train["cost"])

diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -20,6 +20,17 @@
 
 prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")
 
+KG_ACTION_FEATURE_PROCESSING = "Feature processing"
+KG_ACTION_FEATURE_ENGINEERING = "Feature engineering"
+KG_ACTION_MODEL_FEATURE_SELECTION = "Model feature selection"
+KG_ACTION_MODEL_TUNING = "Model tuning"
+KG_ACTION_LIST = [
+    KG_ACTION_FEATURE_PROCESSING,
+    KG_ACTION_FEATURE_ENGINEERING,
+    KG_ACTION_MODEL_FEATURE_SELECTION,
+    KG_ACTION_MODEL_TUNING,
+]
+
 
 class KGScenario(Scenario):
     def __init__(self, competition: str) -> None:
@@ -54,6 +65,15 @@ def __init__(self, competition: str) -> None:
         self._simulator = self.simulator
         self._background = self.background
 
+        # Per-action counters and reward estimates, keyed by the KG_ACTION_* constants.
+        self.action_counts = dict.fromkeys(KG_ACTION_LIST, 0)
+        self.reward_estimates = {action: 0.0 for action in KG_ACTION_LIST}
+        # Seed non-zero initial estimates for the two model actions (use the constants, not duplicated literals).
+        self.reward_estimates[KG_ACTION_MODEL_FEATURE_SELECTION] = 0.2
+        self.reward_estimates[KG_ACTION_MODEL_TUNING] = 1.0
+        self.confidence_parameter = 1.0
+        self.initial_performance = 0.0
+
     def _analysis_competition_description(self):
         sys_prompt = (
             Environment(undefined=StrictUndefined)

diff --git a/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml b/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml
@@ -6,7 +6,7 @@ extract_kaggle_knowledge_prompts:
 
     Please provide the analysis in the following JSON format:
     {
-      "content": "all provided content",
+      "content": "Put the provided content here",
       "title": "extracted title, if available",
       "competition_name": "extracted competition name",
       "task_category": "extracted task type, e.g., Classification, Regression",
@@ -80,4 +80,14 @@ extract_knowledge_graph_from_document:
     If you find no valuable insights in the document, please return an empty dict.
 
   user: |-
-    Document content: {{ document_content }}
+    Document content: {{ document_content }}
+
+refine_with_LLM:
+  system: |-
+    You are an experienced data science expert and an assistant, helping the user evaluate and improve content.
+
+  user: |-
+    Here is the target: {{ target }}. 
+    Please evaluate whether the following RAG query result aligns with the target. 
+    If it does not, simply respond with "There are no relevant RAG results to support."
+    RAG query result: {{ text }}.
diff --git a/rdagent/scenarios/kaggle/knowledge_management/vector_base.py b/rdagent/scenarios/kaggle/knowledge_management/vector_base.py
@@ -4,8 +4,10 @@
 
 import pandas as pd
 from _pytest.cacheprovider import json
+from jinja2 import Environment, StrictUndefined
 
 from rdagent.components.knowledge_management.vector_base import Document, PDVectorBase
+from rdagent.core.prompts import Prompts
 from rdagent.log import rdagent_logger as logger
 from rdagent.oai.llm_utils import APIBackend
 from rdagent.scenarios.kaggle.knowledge_management.extract_knowledge import (
@@ -225,12 +227,14 @@ def add_experience_to_vector_base(self, experiment_feedback=None):
             document.create_embedding()
             self.add(document)
 
-    def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: float = 0.1):
+    def search_experience(self, target: str, query: str, topk_k: int = 5, similarity_threshold: float = 0.1):
         """
-        Search for Kaggle experience posts related to the query
+        Search for Kaggle experience posts related to the query; the target is embedded in the search query and each hit is then assessed against it by the LLM.
 
         Parameters:
         ----------
+        target: str
+            The target context to refine the search query.
         query: str
             The search query to find relevant experience posts.
         topk_k: int, optional
@@ -243,15 +247,65 @@ def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: f
         List[KGKnowledgeMetaData], List[float]:
             A list of the most relevant documents and their similarities.
         """
-        search_results, similarities = super().search(query, topk_k=topk_k, similarity_threshold=similarity_threshold)
 
+        # Modify the query to include the target
+        modified_query = f"The target is {target}. And I need you to query {query} based on the {target}."
+
+        # First, search based on the modified query
+        search_results, similarities = super().search(
+            modified_query, topk_k=topk_k, similarity_threshold=similarity_threshold
+        )
+
+        # Have the LLM assess every hit against the target; a non-empty reply replaces the document content
         kaggle_docs = []
         for result in search_results:
             kg_doc = KGKnowledgeDocument().from_dict(result.__dict__)
+
+            gpt_feedback = self.refine_with_LLM(target, kg_doc)
+            if gpt_feedback:
+                kg_doc.content = gpt_feedback
+
             kaggle_docs.append(kg_doc)
 
         return kaggle_docs, similarities
 
+    def refine_with_LLM(self, target: str, text: Union[str, "KGKnowledgeDocument"]) -> str:
+        """
+        Ask the LLM whether a RAG query result aligns with the target.
+
+        Parameters:
+        ----------
+        target: str
+            The target the RAG query result is evaluated against.
+        text: str or KGKnowledgeDocument
+            The RAG query result rendered into the user prompt; `search_experience`
+            passes the whole document object, so the template receives its string form.
+
+        Returns:
+        -------
+        str:
+            The response returned by the chat-completion backend.
+        """
+        prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")
+
+        sys_prompt = (
+            Environment(undefined=StrictUndefined).from_string(prompt_dict["refine_with_LLM"]["system"]).render()
+        )
+
+        user_prompt = (
+            Environment(undefined=StrictUndefined)
+            .from_string(prompt_dict["refine_with_LLM"]["user"])
+            .render(target=target, text=text)
+        )
+
+        response = APIBackend().build_messages_and_create_chat_completion(
+            user_prompt=user_prompt,
+            system_prompt=sys_prompt,
+            json_mode=False,
+        )
+
+        return response
+
     def save(self, vector_df_path: Union[str, Path]):
         """
         Save the vector DataFrame to a file