Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: fix some bugs in rag #399

Merged
merged 2 commits into from
Sep 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions rdagent/app/kaggle/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
from rdagent.core.utils import import_class
from rdagent.log import rdagent_logger as logger
from rdagent.log.time import measure_time
from rdagent.scenarios.kaggle.experiment.utils import python_files_to_notebook
from rdagent.scenarios.kaggle.kaggle_crawler import download_data
from rdagent.scenarios.kaggle.proposal.proposal import (
from rdagent.scenarios.kaggle.experiment.scenario import (
KG_ACTION_FEATURE_ENGINEERING,
KG_ACTION_FEATURE_PROCESSING,
KG_ACTION_MODEL_FEATURE_SELECTION,
KGTrace,
)
from rdagent.scenarios.kaggle.experiment.utils import python_files_to_notebook
from rdagent.scenarios.kaggle.kaggle_crawler import download_data
from rdagent.scenarios.kaggle.proposal.proposal import KGTrace


class KaggleRDLoop(RDLoop):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def evolve(
],
n=RD_AGENT_SETTINGS.multi_proc_n,
)
from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace

for index, target_index in enumerate(to_be_finished_task_index):
if evo.sub_workspace_list[target_index] is None:
Expand Down
4 changes: 2 additions & 2 deletions rdagent/components/proposal/model_proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def gen(self, trace: Trace) -> ModelHypothesis:
Environment(undefined=StrictUndefined)
.from_string(ModelHypothesisGen.prompts["hypothesis_gen"]["system_prompt"])
.render(
targets="feature engineering and model building",
targets="model tuning",
scenario=self.scen.get_scenario_all_desc(),
hypothesis_output_format=context_dict["hypothesis_output_format"],
hypothesis_specification=context_dict["hypothesis_specification"],
Expand All @@ -49,7 +49,7 @@ def gen(self, trace: Trace) -> ModelHypothesis:
Environment(undefined=StrictUndefined)
.from_string(ModelHypothesisGen.prompts["hypothesis_gen"]["user_prompt"])
.render(
targets="feature engineering and model building",
targets="model tuning",
RAG=context_dict["RAG"],
)
)
Expand Down
27 changes: 14 additions & 13 deletions rdagent/components/proposal/prompts.yaml
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
hypothesis_gen:
system_prompt: |-
The user is trying to generate new hypothesis on the {{targets}} in data-driven research and development.
The {{targets}} are used in a certain scenario, the scenario is as follows:
{{ scenario }}
The user has made several hypothesis on this scenario and did several evaluation on them. The user will provide this information to you. Check if a new hypothesis has already been proposed. If it is already generated and you agree with it, just use it. If you don't agree, generate a better one.
The user is working on generating new hypotheses for the {{targets}} in a data-driven research and development process.
The {{targets}} are used in the following scenario:
{{scenario}}
The user has already proposed several hypotheses and conducted evaluations on them. This information will be provided to you. Your task is to check whether a similar hypothesis has already been generated.
If one exists and you agree with it, feel free to use it. If you disagree, please generate an improved version.
{% if hypothesis_specification %}
To help you generate new hypothesis, the user has prepared some additional information for you. You should use this information to help generate new {{targets}}.
Here are the specifications: {{ hypothesis_specification }}
To assist you in formulating new hypotheses, the user has provided some additional information: {{hypothesis_specification}}.
**Important:** If the hypothesis_specification outlines the next steps you need to follow, ensure you adhere to those instructions.
{% endif %}
Please generate the output following the format and specifications below:
Please generate the output using the following format and specifications:
{{ hypothesis_output_format }}

user_prompt: |-
{% if RAG %}To help you generate new {{targets}}, we have prepared the following information for you:
{{ RAG }}{% endif %}
{% if RAG %}
To assist you in generating new {{targets}}, we have provided the following information: {{RAG}}.
**Note:** The provided RAG is for reference only.
You must carefully assess whether the RAG aligns with the {{targets}}.
If it does not, it should not be used. Exercise caution and make your own judgment.
{% endif %}
Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than general knowledge.

hypothesis2experiment:
Expand All @@ -35,8 +40,4 @@ hypothesis2experiment:
{{ target_hypothesis }}
The former hypothesis and the corresponding feedbacks are as follows:
{{ hypothesis_and_feedback }}
The former proposed {{targets}} on similar hypothesis are as follows:
{{ target_list }}
To help you generate new {{targets}}, we have prepared the following information for you:
{{ RAG }}
Please generate the new {{targets}} based on the information above.
4 changes: 2 additions & 2 deletions rdagent/core/knowledge_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ def load(self) -> None:
if self.path is not None and self.path.exists():
with self.path.open("rb") as f:
self.__dict__.update(
pickle.load(f),
pickle.load(f).__dict__,
) # TODO: because we need to align with init function, we need a less hacky way to do this

def dump(self) -> None:
    """
    Pickle this knowledge base to ``self.path``.

    Missing parent directories are created first. When ``self.path`` is
    ``None`` nothing is written and a warning is logged instead.
    """
    if self.path is None:
        logger.warning("KnowledgeBase path is not set, dump failed.")
        return

    self.path.parent.mkdir(parents=True, exist_ok=True)
    # The whole object is pickled (not just its ``__dict__``); the context
    # manager guarantees the file handle is flushed and closed.
    with self.path.open("wb") as f:
        pickle.dump(self, f)
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def preprocess_script():

# train
train = pd.read_csv("/kaggle/input/train.csv")
train = train.drop(["id"], axis=1)
train["store_sqft"] = train["store_sqft"].astype("category")
train["salad"] = (train["salad_bar"] + train["prepared_food"]) / 2
train["log_cost"] = np.log1p(train["cost"])
Expand Down
18 changes: 18 additions & 0 deletions rdagent/scenarios/kaggle/experiment/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,17 @@

prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")

# Human-readable labels for the kinds of action tracked in a Kaggle scenario.
# The string values double as dictionary keys (see ``action_counts`` and
# ``reward_estimates`` in ``KGScenario.__init__``), so they must stay stable.
KG_ACTION_FEATURE_PROCESSING = "Feature processing"
KG_ACTION_FEATURE_ENGINEERING = "Feature engineering"
KG_ACTION_MODEL_FEATURE_SELECTION = "Model feature selection"
KG_ACTION_MODEL_TUNING = "Model tuning"
# Every action label, in a fixed order; used to seed the per-action
# counters and reward estimates.
KG_ACTION_LIST = [
KG_ACTION_FEATURE_PROCESSING,
KG_ACTION_FEATURE_ENGINEERING,
KG_ACTION_MODEL_FEATURE_SELECTION,
KG_ACTION_MODEL_TUNING,
]


class KGScenario(Scenario):
def __init__(self, competition: str) -> None:
Expand Down Expand Up @@ -54,6 +65,13 @@ def __init__(self, competition: str) -> None:
self._simulator = self.simulator
self._background = self.background

self.action_counts = dict.fromkeys(KG_ACTION_LIST, 0)
self.reward_estimates = {action: 0.0 for action in KG_ACTION_LIST}
self.reward_estimates["Model feature selection"] = 0.2
self.reward_estimates["Model tuning"] = 1.0
self.confidence_parameter = 1.0
self.initial_performance = 0.0

def _analysis_competition_description(self):
sys_prompt = (
Environment(undefined=StrictUndefined)
Expand Down
14 changes: 12 additions & 2 deletions rdagent/scenarios/kaggle/knowledge_management/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ extract_kaggle_knowledge_prompts:

Please provide the analysis in the following JSON format:
{
"content": "all provided content",
"content": "Put the provided content here",
"title": "extracted title, if available",
"competition_name": "extracted competition name",
"task_category": "extracted task type, e.g., Classification, Regression",
Expand Down Expand Up @@ -80,4 +80,14 @@ extract_knowledge_graph_from_document:
If you find no valuable insights in the document, please return an empty dict.

user: |-
Document content: {{ document_content }}
Document content: {{ document_content }}

refine_with_LLM:
system: |-
You are an experienced data science expert and an assistant, helping the user evaluate and improve content.

user: |-
Here is the target: {{ target }}.
Please evaluate whether the following RAG query result aligns with the target.
If it does not, simply respond with "There are no relevant RAG results to support."
RAG query result: {{ text }}.
44 changes: 41 additions & 3 deletions rdagent/scenarios/kaggle/knowledge_management/vector_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@

import pandas as pd
from _pytest.cacheprovider import json
from jinja2 import Environment, StrictUndefined

from rdagent.components.knowledge_management.vector_base import Document, PDVectorBase
from rdagent.core.prompts import Prompts
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.kaggle.knowledge_management.extract_knowledge import (
Expand Down Expand Up @@ -225,12 +227,14 @@ def add_experience_to_vector_base(self, experiment_feedback=None):
document.create_embedding()
self.add(document)

def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: float = 0.1):
def search_experience(self, target: str, query: str, topk_k: int = 5, similarity_threshold: float = 0.1):
"""
Search for Kaggle experience posts related to the query
Search for Kaggle experience posts related to the query, initially filtered by the target.

Parameters:
----------
target: str
The target context to refine the search query.
query: str
The search query to find relevant experience posts.
topk_k: int, optional
Expand All @@ -243,15 +247,49 @@ def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: f
List[KGKnowledgeMetaData], List[float]:
A list of the most relevant documents and their similarities.
"""
search_results, similarities = super().search(query, topk_k=topk_k, similarity_threshold=similarity_threshold)

# Modify the query to include the target
modified_query = f"The target is {target}. And I need you to query {query} based on the {target}."

# First, search based on the modified query
search_results, similarities = super().search(
modified_query, topk_k=topk_k, similarity_threshold=similarity_threshold
)

# Ask the LLM to judge every hit against the target; a non-empty reply replaces that document's content
kaggle_docs = []
for result in search_results:
kg_doc = KGKnowledgeDocument().from_dict(result.__dict__)

gpt_feedback = self.refine_with_LLM(target, kg_doc)
if gpt_feedback:
kg_doc.content = gpt_feedback

kaggle_docs.append(kg_doc)

return kaggle_docs, similarities

def refine_with_LLM(self, target: str, text: str) -> str:
    """
    Ask the LLM whether a RAG query result aligns with the given target.

    Parameters:
    ----------
    target: str
        Rendered into the ``refine_with_LLM`` user prompt as ``target``.
    text: str
        Rendered into the same prompt as ``text`` (the RAG query result).
        NOTE(review): ``search_experience`` passes a document object here
        rather than a plain string -- confirm the intended type.

    Returns:
    -------
    str:
        The chat completion returned by the API backend, unmodified.
    """
    templates = Prompts(file_path=Path(__file__).parent / "prompts.yaml")["refine_with_LLM"]
    jinja_env = Environment(undefined=StrictUndefined)

    # The system prompt takes no variables; the user prompt needs both values.
    system_message = jinja_env.from_string(templates["system"]).render()
    user_message = jinja_env.from_string(templates["user"]).render(target=target, text=text)

    return APIBackend().build_messages_and_create_chat_completion(
        user_prompt=user_message,
        system_prompt=system_message,
        json_mode=False,
    )

def save(self, vector_df_path: Union[str, Path]):
"""
Save the vector DataFrame to a file
Expand Down
Loading