feat: Added support for loading and storing RAG in Kaggle scenarios. #269

Merged
merged 9 commits on Sep 19, 2024
4 changes: 3 additions & 1 deletion rdagent/app/kaggle/conf.py
@@ -10,7 +10,7 @@ class Config:
env_prefix = "KG_"
"""Use `KG_` as prefix for environment variables"""
protected_namespaces = ()
"""Add 'model_' to the protected namespaces"""
"""Do not allow overriding of these namespaces"""

# 1) overriding the default
scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario"
@@ -42,5 +42,7 @@ class Config:

competition: str = ""

rag_path: str = "git_ignore_folder/rag"


KAGGLE_IMPLEMENT_SETTING = KaggleBasePropSetting()
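Since the settings class declares `env_prefix = "KG_"`, the new `rag_path` field should be overridable from the environment before the settings object is created. A minimal sketch of that assumption (the path is illustrative):

```python
# Minimal sketch, assuming pydantic-style settings read the KG_-prefixed
# environment at instantiation time; set the variable before importing the module.
import os

os.environ["KG_RAG_PATH"] = "git_ignore_folder/rag/kaggle_vector_base.pkl"  # illustrative path

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING

print(KAGGLE_IMPLEMENT_SETTING.rag_path)  # expected to print the overridden path
```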
7 changes: 7 additions & 0 deletions rdagent/app/kaggle/loop.py
@@ -17,6 +17,9 @@
from rdagent.core.scenario import Scenario
from rdagent.core.utils import import_class
from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
KaggleExperienceBase,
)
from rdagent.scenarios.kaggle.proposal.proposal import (
KG_ACTION_FEATURE_ENGINEERING,
KG_ACTION_FEATURE_PROCESSING,
@@ -29,6 +32,10 @@ def __init__(self, PROP_SETTING: BasePropSetting):
scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
logger.log_object(scen, tag="scenario")

self.vector_base = KaggleExperienceBase()
if KAGGLE_IMPLEMENT_SETTING.rag_path:
self.vector_base.load(KAGGLE_IMPLEMENT_SETTING.rag_path)

self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
logger.log_object(self.hypothesis_gen, tag="hypothesis generator")

20 changes: 17 additions & 3 deletions rdagent/scenarios/kaggle/developer/feedback.py
@@ -14,9 +14,12 @@
)
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.kaggle.knowledge_management.extract_knowledge import (
extract_knowledge_from_feedback,
)
from rdagent.utils import convert2bool

feedback_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
DIRNAME = Path(__file__).absolute().resolve().parent


@@ -84,7 +87,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
# Generate the system prompt
sys_prompt = (
Environment(undefined=StrictUndefined)
.from_string(feedback_prompts["factor_feedback_generation"]["system"])
.from_string(prompt_dict["factor_feedback_generation"]["system"])
.render(scenario=self.scen.get_scenario_all_desc())
)

@@ -97,7 +100,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
# Generate the user prompt
usr_prompt = (
Environment(undefined=StrictUndefined)
.from_string(feedback_prompts[prompt_key]["user"])
.from_string(prompt_dict[prompt_key]["user"])
.render(
hypothesis_text=hypothesis_text,
task_details=tasks_factors,
@@ -122,6 +125,17 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
reason = response_json.get("Reasoning", "No reasoning provided")
decision = convert2bool(response_json.get("Replace Best Result", "no"))

experiment_feedback = {
"hypothesis_text": hypothesis_text,
"current_result": current_result,
"tasks_factors": tasks_factors,
"observations": observations,
"hypothesis_evaluation": hypothesis_evaluation,
"reason": reason,
}

self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)

return HypothesisFeedback(
observations=observations,
hypothesis_evaluation=hypothesis_evaluation,
9 changes: 9 additions & 0 deletions rdagent/scenarios/kaggle/experiment/scenario.py
@@ -4,12 +4,16 @@
import pandas as pd
from jinja2 import Environment, StrictUndefined

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
from rdagent.core.prompts import Prompts
from rdagent.core.scenario import Scenario
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.kaggle.experiment.kaggle_experiment import KGFactorExperiment
from rdagent.scenarios.kaggle.kaggle_crawler import crawl_descriptions
from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
KaggleExperienceBase,
)

prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")

@@ -32,6 +36,11 @@ def __init__(self, competition: str) -> None:

self._background = self.background

# all competitions are based on the same vector base
self.vector_base = KaggleExperienceBase()
if KAGGLE_IMPLEMENT_SETTING.rag_path:
self.vector_base.load(KAGGLE_IMPLEMENT_SETTING.rag_path)

def _analysis_competition_description(self):
sys_prompt = (
Environment(undefined=StrictUndefined)
8 changes: 8 additions & 0 deletions rdagent/scenarios/kaggle/knowledge_management/README.md
@@ -0,0 +1,8 @@
## Usage

This folder implements a RAG-based knowledge base built around Kaggle competitions.
It lets you store Kaggle competition experience in the knowledge base, as well as experiment feedback from RD-Agent itself.

1. First, generate a knowledge base (in JSON format) by running the `main` function in `extract_knowledge.py`.
2. Then, create a vector base in `vector_base.py` and save it.
3. Finally, add the field `KG_RAG_PATH="xxx.pkl"` (the path to the saved vector base) in your `.env` file.
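For illustration, a minimal sketch of steps 1–2 using only the classes and methods visible in this diff (the paths are illustrative, not canonical project locations):

```python
from rdagent.scenarios.kaggle.knowledge_management.vector_base import KaggleExperienceBase

# Step 1 is assumed to have produced this JSON via the `main` entry in extract_knowledge.py.
kaggle_base = KaggleExperienceBase()
kaggle_base.load_kaggle_experience("git_ignore_folder/cases/kaggle_experience_results.json")
kaggle_base.add_experience_to_vector_base()

# Step 2: persist the vector base, then point KG_RAG_PATH at this file in your .env.
kaggle_base.save("git_ignore_folder/rag/kaggle_vector_base.pkl")
```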
rdagent/scenarios/kaggle/knowledge_management/extract_knowledge.py
@@ -10,7 +10,7 @@
prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")


def process_with_gpt(content: str):
def extract_knowledge_from_high_score_answers(content: str):
sys_prompt = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["extract_kaggle_knowledge_prompts"]["system"])
@@ -37,6 +37,36 @@ def process_with_gpt(content: str):
return response_json_analysis


def extract_knowledge_from_feedback(feedback_response: dict) -> dict:
"""
Extracts knowledge from LLM-generated feedback and structures it.
"""
sys_prompt = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["extract_kaggle_knowledge_from_feedback_prompts"]["system"])
.render()
)

user_prompt = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["extract_kaggle_knowledge_from_feedback_prompts"]["user"])
.render(experiment_strategy=feedback_response)
)

response_analysis = APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt,
system_prompt=sys_prompt,
json_mode=True,
)

try:
response_json_analysis = json.loads(response_analysis)
except json.JSONDecodeError:
response_json_analysis = {"error": "Failed to parse LLM's response as JSON"}

return response_json_analysis


def process_all_case_files(directory_path: str):
output_file = Path(directory_path) / "kaggle_experience_results.json"
json_output = []
@@ -46,8 +76,8 @@ def process_all_case_files(directory_path: str):

with open(file_path, "r", encoding="utf-8") as file:
content = file.read()
gpt_response = process_with_gpt(content)
json_output.append(gpt_response)
knowledge = extract_knowledge_from_high_score_answers(content)
json_output.append(knowledge)

with open(output_file, "w", encoding="utf-8") as json_file:
json.dump(json_output, json_file, ensure_ascii=False)
22 changes: 21 additions & 1 deletion rdagent/scenarios/kaggle/knowledge_management/prompts.yaml
@@ -16,4 +16,24 @@ extract_kaggle_knowledge_prompts:
}

user: |-
High-ranking Kaggle notebooks or competition strategies: {{ file_content }}
High-ranking Kaggle notebooks or competition strategies: {{ file_content }}

extract_kaggle_knowledge_from_feedback_prompts:
system: |-
You are a Kaggle competition expert with extensive experience in analyzing Kaggle notebooks and competition strategies.
Your task is to summarize or infer key information such as the competition name, task type, and specific techniques employed in the notebook or strategy.
For each provided content, you are expected to extract valuable insights and organize the analysis in the structured format outlined below.

Please provide the analysis in the following JSON format:
{
"content": "all provided content",
"title": "extracted title, if available",
"competition_name": "extracted competition name",
"task_category": "extracted task type, e.g., Classification, Regression",
"field": "field of focus, e.g., Feature Engineering, Modeling",
"ranking": "extracted ranking, if available",
"score": "extracted score or metric, if available"
}

user: |-
Experiment strategy: {{ experiment_strategy }}
31 changes: 29 additions & 2 deletions rdagent/scenarios/kaggle/knowledge_management/vector_base.py
@@ -12,6 +12,9 @@
)
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.kaggle.knowledge_management.extract_knowledge import (
extract_knowledge_from_feedback,
)


class KGKnowledgeMetaData(KnowledgeMetaData):
@@ -178,10 +181,32 @@ def load_kaggle_experience(self, kaggle_experience_path: Union[str, Path]):
logger.error(f"Kaggle experience data not found at {kaggle_experience_path}")
self.kaggle_experience_data = []

def add_experience_to_vector_base(self):
def add_experience_to_vector_base(self, experiment_feedback=None):
"""
Process the Kaggle experience data and add relevant information to the vector base
Process Kaggle experience data or experiment feedback and add relevant information to the vector base.

Args:
experiment_feedback (dict, optional): A dictionary containing experiment feedback.
If provided, this feedback will be processed and added to the vector base.
"""
# If experiment feedback is provided, extract relevant knowledge and add it to the vector base
if experiment_feedback:
extracted_knowledge = extract_knowledge_from_feedback(experiment_feedback)

document = KGKnowledgeMetaData(
content=experiment_feedback.get("hypothesis_text", ""),
label="Experiment Feedback",
competition_name="Experiment Result",
task_category=experiment_feedback.get("tasks_factors", "General Task"),
field="Research Feedback",
ranking=None,
score=experiment_feedback.get("current_result", None),
)
document.create_embedding()
self.add(document)
return

# Process Kaggle experience data
for experience in self.kaggle_experience_data:
content = experience.get("content", "")
label = experience.get("title", "Kaggle Experience")
@@ -238,6 +263,8 @@ def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: f

kaggle_base.add_experience_to_vector_base()

kaggle_base.save("git_ignore_folder/experience/tabular_cases/kaggle_vector_base.pkl")

print(f"There are {kaggle_base.shape()[0]} records in the vector base.")

search_results, similarities = kaggle_base.search_experience(query="image classification", topk_k=3)
16 changes: 13 additions & 3 deletions rdagent/scenarios/kaggle/proposal/proposal.py
@@ -4,8 +4,10 @@

from jinja2 import Environment, StrictUndefined

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.components.coder.factor_coder.factor import FactorTask
from rdagent.components.coder.model_coder.model import ModelExperiment, ModelTask
from rdagent.components.knowledge_management.vector_base import VectorBase
from rdagent.components.proposal.model_proposal import (
ModelHypothesis,
ModelHypothesis2Experiment,
@@ -17,6 +19,9 @@
KGFactorExperiment,
KGModelExperiment,
)
from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
KaggleExperienceBase,
)

prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")

@@ -68,22 +73,27 @@ class KGHypothesisGen(ModelHypothesisGen):

.. code-block:: python

class XXXDMModelHypothesisGen(DMModelHypothesisGen):
class KGHypothesisGen(ModelHypothesisGen):
prompts: Prompts = a_specifc_prompt_dict
"""

def __init__(self, scen: Scenario) -> Tuple[dict, bool]:
def __init__(self, scen: Scenario, knowledge: VectorBase = None) -> Tuple[dict, bool]:
super().__init__(scen)
self.scen.vector_base.save(KAGGLE_IMPLEMENT_SETTING.rag_path)

def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
hypothesis_feedback = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
)

rag_results, _ = self.scen.vector_base.search_experience(hypothesis_feedback, topk_k=5)
rag_content = "\n".join([doc.content for doc in rag_results])

context_dict = {
"hypothesis_and_feedback": hypothesis_feedback,
"RAG": None,
"RAG": rag_content,
"hypothesis_output_format": prompt_dict["hypothesis_output_format"],
"hypothesis_specification": None,
}
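Taken together, these changes wire experiment feedback into a persistent RAG store and read it back during hypothesis generation. A rough end-to-end sketch, restricted to the methods shown in this diff (the paths, feedback values, and query string are illustrative):

```python
from rdagent.scenarios.kaggle.knowledge_management.vector_base import KaggleExperienceBase

# Load a previously saved vector base (see the README steps above).
vector_base = KaggleExperienceBase()
vector_base.load("git_ignore_folder/rag/kaggle_vector_base.pkl")

# Store feedback from a finished experiment; the keys mirror the dict built in feedback.py.
vector_base.add_experience_to_vector_base(
    experiment_feedback={
        "hypothesis_text": "Target-encoding categorical features improves the CV score.",
        "current_result": "0.812",
        "tasks_factors": "Feature Engineering",
        "observations": "CV improved on all folds.",
        "hypothesis_evaluation": "Supported by the current run.",
        "reason": "Encoded features add signal the raw categories lack.",
    }
)

# Retrieve related experience when preparing the next hypothesis context.
rag_results, _ = vector_base.search_experience("feature engineering for tabular data", topk_k=5)
rag_content = "\n".join(doc.content for doc in rag_results)

# Persist the updated base so later runs can reuse it.
vector_base.save("git_ignore_folder/rag/kaggle_vector_base.pkl")
```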