Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Initial version if Graph RAG in KAGGLE scenario #301

Merged
merged 6 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ mypy:
# First deal with the core folder, and then gradually increase the scope of detection,
# and eventually realize the detection of the complete project.
ruff:
$(PIPRUN) ruff check rdagent/core --ignore FBT001,FBT002 # --exclude rdagent/scripts,git_ignore_folder
$(PIPRUN) ruff check rdagent/core --ignore FBT001,FBT002,I001 # --exclude rdagent/scripts,git_ignore_folder

# Check lint with toml-sort.
toml-sort:
Expand Down
7 changes: 7 additions & 0 deletions rdagent/app/kaggle/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ class Config:
scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario"
"""Scenario class for data mining model"""

knowledge_base: str = "" # TODO enable this line to use the knowledge base
# knowledge_base: str = "rdagent.scenarios.kaggle.knowledge_management.graph.KGKnowledgeGraph"
"""Knowledge base class"""

knowledge_base_path: str = "kg_graph.pkl"
"""Knowledge base path"""

hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen"
"""Hypothesis generation class"""

Expand Down
10 changes: 9 additions & 1 deletion rdagent/app/kaggle/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from rdagent.scenarios.kaggle.proposal.proposal import (
KG_ACTION_FEATURE_ENGINEERING,
KG_ACTION_FEATURE_PROCESSING,
KGTrace,
)


Expand All @@ -32,6 +33,13 @@ def __init__(self, PROP_SETTING: BasePropSetting):
scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
logger.log_object(scen, tag="scenario")

knowledge_base = (
import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen)
if PROP_SETTING.knowledge_base != ""
else None
)
logger.log_object(knowledge_base, tag="knowledge_base")

self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
logger.log_object(self.hypothesis_gen, tag="hypothesis generator")

Expand All @@ -50,7 +58,7 @@ def __init__(self, PROP_SETTING: BasePropSetting):

self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
logger.log_object(self.summarizer, tag="summarizer")
self.trace = Trace(scen=scen)
self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base)
super(RDLoop, self).__init__()

@measure_time
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
)
from rdagent.core.evolving_framework import (
EvolvableSubjects,
EvolvingKnowledgeBase,
EvoStep,
Knowledge,
KnowledgeBase,
QueriedKnowledge,
RAGStrategy,
)
Expand Down Expand Up @@ -71,12 +71,13 @@ def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_s
self.failed_task_info_set = failed_task_info_set


class FactorKnowledgeBaseV1(KnowledgeBase):
def __init__(self) -> None:
class FactorKnowledgeBaseV1(EvolvingKnowledgeBase):
def __init__(self, path: str | Path = None) -> None:
self.implementation_trace: dict[str, FactorKnowledge] = dict()
self.success_task_info_set: set[str] = set()

self.task_to_embedding = dict()
super().__init__(path)

def query(self) -> QueriedKnowledge | None:
"""
Expand Down Expand Up @@ -746,12 +747,12 @@ def dataset_query(
return factor_implementation_queried_graph_knowledge


class FactorGraphKnowledgeBase(KnowledgeBase):
def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> None:
class FactorGraphKnowledgeBase(EvolvingKnowledgeBase):
def __init__(self, init_component_list=None, path: str | Path = None, data_set_knowledge_path=None) -> None:
"""
Load knowledge, offer brief information of knowledge and common handle interfaces
"""
self.graph: UndirectedGraph = UndirectedGraph.load(Path.cwd() / "graph.pkl")
self.graph: UndirectedGraph = UndirectedGraph(Path.cwd() / "graph.pkl")
logger.info(f"Knowledge Graph loaded, size={self.graph.size()}")

if init_component_list:
Expand Down Expand Up @@ -780,6 +781,7 @@ def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> No
if data_set_knowledge_path:
with open(data_set_knowledge_path, "r") as f:
self.data_set_knowledge_dict = json.load(f)
super().__init__(path)

def get_all_nodes_by_label(self, label: str) -> list[UndirectedNode]:
return self.graph.get_all_nodes_by_label(label)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from pathlib import Path

from rdagent.components.coder.model_coder.conf import MODEL_IMPL_SETTINGS
from rdagent.components.coder.model_coder.CoSTEER.evaluators import ModelCoderFeedback
from rdagent.components.coder.model_coder.model import ModelTask
from rdagent.core.evolving_framework import (
EvolvableSubjects,
EvolvingKnowledgeBase,
EvoStep,
Knowledge,
KnowledgeBase,
QueriedKnowledge,
RAGStrategy,
)
Expand Down Expand Up @@ -49,13 +51,15 @@ def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_s
self.working_task_to_similar_successful_knowledge_dict = dict()


class ModelKnowledgeBase(KnowledgeBase):
def __init__(self) -> None:
class ModelKnowledgeBase(EvolvingKnowledgeBase):
def __init__(self, path: str | Path = None) -> None:
self.implementation_trace: dict[str, ModelKnowledge] = dict()
self.success_task_info_set: set[str] = set()

self.task_to_embedding = dict()

super().__init__(path)

def query(self) -> QueriedKnowledge | None:
"""
Query the knowledge base to get the queried knowledge. So far is handled in RAG strategy.
Expand Down
33 changes: 4 additions & 29 deletions rdagent/components/knowledge_management/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
VectorBase,
cosine,
)
from rdagent.core.knowledge_base import KnowledgeBase
from rdagent.oai.llm_utils import APIBackend

Node = KnowledgeMetaData
Expand Down Expand Up @@ -47,14 +48,14 @@ def __repr__(self) -> str:
)


class Graph:
class Graph(KnowledgeBase):
"""
base Graph class for Knowledge Graph Search
"""

def __init__(self, path: str | Path | None = None) -> None:
self.path = path
self.nodes = {}
super().__init__(path=path)

def size(self) -> int:
return len(self.nodes)
Expand All @@ -77,22 +78,6 @@ def find_node(self, content: str, label: str) -> Node | None:
return node
return None

@classmethod
def load(cls: type[Graph], path: str | Path) -> Graph:
"""use pickle as the default load method"""
path = path if isinstance(path, Path) else Path(path)
if not path.exists():
return cls(path=path)

with path.open("rb") as f:
return pickle.load(f)

def save(self, path: str | Path) -> None:
"""use pickle as the default save method"""
Path.mkdir(path.parent, exist_ok=True)
with path.open("wb") as f:
pickle.dump(self, f)

@staticmethod
def batch_embedding(nodes: list[Node]) -> list[Node]:
contents = [node.content for node in nodes]
Expand All @@ -119,8 +104,8 @@ class UndirectedGraph(Graph):
"""

def __init__(self, path: str | Path | None = None) -> None:
super().__init__(path=path)
self.vector_base: VectorBase = PDVectorBase()
super().__init__(path=path)

def __str__(self) -> str:
return f"UndirectedGraph(nodes={self.nodes})"
Expand Down Expand Up @@ -174,16 +159,6 @@ def add_node(

node.add_neighbor(neighbor)

@classmethod
def load(cls: type[UndirectedGraph], path: str | Path) -> UndirectedGraph:
"""use pickle as the default load method"""
path = path if isinstance(path, Path) else Path(path)
if not path.exists():
return cls(path=path)

with path.open("rb") as f:
return pickle.load(f)

def add_nodes(self, node: UndirectedNode, neighbors: list[UndirectedNode]) -> None:
if not neighbors:
self.add_node(node)
Expand Down
32 changes: 5 additions & 27 deletions rdagent/components/knowledge_management/vector_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas as pd
from scipy.spatial.distance import cosine

from rdagent.core.knowledge_base import KnowledgeBase
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend

Expand Down Expand Up @@ -68,14 +69,11 @@ def contents_to_documents(contents: List[str], label: str = None) -> List[Docume
return docs


class VectorBase:
class VectorBase(KnowledgeBase):
"""
This class is used for handling vector storage and query
"""

def __init__(self, vector_df_path: Union[str, Path] = None, **kwargs):
pass

def add(self, document: Union[Document, List[Document]]):
"""
add new node to vector_df
Expand Down Expand Up @@ -104,28 +102,15 @@ def search(self, content: str, topk_k: int = 5, similarity_threshold: float = 0)
"""
pass

def load(self, **kwargs):
"""load vector_df"""

def save(self, **kwargs):
"""save vector_df"""


class PDVectorBase(VectorBase):
"""
Implement of VectorBase using Pandas
"""

def __init__(self, vector_df_path: Union[str, Path] = None):
super().__init__(vector_df_path)

if vector_df_path:
try:
self.vector_df = self.load(vector_df_path)
except FileNotFoundError:
self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
else:
self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
def __init__(self, path: Union[str, Path] = None):
self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
super().__init__(path)

def shape(self):
return self.vector_df.shape
Expand Down Expand Up @@ -196,10 +181,3 @@ def search(self, content: str, topk_k: int = 5, similarity_threshold: float = 0)
for _, similar_docs in most_similar_docs.iterrows():
docs.append(Document().from_dict(similar_docs.to_dict()))
return docs, searched_similarities.to_list()

def load(self, vector_df_path, **kwargs):
vector_df = pd.read_pickle(vector_df_path)
return vector_df

def save(self, vector_df_path, **kwargs):
self.vector_df.to_pickle(vector_df_path)
2 changes: 2 additions & 0 deletions rdagent/components/workflow/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ class Config:
"""

scen: str = ""
knowledge_base: str = ""
knowledge_base_path: str = ""
hypothesis_gen: str = ""
hypothesis2experiment: str = ""
coder: str = ""
Expand Down
6 changes: 4 additions & 2 deletions rdagent/core/evolving_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

from rdagent.core.knowledge_base import KnowledgeBase

if TYPE_CHECKING:
from rdagent.core.evaluation import Feedback
from rdagent.core.scenario import Scenario
Expand All @@ -18,7 +20,7 @@ class QueriedKnowledge:
pass


class KnowledgeBase(ABC):
class EvolvingKnowledgeBase(KnowledgeBase):
@abstractmethod
def query(
self,
Expand Down Expand Up @@ -78,7 +80,7 @@ def evolve(
class RAGStrategy(ABC):
"""Retrieval Augmentation Generation Strategy"""

def __init__(self, knowledgebase: KnowledgeBase) -> None:
def __init__(self, knowledgebase: EvolvingKnowledgeBase) -> None:
self.knowledgebase = knowledgebase

@abstractmethod
Expand Down
25 changes: 25 additions & 0 deletions rdagent/core/knowledge_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from pathlib import Path

import dill as pickle # type: ignore[import-untyped]

from rdagent.log import rdagent_logger as logger


class KnowledgeBase:
def __init__(self, path: str | Path | None = None) -> None:
self.path = Path(path) if path else None
self.load()

def load(self) -> None:
if self.path is not None and self.path.exists():
with self.path.open("rb") as f:
self.__dict__.update(
pickle.load(f).__dict__,
) # TODO: because we need to align with init function, we need a less hacky way to do this

def dump(self) -> None:
if self.path is not None:
self.path.parent.mkdir(parents=True, exist_ok=True)
pickle.dump(self, self.path.open("wb"))
else:
logger.warning("KnowledgeBase path is not set, dump failed.")
2 changes: 1 addition & 1 deletion rdagent/core/prompts.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from pathlib import Path # noqa: I001
from pathlib import Path

import yaml

Expand Down
7 changes: 5 additions & 2 deletions rdagent/core/proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from rdagent.core.evaluation import Feedback
from rdagent.core.experiment import ASpecificExp, Experiment
from rdagent.core.knowledge_base import KnowledgeBase
from rdagent.core.scenario import Scenario

if TYPE_CHECKING:
Expand Down Expand Up @@ -83,12 +84,14 @@ def __str__(self) -> str:


ASpecificScen = TypeVar("ASpecificScen", bound=Scenario)
ASpecificKB = TypeVar("ASpecificKB", bound=KnowledgeBase)


class Trace(Generic[ASpecificScen]):
def __init__(self, scen: ASpecificScen) -> None:
class Trace(Generic[ASpecificScen, ASpecificKB]):
def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = None) -> None:
self.scen: ASpecificScen = scen
self.hist: list[tuple[Hypothesis, Experiment, HypothesisFeedback]] = []
self.knowledge_base: ASpecificKB | None = knowledge_base

def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experiment | None]:
"""Access the last experiment result, sub-task, and the corresponding hypothesis."""
Expand Down
2 changes: 1 addition & 1 deletion rdagent/oai/llm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def __init__(self, cache_location: str) -> None:
self.cache_location = cache_location
db_file_exist = Path(cache_location).exists()
# TODO: sqlite3 does not support multiprocessing.
self.conn = sqlite3.connect(cache_location)
self.conn = sqlite3.connect(cache_location, timeout=20)
self.c = self.conn.cursor()
if not db_file_exist:
self.c.execute(
Expand Down
Loading