Skip to content

Commit

Permalink
Adding custom dataset file (#659)
Browse files Browse the repository at this point in the history
  • Loading branch information
goswamig authored Sep 3, 2024
1 parent 778e31e commit ccd4741
Showing 1 changed file with 37 additions and 0 deletions.
37 changes: 37 additions & 0 deletions src/llama_recipes/datasets/custom_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import importlib
from pathlib import Path

def load_module_from_py_file(py_file: str) -> object:
    """
    Load and execute a Python source file that is not on the Python path.

    The module is named after the file's basename (including the ``.py``
    suffix) and is NOT registered in ``sys.modules``; the caller receives the
    only reference to it.

    Args:
        py_file: Path to the Python source file to load.

    Returns:
        The module object obtained by executing ``py_file``.

    Raises:
        Any exception raised while reading or executing the file is
        propagated unchanged.
    """
    # A bare ``import importlib`` does not guarantee that the ``machinery``
    # and ``util`` submodules are bound as attributes of the package, so
    # import them explicitly instead of relying on some other module having
    # loaded them first.
    import importlib.machinery
    import importlib.util

    module_name = Path(py_file).name
    loader = importlib.machinery.SourceFileLoader(module_name, py_file)
    spec = importlib.util.spec_from_loader(module_name, loader)
    module = importlib.util.module_from_spec(spec)

    # Run the file's top-level code inside the freshly created module.
    loader.exec_module(module)

    return module


def get_custom_dataset(dataset_config, tokenizer, split: str):
    """
    Build a dataset by calling a user-supplied function from a ``.py`` file.

    ``dataset_config.file`` is either ``"path/to/file.py"`` (the function
    ``get_custom_dataset`` inside that file is used) or
    ``"path/to/file.py:function_name"``.

    Args:
        dataset_config: Configuration object; its ``file`` attribute selects
            the dataset file (and optionally the function). It is also passed
            through to the dataset function.
        tokenizer: Passed through unchanged to the dataset function.
        split: Passed through unchanged to the dataset function.

    Returns:
        Whatever the selected dataset function returns.

    Raises:
        ValueError: If the resolved file path does not end with ``.py``.
        FileNotFoundError: If the resolved path is not an existing file.
        AttributeError: If the loaded module has no attribute named after the
            requested function.
    """
    file_spec = dataset_config.file

    # Split on the LAST colon only, and only when what precedes it is a .py
    # path. This keeps paths that themselves contain colons (e.g. Windows
    # drive letters such as "C:/data/ds.py") from being mistaken for a
    # "path:function" pair.
    head, sep, tail = file_spec.rpartition(":")
    if sep and head.endswith(".py"):
        module_path, func_name = head, tail
    else:
        module_path, func_name = file_spec, "get_custom_dataset"

    if not module_path.endswith(".py"):
        raise ValueError(f"Dataset file {module_path} is not a .py file.")

    module_path = Path(module_path)
    if not module_path.is_file():
        raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")

    module = load_module_from_py_file(module_path.as_posix())

    # Guard only the attribute lookup: an AttributeError raised *inside* the
    # user's dataset function must not be reported as a missing function.
    try:
        dataset_fn = getattr(module, func_name)
    except AttributeError:
        print(f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).")
        raise

    return dataset_fn(dataset_config, tokenizer, split)

0 comments on commit ccd4741

Please sign in to comment.