Removed the redundant data-download argument: the BookCorpus dataset only has a train split

SebastianScherer88 · Feb 4, 2025 · 56a6ec5 · 56a6ec5
1 parent ba4ca40
commit 56a6ec5
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 5 deletions.
diff --git a/sdk/bettmensch_ai/pipelines/component/examples/gpt_1/gpt_1.py b/sdk/bettmensch_ai/pipelines/component/examples/gpt_1/gpt_1.py
@@ -11,13 +11,12 @@
 
 
 def get_source_data_split(
-    data_split: InputParameter = "train",  # "train" / "validation"
     data_out: OutputArtifact = None,
 ):
     from datasets import load_dataset
 
     data = load_dataset(
-        "bookcorpus/bookcorpus", split=data_split, trust_remote_code=True
+        "bookcorpus/bookcorpus", split="train", trust_remote_code=True
     )
     data.save_to_disk(data_out.path)
 

diff --git a/sdk/bettmensch_ai/pipelines/component/examples/gpt_1/test_train.ipynb b/sdk/bettmensch_ai/pipelines/component/examples/gpt_1/test_train.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -16,7 +16,7 @@
     }
    ],
    "source": [
-    "from bettmensch_ai.pipelines import OutputArtifact, InputArtifact, InputParameter\n",
+    "from bettmensch_ai.pipelines import OutputArtifact, InputArtifact\n",
     "\n",
     "from gpt_1 import get_source_data_split, get_tokenized_data_split_and_tokenizer, pretrain_and_checkpoint"
    ]
@@ -31,7 +31,6 @@
     "\n",
     "if get_source_data:\n",
     "    get_source_data_split(\n",
-    "        data_split=\"train\",\n",
     "        data_out=OutputArtifact(name=\"source_data_train\")\n",
     "    )"
    ]