BPE: fixed typo (#492)
* fixed typo

* use rel path if exists

* mod gitignore and use existing vocab files

---------

Co-authored-by: rasbt <[email protected]>
d-kleine and rasbt authored Jan 21, 2025
1 parent 0d4967e commit 60acb94
Showing 2 changed files with 33 additions and 16 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -101,6 +101,13 @@ ch07/02_dataset-utilities/instruction-examples-modified.json
 ch07/04_preference-tuning-with-dpo/gpt2-medium355M-sft.pth
 ch07/04_preference-tuning-with-dpo/loss-plot.pdf
 
+# Tokenizer files
+ch02/05_bpe-from-scratch/bpe_merges.txt
+ch02/05_bpe-from-scratch/encoder.json
+ch02/05_bpe-from-scratch/vocab.bpe
+ch02/05_bpe-from-scratch/vocab.json
+
+
 # Other
 ch0?/0?_user_interface/.chainlit/
 ch0?/0?_user_interface/chainlit.md
42 changes: 26 additions & 16 deletions ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
@@ -722,14 +722,14 @@
     "import os\n",
     "import urllib.request\n",
     "\n",
-    "if not os.path.exists(\"the-verdict.txt\"):\n",
+    "if not os.path.exists(\"../01_main-chapter-code/the-verdict.txt\"):\n",
     "    url = (\"https://raw.githubusercontent.com/rasbt/\"\n",
     "           \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
     "           \"the-verdict.txt\")\n",
-    "    file_path = \"the-verdict.txt\"\n",
+    "    file_path = \"../01_main-chapter-code/the-verdict.txt\"\n",
     "    urllib.request.urlretrieve(url, file_path)\n",
     "\n",
-    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
+    "with open(\"../01_main-chapter-code/the-verdict.txt\", \"r\", encoding=\"utf-8\") as f: # added ../01_main-chapter-code/\n",
     "    text = f.read()"
    ]
   },
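De-escaped from the notebook JSON, the updated cell amounts to the following (a sketch, lightly refactored so the path literal is named once; the relative path assumes the notebook runs from inside ch02/05_bpe-from-scratch/):

import os
import urllib.request

# Reuse the copy that ships with the main chapter code instead of
# downloading a duplicate into this bonus folder
file_path = "../01_main-chapter-code/the-verdict.txt"

if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()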
@@ -876,7 +876,7 @@
    "id": "252693ee-e806-4dac-ab76-2c69086360f4",
    "metadata": {},
    "source": [
-    "- Note that the vocabulary itself is used in the `decoder()` method, which allows us to map the token IDs back into text:"
+    "- Note that the vocabulary itself is used in the `decode()` method, which allows us to map the token IDs back into text:"
    ]
   },
   {
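The corrected sentence refers to the notebook's `decode()` method. For context, a minimal round trip (the tokenizer instance name is assumed from the surrounding cells; this snippet is not part of the diff):

ids = tokenizer.encode("Some sample text.")  # text -> token IDs
text = tokenizer.decode(ids)                 # token IDs -> text, via the vocabulary
print(text)                                  # "Some sample text."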
@@ -1099,24 +1099,34 @@
     "import os\n",
     "import urllib.request\n",
     "\n",
-    "def download_file_if_absent(url, filename):\n",
-    "    if not os.path.exists(filename):\n",
-    "        try:\n",
-    "            with urllib.request.urlopen(url) as response, open(filename, 'wb') as out_file:\n",
-    "                out_file.write(response.read())\n",
-    "            print(f\"Downloaded {filename}\")\n",
-    "        except Exception as e:\n",
-    "            print(f\"Failed to download {filename}. Error: {e}\")\n",
-    "    else:\n",
-    "        print(f\"{filename} already exists\")\n",
+    "def download_file_if_absent(url, filename, search_dirs):\n",
+    "    for directory in search_dirs:\n",
+    "        file_path = os.path.join(directory, filename)\n",
+    "        if os.path.exists(file_path):\n",
+    "            print(f\"{filename} already exists in {file_path}\")\n",
+    "            return file_path\n",
+    "\n",
+    "    target_path = os.path.join(search_dirs[0], filename)\n",
+    "    try:\n",
+    "        with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n",
+    "            out_file.write(response.read())\n",
+    "        print(f\"Downloaded {filename} to {target_path}\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"Failed to download {filename}. Error: {e}\")\n",
+    "    return target_path\n",
     "\n",
+    "# Define the directories to search and the files to download\n",
+    "search_directories = [\".\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
+    "\n",
     "files_to_download = {\n",
     "    \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n",
     "    \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json\": \"encoder.json\"\n",
     "}\n",
     "\n",
+    "# Ensure directories exist and download files if needed\n",
+    "paths = {}\n",
     "for url, filename in files_to_download.items():\n",
-    "    download_file_if_absent(url, filename)"
+    "    paths[filename] = download_file_if_absent(url, filename, search_directories)"
    ]
   },
   {
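Read as plain Python, the reworked helper first looks for an existing copy in each of the search_dirs and only downloads, into the first listed directory, when none is found; either way it returns the resolved path. A self-contained sketch mirroring the cell above:

import os
import urllib.request

def download_file_if_absent(url, filename, search_dirs):
    # Reuse the first existing copy found in any of the search directories
    for directory in search_dirs:
        file_path = os.path.join(directory, filename)
        if os.path.exists(file_path):
            print(f"{filename} already exists in {file_path}")
            return file_path

    # Otherwise, download into the first (preferred) directory
    target_path = os.path.join(search_dirs[0], filename)
    try:
        with urllib.request.urlopen(url) as response, open(target_path, "wb") as out_file:
            out_file.write(response.read())
        print(f"Downloaded {filename} to {target_path}")
    except Exception as e:
        print(f"Failed to download {filename}. Error: {e}")
    return target_path

# Example: prefer the copy bundled with the chapter 2 bonus materials, if present
vocab_path = download_file_if_absent(
    "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe",
    "vocab.bpe",
    [".", "../02_bonus_bytepair-encoder/gpt2_model/"],
)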
@@ -1136,7 +1146,7 @@
    "source": [
     "tokenizer_gpt2 = BPETokenizerSimple()\n",
     "tokenizer_gpt2.load_vocab_and_merges_from_openai(\n",
-    "    vocab_path=\"encoder.json\", bpe_merges_path=\"vocab.bpe\"\n",
+    "    vocab_path=paths[\"encoder.json\"], bpe_merges_path=paths[\"vocab.bpe\"]\n",
     ")"
    ]
   },
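Because the call now uses whichever paths the helper resolved, it works whether the vocab files sit in this folder or were reused from ../02_bonus_bytepair-encoder/gpt2_model/. A quick sanity check (assuming the encode()/decode() methods defined earlier in the notebook):

text = "Hello, world"
ids = tokenizer_gpt2.encode(text)
print(ids)                         # [15496, 11, 995] with the original GPT-2 vocabulary
print(tokenizer_gpt2.decode(ids))  # "Hello, world"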
