BPE: fixed typo (#492)
* fixed typo

* use rel path if exists

* mod gitignore and use existing vocab files

---------

Co-authored-by: rasbt <[email protected]>
d-kleine and rasbt authored Jan 21, 2025
1 parent 0d4967e commit 60acb94
Showing 2 changed files with 33 additions and 16 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -101,6 +101,13 @@ ch07/02_dataset-utilities/instruction-examples-modified.json
 ch07/04_preference-tuning-with-dpo/gpt2-medium355M-sft.pth
 ch07/04_preference-tuning-with-dpo/loss-plot.pdf
 
+# Tokenizer files
+ch02/05_bpe-from-scratch/bpe_merges.txt
+ch02/05_bpe-from-scratch/encoder.json
+ch02/05_bpe-from-scratch/vocab.bpe
+ch02/05_bpe-from-scratch/vocab.json
+
+
 # Other
 ch0?/0?_user_interface/.chainlit/
 ch0?/0?_user_interface/chainlit.md
42 changes: 26 additions & 16 deletions ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
@@ -722,14 +722,14 @@
     "import os\n",
     "import urllib.request\n",
     "\n",
-    "if not os.path.exists(\"the-verdict.txt\"):\n",
+    "if not os.path.exists(\"../01_main-chapter-code/the-verdict.txt\"):\n",
     "    url = (\"https://raw.githubusercontent.com/rasbt/\"\n",
     "           \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
     "           \"the-verdict.txt\")\n",
-    "    file_path = \"the-verdict.txt\"\n",
+    "    file_path = \"../01_main-chapter-code/the-verdict.txt\"\n",
     "    urllib.request.urlretrieve(url, file_path)\n",
     "\n",
-    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
+    "with open(\"../01_main-chapter-code/the-verdict.txt\", \"r\", encoding=\"utf-8\") as f: # added ../01_main-chapter-code/\n",
     "    text = f.read()"
    ]
   },
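De-escaped from the notebook JSON, the updated cell amounts to the following (a sketch, lightly refactored so the path literal is named once; the relative path assumes the notebook runs from inside ch02/05_bpe-from-scratch/):

import os
import urllib.request

# Reuse the copy that ships with the main chapter code instead of
# downloading a duplicate into this bonus folder
file_path = "../01_main-chapter-code/the-verdict.txt"

if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()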
@@ -876,7 +876,7 @@
    "id": "252693ee-e806-4dac-ab76-2c69086360f4",
    "metadata": {},
    "source": [
-    "- Note that the vocabulary itself is used in the `decoder()` method, which allows us to map the token IDs back into text:"
+    "- Note that the vocabulary itself is used in the `decode()` method, which allows us to map the token IDs back into text:"
    ]
   },
   {
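The corrected sentence refers to the notebook's `decode()` method. For context, a minimal round trip (the tokenizer instance name is assumed from the surrounding cells; this snippet is not part of the diff):

ids = tokenizer.encode("Some sample text.")  # text -> token IDs
text = tokenizer.decode(ids)                 # token IDs -> text, via the vocabulary
print(text)                                  # "Some sample text."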
@@ -1099,24 +1099,34 @@
     "import os\n",
     "import urllib.request\n",
     "\n",
-    "def download_file_if_absent(url, filename):\n",
-    "    if not os.path.exists(filename):\n",
-    "        try:\n",
-    "            with urllib.request.urlopen(url) as response, open(filename, 'wb') as out_file:\n",
-    "                out_file.write(response.read())\n",
-    "            print(f\"Downloaded {filename}\")\n",
-    "        except Exception as e:\n",
-    "            print(f\"Failed to download {filename}. Error: {e}\")\n",
-    "    else:\n",
-    "        print(f\"{filename} already exists\")\n",
+    "def download_file_if_absent(url, filename, search_dirs):\n",
+    "    for directory in search_dirs:\n",
+    "        file_path = os.path.join(directory, filename)\n",
+    "        if os.path.exists(file_path):\n",
+    "            print(f\"{filename} already exists in {file_path}\")\n",
+    "            return file_path\n",
+    "\n",
+    "    target_path = os.path.join(search_dirs[0], filename)\n",
+    "    try:\n",
+    "        with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n",
+    "            out_file.write(response.read())\n",
+    "        print(f\"Downloaded {filename} to {target_path}\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"Failed to download {filename}. Error: {e}\")\n",
+    "    return target_path\n",
     "\n",
+    "# Define the directories to search and the files to download\n",
+    "search_directories = [\".\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
+    "\n",
     "files_to_download = {\n",
     "    \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n",
     "    \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json\": \"encoder.json\"\n",
     "}\n",
     "\n",
+    "# Ensure directories exist and download files if needed\n",
+    "paths = {}\n",
     "for url, filename in files_to_download.items():\n",
-    "    download_file_if_absent(url, filename)"
+    "    paths[filename] = download_file_if_absent(url, filename, search_directories)"
    ]
   },
   {
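Read as plain Python, the reworked helper first looks for an existing copy in each of the search_dirs and only downloads, into the first listed directory, when none is found; either way it returns the resolved path. A self-contained sketch mirroring the cell above:

import os
import urllib.request

def download_file_if_absent(url, filename, search_dirs):
    # Reuse the first existing copy found in any of the search directories
    for directory in search_dirs:
        file_path = os.path.join(directory, filename)
        if os.path.exists(file_path):
            print(f"{filename} already exists in {file_path}")
            return file_path

    # Otherwise, download into the first (preferred) directory
    target_path = os.path.join(search_dirs[0], filename)
    try:
        with urllib.request.urlopen(url) as response, open(target_path, "wb") as out_file:
            out_file.write(response.read())
        print(f"Downloaded {filename} to {target_path}")
    except Exception as e:
        print(f"Failed to download {filename}. Error: {e}")
    return target_path

# Example: prefer the copy bundled with the chapter 2 bonus materials, if present
vocab_path = download_file_if_absent(
    "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe",
    "vocab.bpe",
    [".", "../02_bonus_bytepair-encoder/gpt2_model/"],
)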
@@ -1136,7 +1146,7 @@
    "source": [
     "tokenizer_gpt2 = BPETokenizerSimple()\n",
     "tokenizer_gpt2.load_vocab_and_merges_from_openai(\n",
-    "    vocab_path=\"encoder.json\", bpe_merges_path=\"vocab.bpe\"\n",
+    "    vocab_path=paths[\"encoder.json\"], bpe_merges_path=paths[\"vocab.bpe\"]\n",
     ")"
    ]
   },
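Because the call now uses whichever paths the helper resolved, it works whether the vocab files sit in this folder or were reused from ../02_bonus_bytepair-encoder/gpt2_model/. A quick sanity check (assuming the encode()/decode() methods defined earlier in the notebook):

text = "Hello, world"
ids = tokenizer_gpt2.encode(text)
print(ids)                         # [15496, 11, 995] with the original GPT-2 vocabulary
print(tokenizer_gpt2.decode(ids))  # "Hello, world"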
