WordPiece vocabularies trainer on Wikipedia dataset #316

Merged · 4 commits · Sep 1, 2022
`tools/pretrained_tokenizers/README.md` (29 additions, 0 deletions)
# Training WordPiece Vocabularies on Wikipedia

This is unmaintained helper code for training WordPiece vocabularies on Wikipedia dumps.
It is advised to run these scripts on GCS.

Note: use either `screen` or `tmux` when running these commands remotely to
avoid killing long-running scripts.

## Instructions
The steps are listed below. Run steps 1 and 2 for every Wikipedia dump you want
to train on, in order to download and extract the data.

Next, update the `clean_folders` list in the cleaning script so that it matches
the names of the downloaded data folders.

Finally, update the `directories` list in the training script and run it to
train a vocabulary for each directory in the list.

### 1. Download Wikipedia Dataset from Wikipedia Dumps
Example: `curl -O https://dumps.wikimedia.org/ptwiki/20220801/ptwiki-20220801-pages-articles-multistream.xml.bz2`
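
If you want to fetch several dumps in one go, a small loop like the sketch below
works; the dump date and language codes are placeholders, so substitute whichever
dumps you actually need.

```python
import urllib.request

# Placeholder dump date and language codes; adjust to the dumps you need.
dump_date = "20220801"
languages = ["pt", "ar", "bn"]

for lang in languages:
    name = f"{lang}wiki-{dump_date}-pages-articles-multistream.xml.bz2"
    url = f"https://dumps.wikimedia.org/{lang}wiki/{dump_date}/{name}"
    print("Downloading", url)
    urllib.request.urlretrieve(url, name)
```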

### 2. Run Wikipedia Dataset Extractor
Example: `python3 -m wikiextractor.WikiExtractor arwiki-20220802-pages-articles-multistream.xml.bz2`
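
Note: recent versions of `wikiextractor` write the extracted articles into a
`text/` directory split into subfolders (`AA`, `AB`, ...). If your output lands
there, rename or move it so the folder name matches the corresponding entry in
the cleaning script's `clean_folders` list (for example `arwiki`) before
running step 3.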

### 3. Additional Removals
`python3 word_piece_cleaning_script.py`

### 4. Run vocabulary training
`python3 word_piece_training_script.py`
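
Once a vocabulary file has been produced, it can be sanity-checked by loading it
into a `keras_nlp.tokenizers.WordPieceTokenizer`. The file name below is only an
example of the naming pattern used by the training script; it uses the same
casing and accent settings as training.

```python
import keras_nlp

# Example output file name from step 4; adjust to the vocabulary you trained.
vocab_file = "ptwiki_parsed_20000_v1.txt"

tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_file,
    lowercase=False,
    strip_accents=False,
)
# Prints token ids under the trained vocabulary.
print(tokenizer("Olá, mundo!"))
```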
`tools/pretrained_tokenizers/word_piece_cleaning_script.py` (24 additions, 0 deletions)
import os

import tqdm

# Folders of wikiextractor output to clean; a "<name>_parsed" copy of each is
# written with the <doc ...> / </doc> wrapper lines removed.
clean_folders = ["bnwiki", "arwiki", "ruwiki", "ptwiki", "idwiki"]

for clean_folder in clean_folders:
    output_folder = clean_folder + "_parsed"
    os.mkdir(output_folder)
    for folder in tqdm.tqdm(os.listdir(clean_folder)):
        path = os.path.join(clean_folder, folder)
        os.mkdir(os.path.join(output_folder, folder))
        for file in os.listdir(path):
            # Keep every line except the <doc ...> and </doc> markers that
            # wikiextractor wraps around each article.
            article = []
            with open(os.path.join(path, file)) as f:
                for line in f:
                    if line.startswith("</doc>") or line.startswith("<doc"):
                        continue
                    article.append(line)
            with open(os.path.join(output_folder, folder, file), "w+") as f:
                # Lines already end with "\n", so write them back unchanged.
                for line in article:
                    f.write(line)
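
A quick way to confirm the cleaning pass produced output is to count the files
it wrote. The folder name below is just an example corresponding to one of the
entries in `clean_folders`.

```python
import os

parsed_dir = "ptwiki_parsed"  # example output folder from the cleaning script
n_files = sum(len(files) for _, _, files in os.walk(parsed_dir))
print(f"{parsed_dir}: {n_files} cleaned files")
```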
`tools/pretrained_tokenizers/word_piece_training_script.py` (37 additions, 0 deletions)
import os
import time

import keras_nlp

# Directories of parsed Wikipedia articles and the vocabulary sizes to train.
directories = [
    "eswiki_parsed", "frwiki_parsed", "hiwiki_parsed",
    "arwiki_parsed", "ruwiki_parsed", "bnwiki_parsed",
    "idwiki_parsed", "ptwiki_parsed",
]
vocab_sizes = [20000, 50000]
identifier = "v1"

# Train one vocabulary per (directory, vocab size) pair.
for directory in directories:
    for vocab_size in vocab_sizes:
        print(f"Running directory {directory} with vocab size {vocab_size}")
        # Collect every article shard, skipping hidden files.
        files = []
        for folder in os.listdir(directory):
            path = os.path.join(directory, folder)
            for file in os.listdir(path):
                if file[0] != ".":
                    files.append(os.path.join(path, file))

        output_file = f"{directory}_{vocab_size}_{identifier}.txt"
        if os.path.exists(output_file):
            raise ValueError("already done.")

        start = time.time()
        keras_nlp.tokenizers.compute_word_piece_vocabulary(
            files,
            vocabulary_size=vocab_size,
            lowercase=False,
            strip_accents=False,
            vocabulary_output_file=output_file,
        )
        end = time.time()
        print("Time taken:", end - start)