[Tokenizers] Update README.md #799

Merged: 4 commits, Dec 21, 2023

# OpenVINO Tokenizers

OpenVINO Tokenizers adds text processing operations to OpenVINO.

## Features

- Perform tokenization and detokenization without third-party dependencies
- Convert a HuggingFace tokenizer into an OpenVINO tokenizer and detokenizer
- Combine OpenVINO models into a single model
- Add a greedy decoding pipeline to a text generation model

## Installation

(Recommended) Create and activate a virtual environment:
```bash
python3 -m venv venv
source venv/bin/activate
# or
conda create --name openvino_tokenizer
conda activate openvino_tokenizer
```

### Minimal Installation

Use the minimal installation when you already have a converted OpenVINO tokenizer:
```bash
pip install openvino-tokenizers
# or
conda install -c conda-forge openvino openvino-tokenizers
```

### Convert Tokenizers Installation

If you want to convert HuggingFace tokenizers into OpenVINO tokenizers:
```bash
pip install openvino-tokenizers[transformers]
# or
conda install -c conda-forge openvino openvino-tokenizers && pip install transformers[sentencepiece] tiktoken
```

### Build and install from source after [OpenVINO installation](https://docs.openvino.ai/2023.2/openvino_docs_install_guides_overview.html)
```bash
source path/to/installed/openvino/setupvars.sh
git clone https://github.com/openvinotoolkit/openvino_contrib.git
cd openvino_contrib/modules/custom_operations/
pip install -e .[transformers]
```

### Build and install for development
```bash
source path/to/installed/openvino/setupvars.sh
git clone https://github.com/openvinotoolkit/openvino_contrib.git
cd openvino_contrib/modules/custom_operations/
pip install -e .[all]
# verify installation by running tests
cd user_ie_extensions/tokenizer/python/tests/
pytest .
```

## Usage

### Convert HuggingFace tokenizer

OpenVINO Tokenizers ships with a CLI tool that can convert tokenizers from the HuggingFace Hub
or HuggingFace tokenizers saved on disk:

```shell
convert_tokenizer codellama/CodeLlama-7b-hf --with-detokenizer -o output_dir
```
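
The CLI writes OpenVINO IR files that can be loaded back with the runtime. A minimal check, assuming the default `openvino_tokenizer.xml` file name inside `output_dir`:

```python
import openvino_tokenizers  # noqa: F401 -- the import registers the tokenizer operations
from openvino import Core

core = Core()
compiled_tokenizer = core.compile_model("output_dir/openvino_tokenizer.xml", "CPU")  # assumed file name
print(compiled_tokenizer(["def fibonacci(n):"])["input_ids"])
```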

There is also a `convert_tokenizer` function that can convert a tokenizer Python object.

```python
import numpy as np
from transformers import AutoTokenizer
from openvino import compile_model, save_model
from openvino_tokenizers import convert_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
ov_tokenizer = convert_tokenizer(hf_tokenizer)

compiled_tokenizer = compile_model(ov_tokenizer)
text_input = ["Test string"]

hf_output = hf_tokenizer(text_input, return_tensors="np")
ov_output = compiled_tokenizer(text_input)

for output_name in hf_output:
    print(f"OpenVINO {output_name} = {ov_output[output_name]}")
    print(f"HuggingFace {output_name} = {hf_output[output_name]}")
# ...
# OpenVINO token_type_ids = [[0 0 0 0]]
# HuggingFace token_type_ids = [[0 0 0 0]]
# OpenVINO attention_mask = [[1 1 1 1]]
# HuggingFace attention_mask = [[1 1 1 1]]

# save tokenizer for later use
save_model(ov_tokenizer, "openvino_tokenizer.xml")

loaded_tokenizer = compile_model("openvino_tokenizer.xml")
loaded_ov_output = loaded_tokenizer(text_input)
for output_name in hf_output:
    assert np.all(loaded_ov_output[output_name] == ov_output[output_name])
```

### Connect Tokenizer to a Model

To infer and convert the original model, install `torch` or `torch-cpu` into the virtual environment.

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from openvino import compile_model, convert_model
# ...
print(f"HuggingFace logits {hf_output.logits}")
```
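
The middle of this example is collapsed above; a minimal sketch of the full pattern, combining a converted tokenizer and a converted model with the `connect_models` helper (the checkpoint name and the `logits` output name are assumptions for illustration):

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from openvino import compile_model, convert_model
from openvino_tokenizers import connect_models, convert_tokenizer

checkpoint = "mrm8488/bert-tiny-finetuned-sms-spam-detection"  # assumed example checkpoint
hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
hf_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

text_input = ["Free money!!!"]
hf_input = hf_tokenizer(text_input, return_tensors="pt")
hf_output = hf_model(**hf_input)

# convert the tokenizer and the model separately, then fuse them into one model
ov_tokenizer = convert_tokenizer(hf_tokenizer)
ov_model = convert_model(hf_model, example_input=hf_input.data)
combined_model = connect_models(ov_tokenizer, ov_model)

# the combined model maps raw strings directly to logits
compiled_combined = compile_model(combined_model)
ov_output = compiled_combined(text_input)
print(f"OpenVINO logits {ov_output['logits']}")  # "logits" output name is assumed
print(f"HuggingFace logits {hf_output.logits}")
```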

### Use Extension With Converted (De)Tokenizer or Model With (De)Tokenizer

Importing `openvino_tokenizers` adds all tokenizer-related operations to OpenVINO,
after which you can work with saved tokenizers and detokenizers.

```python
import numpy as np
import openvino_tokenizers
from openvino import Core

core = Core()
# ...
print(f"HuggingFace output string: `{hf_output}`")
# HuggingFace output string: `['Quick brown fox was walking through the forest. He was looking for something']`
```
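
As a concrete example, a saved detokenizer can be compiled and run on arbitrary token ids. A minimal sketch, where the file path and the `string_output` result name are assumptions:

```python
import numpy as np
import openvino_tokenizers  # noqa: F401 -- the import registers the tokenizer operations
from openvino import Core

core = Core()
# assumed path to a detokenizer produced by `convert_tokenizer ... --with-detokenizer`
compiled_detokenizer = core.compile_model("output_dir/openvino_detokenizer.xml", "CPU")

token_ids = np.random.randint(100, 1000, size=(3, 5))
ov_output = compiled_detokenizer(token_ids)
print(ov_output["string_output"])  # assumed name of the string output
```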

## Supported Tokenizer Types

| Huggingface <br/>Tokenizer Type | Tokenizer Model Type | Tokenizer | Detokenizer |
|---------------------------------|----------------------|-----------|-------------|
| Fast                            | WordPiece            | ✅         | ❌           |
|                                 | BPE                  | ✅         | ✅           |
|                                 | Unigram              | ❌         | ❌           |
| Legacy                          | SentencePiece .model | ✅         | ✅           |
| Custom                          | tiktoken             | ✅         | ✅           |

## Test Results

This report is autogenerated and includes tokenizers and detokenizers tests. The `Output Matched, %` column shows the percent of test strings for which the results of OpenVINO and HuggingFace Tokenizers are the same. To update the report run `pytest tokenizers_test.py --update_readme` in the `modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory.

### Output Match by Tokenizer Type

<table>
<thead>
<tr>
<th >Tokenizer Type</th>
<th >Output Matched, %</th>
<th >Number of Tests</th>
</tr>
</thead>
<tbody>
<!-- autogenerated rows omitted -->
</tbody>
</table>

### Output Match by Model

<table>
<thead>
<tr>
<th >Tokenizer Type</th>
<th >Model</th>
<th >Output Matched, %</th>
<th >Number of Tests</th>
</tr>
</thead>
<tbody>
<!-- autogenerated rows omitted -->
</tbody>
</table>
@@ -64,7 +64,7 @@ def get_parser() -> ArgumentParser:
"Pass `use_fast=False` to `AutoTokenizer.from_pretrained`. It will initialize legacy HuggingFace "
"tokenizer and then converts it to OpenVINO. Might result in slightly different tokenizer. "
"See models with _slow suffix https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/"
"custom_operations/user_ie_extensions/tokenizer/python#coverage-by-model-type to check the potential "
"custom_operations/user_ie_extensions/tokenizer/python#output-match-by-model to check the potential "
"difference between original and OpenVINO tokenizers"
),
)
@@ -96,6 +96,7 @@ def get_parser() -> ArgumentParser:
    parser.add_argument(
        "--streaming-detokenizer",
        required=False,
        action="store_true",
        help=(
            "[Experimental] Modify SentencePiece-based detokenizer to keep the leading space. "
            "Can be used to stream a model output without the TextStreamer buffer"
@@ -105,8 +106,14 @@


def convert_hf_tokenizer() -> None:
    try:
        from transformers import AutoTokenizer
    except (ImportError, ModuleNotFoundError):
        raise EnvironmentError(
            "No transformers library in the environment. Install required dependencies with one of two options:\n"
            "1. pip install openvino-tokenizers[transformers]\n"
            "2. pip install transformers[sentencepiece] tiktoken\n"
        )

    args = get_parser().parse_args()

@@ -61,6 +61,12 @@ def convert_tokenizer(
            with_detokenizer=with_detokenizer,
            skip_special_tokens=skip_special_tokens,
        )
    else:
        raise EnvironmentError(
            "No transformers library in the environment. Install required dependencies with one of two options:\n"
            "1. pip install openvino-tokenizers[transformers]\n"
            "2. pip install transformers[sentencepiece] tiktoken\n"
        )

    if ov_tokenizers is None:
        raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}")
@@ -42,30 +42,33 @@ def add_tokenizer_type(row):

results_df = results_df[["Tokenizer Type", "Model", "test_string", "status"]]
grouped_by_model = results_df.groupby(["Tokenizer Type", "Model"]).agg({"status": ["mean", "count"]}).reset_index()
grouped_by_model.columns = ["Tokenizer Type", "Model", "Output Matched, %", "Number of Tests"]
grouped_by_model["Output Matched, %"] *= 100
grouped_by_type = results_df.groupby(["Tokenizer Type"]).agg({"status": ["mean", "count"]}).reset_index()
grouped_by_type.columns = ["Tokenizer Type", "Output Matched, %", "Number of Tests"]
grouped_by_type["Output Matched, %"] *= 100

readme_path = Path("../README.md")
with open(readme_path) as f:
    old_readme = f.read().split("## Test Results")[0]

new_readme = StringIO()
new_readme.write(old_readme)
new_readme.write(
    "## Test Results\n\n"
    "This report is autogenerated and includes tokenizers and detokenizers tests. "
    "The `Output Matched, %` column shows the percent of test strings "
    "for which the results of OpenVINO and HuggingFace Tokenizers are the same. "
    "To update the report run `pytest tokenizers_test.py --update_readme` in "
    "the `modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory.\n\n"
    "### Output Match by Tokenizer Type\n\n"
)
is_pandas_2 = tuple(map(int, version("pandas").split("."))) >= (2, 0, 0)
if is_pandas_2:
    grouped_by_type.style.format(precision=2).hide(axis="index").to_html(new_readme, exclude_styles=True)
else:
    grouped_by_type.style.format(precision=2).hide_index().to_html(new_readme, exclude_styles=True)
new_readme.write("\n### Output Match by Model\n\n")
if is_pandas_2:
    grouped_by_model.style.format(precision=2).hide(axis="index").to_html(new_readme, exclude_styles=True)
else:
    grouped_by_model.style.format(precision=2).hide_index().to_html(new_readme, exclude_styles=True)