From 631c978d08132de3fda04529334f56c1e86e8b5e Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 21 Dec 2023 18:42:48 +0000 Subject: [PATCH 1/4] Fix --streaming-detokenizer flag --- .../tokenizer/python/openvino_tokenizers/cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py index 03365eae2..27197939d 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py @@ -96,6 +96,7 @@ def get_parser() -> ArgumentParser: parser.add_argument( "--streaming-detokenizer", required=False, + action="store_true", help=( "[Experimental] Modify SentencePiece based detokenizer to keep spaces leading space. " "Can be used to stream a model output without TextStreamer buffer" From 51fd9e3523aabceb66e4de5bd49b0a64872a1c35 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 21 Dec 2023 20:32:04 +0000 Subject: [PATCH 2/4] Rewrite README.md --- .../tokenizer/python/README.md | 108 ++++++++++++++---- .../python/openvino_tokenizers/cli.py | 12 +- .../openvino_tokenizers/convert_tokenizer.py | 6 + .../tokenizer/python/tests/conftest.py | 21 ++-- 4 files changed, 113 insertions(+), 34 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md index 925c049cc..29dde3d00 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md @@ -1,46 +1,88 @@ # OpenVINO Tokenizers +OpenVINO Tokenizers adds text processing operations to OpenVINO. 
+ ## Features -- Convert a HuggingFace tokenizer into OpenVINO model tokenizer and detokenizer: - - Fast tokenizers based on Wordpiece and BPE models - - Slow tokenizers based on SentencePiece model file +- Perform tokenization and detokenization without third-party dependencies +- Convert a HuggingFace tokenizer into OpenVINO model tokenizer and detokenizer - Combine OpenVINO models into a single model - Add greedy decoding pipeline to text generation model ## Installation -1. Install [OpenVINO Runtime for C++](https://docs.openvino.ai/latest/openvino_docs_install_guides_install_dev_tools.html#for-c-developers). -2. (Recommended) Create and activate virtual env: +(Recommended) Create and activate virtual env: ```bash python3 -m venv venv source venv/bin/activate + # or +conda create --name openvino_tokenizer +conda activate openvino_tokenizer ``` -3. Go to `modules/custom_operations` and run: + +### Minimal Installation + +Use minimal installation when you have a converted OpenVINO tokenizer: +```bash +pip install openvino-tokenizers + # or +conda install -c conda-forge openvino openvino-tokenizers +``` + +### Convert Tokenizers Installation + +If you want to convert HuggingFace tokenizers into OpenVINO tokenizers: +```bash +pip install openvino-tokenizers[transformers] + # or +conda install -c conda-forge openvino openvino-tokenizers && pip install transformers[sentencepiece] tiktoken +``` + +3. Build and install from source after [OpenVINO installation](): +```bash +source path/to/installed/openvino/setupvars.sh +git clone https://github.com/openvinotoolkit/openvino_contrib.git +cd openvino_contrib/modules/custom_operations/ +pip install -e .[transformers] +``` + +4. Build and install for development: ```bash -# to use converted tokenizers or models combined with tokenizers -pip install . 
-# to convert tokenizers from transformers library -pip install .[transformers] -# for development and testing the library +source path/to/installed/openvino/setupvars.sh +git clone https://github.com/openvinotoolkit/openvino_contrib.git +cd openvino_contrib/modules/custom_operations/ pip install -e .[all] +# verify installation by running tests +cd user_ie_extensions/tokenizer/python/tests/ +pytest . ``` + ### Convert HuggingFace tokenizer +OpenVINO Tokenizers ships with CLI tool that can convert tokenizers from Huggingface Hub +or Huggingface tokenizers saved on disk: + +```shell +convert_tokenizer codellama/CodeLlama-7b-hf --with-detokenizer -o output_dir +``` + +There is also `convert_tokenizer` function that can convert tokenizer python object. + ```python +import numpy as np from transformers import AutoTokenizer -from openvino import compile_model +from openvino import compile_model, save_model from openvino_tokenizers import convert_tokenizer hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") ov_tokenizer = convert_tokenizer(hf_tokenizer) compiled_tokenzier = compile_model(ov_tokenizer) -text_input = "Test string" +text_input = ["Test string"] -hf_output = hf_tokenizer([text_input], return_tensors="np") -ov_output = compiled_tokenzier([text_input]) +hf_output = hf_tokenizer(text_input, return_tensors="np") +ov_output = compiled_tokenzier(text_input) for output_name in hf_output: print(f"OpenVINO {output_name} = {ov_output[output_name]}") @@ -51,10 +93,20 @@ for output_name in hf_output: # HuggingFace token_type_ids = [[0 0 0 0]] # OpenVINO attention_mask = [[1 1 1 1]] # HuggingFace attention_mask = [[1 1 1 1]] + +# save tokenizer for later use +save_model(ov_tokenizer, "openvino_tokenizer.xml") + +loaded_tokenizer = compile_model("openvino_tokenizer.xml") +loaded_ov_output = loaded_tokenizer(text_input) +for output_name in hf_output: + assert np.all(loaded_ov_output[output_name] == ov_output[output_name]) ``` ### Connect Tokenizer to a Model 
+To infer and convert the original model, install torch or torch-cpu in the virtual environment. + ```python from transformers import AutoTokenizer, AutoModelForSequenceClassification from openvino import compile_model, convert_model @@ -83,10 +135,12 @@ print(f"HuggingFace logits {hf_output.logits}") ### Use Extension With Converted (De)Tokenizer or Model With (De)Tokenizer -To work with converted tokenizer and detokenizer, numpy string tensors are used. +Importing `openvino_tokenizers` adds all tokenizer-related operations to OpenVINO, +after which you can work with saved tokenizers and detokenizers. ```python import numpy as np +import openvino_tokenizers from openvino import Core core = Core() @@ -160,17 +214,27 @@ print(f"HuggingFace output string: `{hf_output}`") # HuggingFace output string: `['Quick brown fox was walking through the forest. He was looking for something']` ``` -## Test Coverage +## Supported Tokenizer Types + +| Huggingface
Tokenizer Type | Tokenizer Model Type | Tokenizer | Detokenizer | +|---------------------------------|----------------------|-----------|-------------| +| Fast | WordPiece | ✔ | ✘ | +| | BPE | ✔ | ✔ | +| | Unigram | ✘ | ✘ | +| Legacy | SentencePiece .model | ✔ | ✔ | +| Custom | tiktoken | ✔ | ✔ | + +## Test Results -This report is autogenerated and includes tokenizers and detokenizers tests. To update it run pytest with `--update_readme` flag. +This report is autogenerated and includes tokenizers and detokenizers tests. The `Output Matched, %` column shows the percent of test strings for which the results of OpenVINO and Hugingface Tokenizers are the same.To update the report run `pytest tokenizers_test.py --update_readme` in `modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory. -### Coverage by Tokenizer Type +### Output Match by Tokenizer Type - + @@ -198,14 +262,14 @@ This report is autogenerated and includes tokenizers and detokenizers tests. To
Tokenizer TypePass Rate, %Output Matched, % Number of Tests
-### Coverage by Model Type +### Output Match by Model - + diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py index 27197939d..d55799712 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/cli.py @@ -64,7 +64,7 @@ def get_parser() -> ArgumentParser: "Pass `use_fast=False` to `AutoTokenizer.from_pretrained`. It will initialize legacy HuggingFace " "tokenizer and then converts it to OpenVINO. Might result in slightly different tokenizer. " "See models with _slow suffix https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/" - "custom_operations/user_ie_extensions/tokenizer/python#coverage-by-model-type to check the potential " + "custom_operations/user_ie_extensions/tokenizer/python#output-match-by-model to check the potential " "difference between original and OpenVINO tokenizers" ), ) @@ -106,8 +106,14 @@ def get_parser() -> ArgumentParser: def convert_hf_tokenizer() -> None: - from transformers import AutoTokenizer - + try: + from transformers import AutoTokenizer + except (ImportError, ModuleNotFoundError): + raise EnvironmentError( + "No transformers library in the environment. Install required dependencies with one of two options:\n" + "1. pip install openvino-tokenizers[transformers]\n" + "2. 
pip install transformers[sentencepiece] tiktoken\n" + ) args = get_parser().parse_args() diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py index 2cb0d7750..35a6f05bb 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/openvino_tokenizers/convert_tokenizer.py @@ -61,6 +61,12 @@ def convert_tokenizer( with_detokenizer=with_detokenizer, skip_special_tokens=skip_special_tokens, ) + else: + raise EnvironmentError( + "No transformers library in the environment. Install required dependencies with one of two options:\n" + "1. pip install openvino-tokenizers[transformers]\n" + "2. pip install transformers[sentencepiece] tiktoken\n" + ) if ov_tokenizers is None: raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}") diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py index 47402ee29..fff64d451 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/conftest.py @@ -42,30 +42,33 @@ def add_tokenizer_type(row): results_df = results_df[["Tokenizer Type", "Model", "test_string", "status"]] grouped_by_model = results_df.groupby(["Tokenizer Type", "Model"]).agg({"status": ["mean", "count"]}).reset_index() - grouped_by_model.columns = ["Tokenizer Type", "Model", "Pass Rate, %", "Number of Tests"] - grouped_by_model["Pass Rate, %"] *= 100 + grouped_by_model.columns = ["Tokenizer Type", "Model", "Output Matched, %", "Number of Tests"] + grouped_by_model["Output Matched, %"] *= 100 grouped_by_type = results_df.groupby(["Tokenizer 
Type"]).agg({"status": ["mean", "count"]}).reset_index() - grouped_by_type.columns = ["Tokenizer Type", "Pass Rate, %", "Number of Tests"] - grouped_by_type["Pass Rate, %"] *= 100 + grouped_by_type.columns = ["Tokenizer Type", "Output Matched, %", "Number of Tests"] + grouped_by_type["Output Matched, %"] *= 100 readme_path = Path("../README.md") with open(readme_path) as f: - old_readme = f.read().split("## Test Coverage")[0] + old_readme = f.read().split("## Test Results")[0] new_readme = StringIO() new_readme.write(old_readme) new_readme.write( - "## Test Coverage\n\n" + "## Test Results\n\n" "This report is autogenerated and includes tokenizers and detokenizers tests. " - "To update it run pytest with `--update_readme` flag.\n\n" - "### Coverage by Tokenizer Type\n\n" + "The `Output Matched, %` column shows the percent of test strings " + "for which the results of OpenVINO and Hugingface Tokenizers are the same. " + "To update the report run `pytest tokenizers_test.py --update_readme` in " + "`modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory.\n\n" + "### Output Match by Tokenizer Type\n\n" ) is_pandas_2 = tuple(map(int, version("pandas").split("."))) >= (2, 0, 0) if is_pandas_2: grouped_by_type.style.format(precision=2).hide(axis="index").to_html(new_readme, exclude_styles=True) else: grouped_by_type.style.format(precision=2).hide_index().to_html(new_readme, exclude_styles=True) - new_readme.write("\n### Coverage by Model Type\n\n") + new_readme.write("\n### Output Match by Model\n\n") if is_pandas_2: grouped_by_model.style.format(precision=2).hide(axis="index").to_html(new_readme, exclude_styles=True) else: From b3bfc5e4c3b2a59bba66d366062b20648068b34d Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 21 Dec 2023 20:39:27 +0000 Subject: [PATCH 3/4] Rewrite README.md --- .../user_ie_extensions/tokenizer/python/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md index 29dde3d00..c9e5928ea 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md @@ -38,7 +38,7 @@ pip install openvino-tokenizers[transformers] conda install -c conda-forge openvino openvino-tokenizers && pip install transformers[sentencepiece] tiktoken ``` -3. Build and install from source after [OpenVINO installation](): +### Build and install from source after [OpenVINO installation](https://docs.openvino.ai/2023.2/openvino_docs_install_guides_overview.html) ```bash source path/to/installed/openvino/setupvars.sh git clone https://github.com/openvinotoolkit/openvino_contrib.git @@ -46,7 +46,7 @@ cd openvino_contrib/modules/custom_operations/ pip install -e .[transformers] ``` -4. Build and install for development: +### Build and install for development ```bash source path/to/installed/openvino/setupvars.sh git clone https://github.com/openvinotoolkit/openvino_contrib.git @@ -57,6 +57,7 @@ cd user_ie_extensions/tokenizer/python/tests/ pytest . ``` +## Usage ### Convert HuggingFace tokenizer @@ -226,7 +227,7 @@ print(f"HuggingFace output string: `{hf_output}`") ## Test Results -This report is autogenerated and includes tokenizers and detokenizers tests. The `Output Matched, %` column shows the percent of test strings for which the results of OpenVINO and Hugingface Tokenizers are the same.To update the report run `pytest tokenizers_test.py --update_readme` in `modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory. +This report is autogenerated and includes tokenizers and detokenizers tests. The `Output Matched, %` column shows the percent of test strings for which the results of OpenVINO and Huggingface Tokenizers are the same.
To update the report run `pytest tokenizers_test.py --update_readme` in `modules/custom_operations/user_ie_extensions/tokenizer/python/tests` directory. ### Output Match by Tokenizer Type From a8b5620032ce3763806cc46fed6296b76fed8540 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 21 Dec 2023 20:42:06 +0000 Subject: [PATCH 4/4] Rewrite README.md --- .../user_ie_extensions/tokenizer/python/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md index c9e5928ea..760a6de6c 100644 --- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md +++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md @@ -218,12 +218,12 @@ print(f"HuggingFace output string: `{hf_output}`") ## Supported Tokenizer Types | Huggingface
Tokenizer Type | Tokenizer Model Type | Tokenizer | Detokenizer | -|---------------------------------|----------------------|-----------|-------------| -| Fast | WordPiece | ✔ | ✘ | -| | BPE | ✔ | ✔ | -| | Unigram | ✘ | ✘ | -| Legacy | SentencePiece .model | ✔ | ✔ | -| Custom | tiktoken | ✔ | ✔ | +|---------------------------------|----------------------|----------|------------| +| Fast | WordPiece | ✅ | ❌ | +| | BPE | ✅ | ✅ | +| | Unigram | ❌ | ❌ | +| Legacy | SentencePiece .model | ✅ | ✅ | +| Custom | tiktoken | ✅ | ✅ | ## Test Results
Tokenizer Type ModelPass Rate, %Output Matched, % Number of Tests