From 31e5bdee1e6a5284d1a359d1f41414acfac8feba Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Tue, 18 Jul 2023 06:18:18 -0500 Subject: [PATCH 1/8] SPARKNLP-867 Refactors getActivation to take multilabel into account (#13888) --- .../bert_for_zero_shot_classification.py | 3 ++- .../nlp/HasClassifierActivationProperties.scala | 11 ++++++++--- .../classifier/dl/BertForZeroShotClassification.scala | 2 +- .../dl/DistilBertForZeroShotClassification.scala | 2 +- .../dl/RoBertaForZeroShotClassification.scala | 2 +- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/python/sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py b/python/sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py index 742dcb6cc829f4..24787abc59d7ce 100755 --- a/python/sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +++ b/python/sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py @@ -164,7 +164,8 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.Bert maxSentenceLength=128, caseSensitive=True, coalesceSentences=False, - activation="softmax" + activation="softmax", + multilabel=False ) @staticmethod diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasClassifierActivationProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasClassifierActivationProperties.scala index 9d92e7a02482e6..2d88385091bc9e 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasClassifierActivationProperties.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasClassifierActivationProperties.scala @@ -52,10 +52,13 @@ trait HasClassifierActivationProperties extends ParamsAndFeaturesWritable { "multilabel", "Whether or not the result should be multi-class (the sum of all probabilities is 1.0) or multi-label (each label has a probability between 0.0 to 1.0). Default is False i.e. 
multi-class") - setDefault(activation -> ActivationFunction.softmax, threshold -> 0.5f, multilabel -> false) - /** @group getParam */ - def getActivation: String = $(activation) + def getActivation: String = { + val activation = + if ($(multilabel)) ActivationFunction.sigmoid else ActivationFunction.softmax + + if ($(multilabel)) activation else $(this.activation) + } /** @group setParam */ def setActivation(value: String): this.type = { @@ -94,6 +97,8 @@ trait HasClassifierActivationProperties extends ParamsAndFeaturesWritable { set(this.multilabel, value) } + setDefault(activation -> ActivationFunction.softmax, threshold -> 0.5f, multilabel -> false) + } object ActivationFunction { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForZeroShotClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForZeroShotClassification.scala index 6c6ddc35140d1a..0cc57e366a1301 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForZeroShotClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForZeroShotClassification.scala @@ -325,7 +325,7 @@ class BertForZeroShotClassification(override val uid: String) $(caseSensitive), $(coalesceSentences), $$(labels), - $(activation)) + getActivation) } else { Seq.empty[Annotation] diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForZeroShotClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForZeroShotClassification.scala index b1afba431726d2..0726cf35b5ca7f 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForZeroShotClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForZeroShotClassification.scala @@ -325,7 +325,7 @@ class DistilBertForZeroShotClassification(override val uid: String) $(caseSensitive), $(coalesceSentences), $$(labels), - $(activation)) + 
getActivation) } else { Seq.empty[Annotation] diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.scala index 60041627854e15..e66e8b59627804 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.scala @@ -339,7 +339,7 @@ class RoBertaForZeroShotClassification(override val uid: String) $(caseSensitive), $(coalesceSentences), $$(labels), - $(activation)) + getActivation) } else { Seq.empty[Annotation] From 893b6934503bedf508f0e7378d7bdf2b08608789 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Tue, 18 Jul 2023 13:21:29 +0200 Subject: [PATCH 2/8] Sparknlp 868 make spark driver cores override local in start functions (#13894) * Allow users to change Driver's cores at start * Add unit test for driver cores in start function --- python/sparknlp/__init__.py | 13 ++++++++++++- src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala | 8 +++++++- .../com/johnsnowlabs/nlp/SparkNLPTestSpec.scala | 8 +++++--- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index 5eca3c05193f9f..5685670314fcb8 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -132,13 +132,24 @@ def start(gpu=False, if params is None: params = {} + else: + if not isinstance(params, dict): + raise TypeError('params must be a dictionary like {"spark.executor.memory": "8G"}') + if '_instantiatedSession' in dir(SparkSession) and SparkSession._instantiatedSession is not None: print('Warning::Spark Session already created, some configs may not take.') + driver_cores = "*" + for key, value in params.items(): + if key == "spark.driver.cores": + driver_cores = f"{value}" + else: + driver_cores = "*" + class 
SparkNLPConfig: def __init__(self): - self.master, self.app_name = "local[*]", "Spark NLP" + self.master, self.app_name = "local[{}]".format(driver_cores), "Spark NLP" self.serializer, self.serializer_max_buffer = "org.apache.spark.serializer.KryoSerializer", "2000M" self.driver_max_result_size = "0" # Spark NLP on CPU or GPU diff --git a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala index 12904be9b6bd97..73dbedef2002cc 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala @@ -68,12 +68,18 @@ object SparkNLP { val builder = SparkSession .builder() .appName("Spark NLP") - .master("local[*]") .config("spark.driver.memory", memory) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.kryoserializer.buffer.max", "2000M") .config("spark.driver.maxResultSize", "0") + // get the set cores by users since local[*] will override spark.driver.cores if set + if (params.contains("spark.driver.cores")) { + builder.master("local[" + params("spark.driver.cores") + "]") + } else { + builder.master("local[*]") + } + val sparkNlpJar = if (apple_silicon) MavenSparkSilicon else if (aarch64) MavenSparkAarch64 diff --git a/src/test/scala/com/johnsnowlabs/nlp/SparkNLPTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/SparkNLPTestSpec.scala index 8fa879b07bc184..69d7e7181ca160 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/SparkNLPTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/SparkNLPTestSpec.scala @@ -1,6 +1,6 @@ package com.johnsnowlabs.nlp -import com.johnsnowlabs.tags.SlowTest +import com.johnsnowlabs.tags.FastTest import com.johnsnowlabs.util.ConfigHelper.{awsJavaSdkVersion, hadoopAwsVersion} import org.scalatest.flatspec.AnyFlatSpec @@ -8,14 +8,16 @@ class SparkNLPTestSpec extends AnyFlatSpec { behavior of "SparkNLPTestSpec" - it should "start with extra parameters" taggedAs SlowTest ignore { + it should "start 
with extra parameters" taggedAs FastTest in { val extraParams: Map[String, String] = Map( "spark.jars.packages" -> ("org.apache.hadoop:hadoop-aws:" + hadoopAwsVersion + ",com.amazonaws:aws-java-sdk:" + awsJavaSdkVersion), - "spark.hadoop.fs.s3a.path.style.access" -> "true") + "spark.hadoop.fs.s3a.path.style.access" -> "true", + "spark.driver.cores" -> "2") val spark = SparkNLP.start(params = extraParams) assert(spark.conf.get("spark.hadoop.fs.s3a.path.style.access") == "true") + assert(spark.conf.get("spark.master") == "local[2]") Seq( "com.johnsnowlabs.nlp:spark-nlp", From 20d9f3747ea249c1e37b123999d045fb36f42fa5 Mon Sep 17 00:00:00 2001 From: Devin Ha <33089471+DevinTDHa@users.noreply.github.com> Date: Tue, 18 Jul 2023 13:22:17 +0200 Subject: [PATCH 3/8] ONNX Export Notebooks (#13889) SPARKNLP-863 BERT SPARKNLP-864 DistilBERT SPARKNLP-865 RoBERTa SPARKNLP-866 DeBERTa --- .../HuggingFace_ONNX_in_Spark_NLP_BERT.ipynb | 2223 ++++++++++++++++ ...uggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb | 460 ++++ ...ingFace_ONNX_in_Spark_NLP_DistilBERT.ipynb | 2229 +++++++++++++++++ ...uggingFace_ONNX_in_Spark_NLP_RoBERTa.ipynb | 447 ++++ 4 files changed, 5359 insertions(+) create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BERT.ipynb create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBERT.ipynb create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBERTa.ipynb diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BERT.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BERT.ipynb new file mode 100644 index 00000000000000..daadf8baf6eeb5 --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BERT.ipynb @@ -0,0 +1,2223 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BERT.ipynb)\n", + "\n", + "# Import ONNX BERT models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for BERT from HuggingFace and they have to be in `Fill Mask` category. Meaning, you cannot use BERT models trained/fine-tuned on a specific task such as token/sequence classification." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m52.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m252.3/252.3 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... 
\u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m22.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m97.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m38.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m90.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m20.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m486.2/486.2 kB\u001b[0m \u001b[31m39.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m83.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m72.4 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m58.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.5/212.5 kB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.3/134.3 kB\u001b[0m \u001b[31m13.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for optimum (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. 
We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [bert-base-cased](https://huggingface.co/bert-base-cased) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model.\n", + "- In addition to the BERT model, we also need to save the `BertTokenizer`. This is the same for every model, these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ada9173f56b841a39bac4509d7eee4b5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading (…)lve/main/config.json: 0%| | 0.00/570 [00:00 False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============\n", + "verbose: False, log level: Level.ERROR\n", + "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", + "\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", + "\n", + "MODEL_NAME = \"bert-base-cased\"\n", + "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets\n", + "!mv {EXPORT_PATH}/vocab.txt {EXPORT_PATH}/assets/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "total 421028\n", + "drwxr-xr-x 2 root root 4096 Jul 11 09:15 assets\n", + "-rw-r--r-- 1 root root 634 Jul 11 09:15 config.json\n", + "-rw-r--r-- 1 root root 431123644 Jul 11 09:15 model.onnx\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 876\n", + "-rw-r--r-- 1 root root 125 Jul 11 09:15 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 315 Jul 11 09:15 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 668923 Jul 11 09:15 tokenizer.json\n", + "-rw-r--r-- 1 root root 213450 Jul 11 09:15 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save BERT in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.0.0\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.0.0\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m498.7/498.7 kB\u001b[0m \u001b[31m37.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! 
wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` function in `BertEmbeddings` which allows us to load the ONNX model\n", + "- Most params will be set automatically. They can also be set later after loading the model in `BertEmbeddings` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- `setStorageRef` is very important. When you are training a task like NER or any Text Classification, we use this reference to bind the trained model to this specific embeddings so you won't load a different embeddings by mistake and see terrible results 😊\n", + "- It's up to you what you put in `setStorageRef` but it cannot be changed later on. We usually use the name of the model to be clear, but you can get creative if you want!\n", + "- The `dimension` param is purely cosmetic and won't change anything. It's mostly for you to know later via `.getDimension` what is the dimension of your model. So set this accordingly.\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. 
Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file system natively.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original ONNX model\n", + "bert = BertEmbeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"bert\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setDimension(768)\\\n", + " .setStorageRef('bert_base_cased')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bert.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your ONNX BERT model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 421092\n", + "-rw-r--r-- 1 root root 431189561 Jul 11 09:18 bert_onnx\n", + "drwxr-xr-x 3 root root 4096 Jul 11 09:17 fields\n", + "drwxr-xr-x 2 root root 4096 Jul 11 09:17 metadata\n" + ] + } + ], + "source": [ + "! 
ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BERT model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "\n", + "document_assembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "tokenizer = Tokenizer()\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"token\")\n", + "\n", + "bert_loaded = BertEmbeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"bert\")\\\n", + "\n", + "pipeline = Pipeline(\n", + " stages = [\n", + " document_assembler,\n", + " tokenizer,\n", + " bert_loaded\n", + " ])\n", + "\n", + "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n", + "model = pipeline.fit(data)\n", + "result = model.transform(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| embeddings|\n", + "+--------------------+\n", + "|[0.43426436, -0.3...|\n", + "|[-0.033402592, -0...|\n", + "|[0.38291216, 0.11...|\n", + "|[-0.11996313, 0.2...|\n", + "|[-0.4832554, 0.05...|\n", + "|[-0.17415513, 0.2...|\n", + "|[0.030410081, -0....|\n", + "|[-0.09456871, -1....|\n", + "|[0.20999376, 0.27...|\n", + "|[-0.6175975, -0.1...|\n", + "|[0.26205128, 0.31...|\n", + "|[0.071797684, 0.3...|\n", + "|[0.11466834, 0.16...|\n", + "|[0.112313405, 0.2...|\n", + "|[0.97112143, 0.13...|\n", + "|[0.6206643, -0.10...|\n", + "|[0.21066168, 0.42...|\n", + 
"|[0.451869, 0.2410...|\n", + "|[0.3347211, -0.16...|\n", + "|[0.10000777, -0.3...|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "result.selectExpr(\"explode(bert.embeddings) as embeddings\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of BERT models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "003df43c87374303ac35fc9511889d9d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "07947510f827455593741f7307285172": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4935538ee79f4287b29510a606f19075", + "max": 435797, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e2532e73e7964adc8dd9ab3a925808de", + "value": 435797 + } + }, + "197c4900589f4e32a8927241af41ff50": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ef603c2c89574bd482833a87d1a42db4", + "IPY_MODEL_a83ee309c8ab41cba17c3733f76610e6", + "IPY_MODEL_2af9f521cd0a4525b0582cc7ea9602dc" + ], + "layout": "IPY_MODEL_4b1c2524c69b4e85a29a5aeedc5ec03c" + } + }, + "19835dbd520841c48cbd539e252e350c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": 
null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1bb102d2be6844d4a2b0a0d2211403fb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1ef1f1da55c44ffbbcb4d4462f85e10e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "20357cb4c69b4b0d8f5d4057e1ec3603": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + 
"box_style": "", + "children": [ + "IPY_MODEL_d0520552020c46918b9881a3851db1cb", + "IPY_MODEL_fb2ea249365c4923884ccf98d6de74ba", + "IPY_MODEL_80d479e14aed47ac9bf0ce17b42d1788" + ], + "layout": "IPY_MODEL_52451a60c3de4b37aa5203e725519916" + } + }, + "276704e828d14eec889fbb8be78e050b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "27ba4eba7900413f8bbdb270213be75d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_d02ccd3a39514b3e96afe8afdacd9664", + "placeholder": "​", + "style": "IPY_MODEL_4568491276d846a199da9b460877c4e5", + "value": "Downloading pytorch_model.bin: 100%" + } + }, + "29194dd0c5fb46458826ad0e66c26602": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2af9f521cd0a4525b0582cc7ea9602dc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_461ba797a5714f6db81372e3385db87e", + "placeholder": "​", + "style": "IPY_MODEL_29194dd0c5fb46458826ad0e66c26602", + "value": " 213k/213k [00:00<00:00, 2.55MB/s]" + } + }, + "2bd0463a4a0944b1993331767ba32096": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_19835dbd520841c48cbd539e252e350c", + "placeholder": "​", + "style": "IPY_MODEL_a9b973617c014c6883982d5df372cfef", + "value": "Downloading (…)/main/tokenizer.json: 
100%" + } + }, + "2dd35e0567d54430937157b8ef13fd1a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a9e92fbed2814b69b429a58dc51ebb14", + "placeholder": "​", + "style": "IPY_MODEL_ebb31b7743c14e2f84892a3ead9059cd", + "value": " 570/570 [00:00<00:00, 22.2kB/s]" + } + }, + "3575cad2671545b49579c164eb3669d7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3f686d7ac5a244f8a24fc180f45db850": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3ff48c9451434a93ab3ec054e9ef703e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "43ee3d5c0ebc47e4b26b64cd299e6192": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2bd0463a4a0944b1993331767ba32096", + "IPY_MODEL_07947510f827455593741f7307285172", + "IPY_MODEL_4f44ed39f6ee4d14953dd223c72ff1d2" + ], + "layout": "IPY_MODEL_d754520c1e3b483bb979870b97409ec0" + } + }, + "4568491276d846a199da9b460877c4e5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + 
"description_width": "" + } + }, + "461ba797a5714f6db81372e3385db87e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4935538ee79f4287b29510a606f19075": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + 
"grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4b1c2524c69b4e85a29a5aeedc5ec03c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4f44ed39f6ee4d14953dd223c72ff1d2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b8996fd1f1a3415d8d60422cf4bd9f72", + "placeholder": "​", + "style": "IPY_MODEL_bfd976ee58154f1cb77f665377fa953b", + "value": " 436k/436k [00:00<00:00, 12.0MB/s]" + } + }, + "52451a60c3de4b37aa5203e725519916": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "532e9756604042de9cf929749e817afa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5607909f2c8e4ed2bb10d5daa1b8dc2f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": 
null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5d0c0491fb3745e3a75f52beb0f7bd83": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_27ba4eba7900413f8bbdb270213be75d", + "IPY_MODEL_ea22c48510c94e5fa0cc6501778c2960", + "IPY_MODEL_79418799d56747df857d493a9ab557c7" + ], + "layout": "IPY_MODEL_82956c89fb93471093f505c29bb27e21" + } + }, + "60c3def95512434f96aa335904177677": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_777bf87a1f80474bb89b7a16a400d407", + "placeholder": "​", + "style": "IPY_MODEL_c433dd764a904da7b7665a8636d3495b", + "value": "Downloading (…)lve/main/config.json: 100%" + } + }, + "72e4860916ea4a279cfe89e2e080f394": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "777bf87a1f80474bb89b7a16a400d407": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": 
null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "79418799d56747df857d493a9ab557c7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cc6a63cc44d24e3daf6e575cc9c2834e", + "placeholder": "​", + "style": "IPY_MODEL_1bb102d2be6844d4a2b0a0d2211403fb", + "value": " 436M/436M [00:10<00:00, 33.3MB/s]" + } + }, + "794582ac3f074a8f9229c22fe825fbde": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_276704e828d14eec889fbb8be78e050b", + "max": 570, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ce0978ee6da141c8b185a0cc4a8af55f", + "value": 570 + } + }, + "80d479e14aed47ac9bf0ce17b42d1788": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_003df43c87374303ac35fc9511889d9d", + "placeholder": "​", + "style": "IPY_MODEL_3f686d7ac5a244f8a24fc180f45db850", + "value": " 29.0/29.0 [00:00<00:00, 1.16kB/s]" + } + }, + "82956c89fb93471093f505c29bb27e21": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8eea76dc64674937954791772a32ef96": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "962de089349e44e89aea7c357acece0a": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a46c68a0bd914e4db6a6cf1da691e05b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "a83ee309c8ab41cba17c3733f76610e6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_962de089349e44e89aea7c357acece0a", + "max": 213450, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1ef1f1da55c44ffbbcb4d4462f85e10e", + "value": 213450 + } + }, + "a9b973617c014c6883982d5df372cfef": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a9e92fbed2814b69b429a58dc51ebb14": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + 
"padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ada9173f56b841a39bac4509d7eee4b5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_60c3def95512434f96aa335904177677", + "IPY_MODEL_794582ac3f074a8f9229c22fe825fbde", + "IPY_MODEL_2dd35e0567d54430937157b8ef13fd1a" + ], + "layout": "IPY_MODEL_fc4434f63a214724a5e41d92f451080f" + } + }, + "b8996fd1f1a3415d8d60422cf4bd9f72": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + 
"width": null + } + }, + "bfd976ee58154f1cb77f665377fa953b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c433dd764a904da7b7665a8636d3495b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cc6a63cc44d24e3daf6e575cc9c2834e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ce0978ee6da141c8b185a0cc4a8af55f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d02ccd3a39514b3e96afe8afdacd9664": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d0520552020c46918b9881a3851db1cb": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_532e9756604042de9cf929749e817afa", + "placeholder": "​", + "style": "IPY_MODEL_8eea76dc64674937954791772a32ef96", + "value": "Downloading (…)okenizer_config.json: 100%" + } + }, + "d754520c1e3b483bb979870b97409ec0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e2532e73e7964adc8dd9ab3a925808de": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ea22c48510c94e5fa0cc6501778c2960": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5607909f2c8e4ed2bb10d5daa1b8dc2f", + "max": 435779157, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ee06f59202b94cdc961617d47d08694e", + "value": 435779157 + } + }, + "ebb31b7743c14e2f84892a3ead9059cd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ee06f59202b94cdc961617d47d08694e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + 
"bar_color": null, + "description_width": "" + } + }, + "ef603c2c89574bd482833a87d1a42db4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3ff48c9451434a93ab3ec054e9ef703e", + "placeholder": "​", + "style": "IPY_MODEL_3575cad2671545b49579c164eb3669d7", + "value": "Downloading (…)solve/main/vocab.txt: 100%" + } + }, + "fb2ea249365c4923884ccf98d6de74ba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_72e4860916ea4a279cfe89e2e080f394", + "max": 29, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a46c68a0bd914e4db6a6cf1da691e05b", + "value": 29 + } + }, + "fc4434f63a214724a5e41d92f451080f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + 
"flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb new file mode 100644 index 00000000000000..64ea287fc9376f --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb @@ -0,0 +1,460 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb)\n", + "\n", + "# Import ONNX DeBERTa models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for DeBERTa from HuggingFace and they have to be in `Fill Mask` category. 
Meaning, you cannot use DeBERTa models trained/fine-tuned on a specific task such as token/sequence classification." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and its dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m44.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m252.3/252.3 kB\u001b[0m \u001b[31m20.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m20.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m77.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m41.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m99.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m22.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m486.2/486.2 kB\u001b[0m \u001b[31m40.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m76.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m60.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m64.1 
MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.5/212.5 kB\u001b[0m \u001b[31m19.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.3/134.3 kB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for optimum (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [microsoft/deberta-v3-base](https://huggingface.co/microsoft/deberta-v3-base) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model.\n", + "- In addition to the DeBERTa model, we also need to save the tokenizer. 
This is the same for every model, these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Framework not specified. Using pt to export to ONNX.\n", + "Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias']\n", + "- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "/usr/local/lib/python3.10/dist-packages/transformers/convert_slow_tokenizer.py:454: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. 
In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.\n", + " warnings.warn(\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Using framework PyTorch: 2.0.1+cu118\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py:561: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", + " torch.tensor(mid - 1).type_as(relative_pos),\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py:565: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", + " torch.ceil(torch.log(abs_pos / mid) / torch.log(torch.tensor((max_position - 1) / mid)) * (mid - 1)) + mid\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py:724: TracerWarning: torch.tensor results are registered as constants in the trace. 
You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", + " scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor)\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py:724: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor)\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py:803: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", + " scale = torch.sqrt(torch.tensor(pos_key_layer.size(-1), dtype=torch.float) * scale_factor)\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py:803: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " scale = torch.sqrt(torch.tensor(pos_key_layer.size(-1), dtype=torch.float) * scale_factor)\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py:815: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. 
In any other case, this might cause the trace to be incorrect.\n", + " scale = torch.sqrt(torch.tensor(pos_query_layer.size(-1), dtype=torch.float) * scale_factor)\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py:815: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " scale = torch.sqrt(torch.tensor(pos_query_layer.size(-1), dtype=torch.float) * scale_factor)\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py:816: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if key_layer.size(-2) != query_layer.size(-2):\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py:112: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. 
In any other case, this might cause the trace to be incorrect.\n", + " output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============\n", + "verbose: False, log level: Level.ERROR\n", + "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", + "\n", + "MODEL_NAME = \"microsoft/deberta-v3-base\"\n", + "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets\n", + "!mv {EXPORT_PATH}/spm.model {EXPORT_PATH}/assets/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 727308\n", + "-rw-r--r-- 1 root root 23 Jul 11 
10:14 added_tokens.json\n", + "drwxr-xr-x 2 root root 4096 Jul 11 10:14 assets\n", + "-rw-r--r-- 1 root root 785 Jul 11 10:14 config.json\n", + "-rw-r--r-- 1 root root 736087819 Jul 11 10:14 model.onnx\n", + "-rw-r--r-- 1 root root 173 Jul 11 10:14 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 412 Jul 11 10:14 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 8648791 Jul 11 10:14 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 2408\n", + "-rw-r--r-- 1 root root 2464616 Jul 11 10:14 spm.model\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save DeBERTa in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.0.0\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.0.0\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m498.7/498.7 kB\u001b[0m \u001b[31m38.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m20.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! 
wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` function in `DeBertaEmbeddings` which allows us to load the ONNX model\n", + "- Most params will be set automatically. They can also be set later after loading the model in `DeBertaEmbeddings` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- `setStorageRef` is very important. When you are training a task like NER or any Text Classification, we use this reference to bind the trained model to these specific embeddings so you won't load different embeddings by mistake and see terrible results 😊\n", + "- It's up to you what you put in `setStorageRef` but it cannot be changed later on. We usually use the name of the model to be clear, but you can get creative if you want!\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. 
Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original ONNX model\n", + "deberta = DeBertaEmbeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"deberta\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setStorageRef('deberta-v3-base')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "deberta.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your ONNX DeBERTa model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 721364\n", + "-rw-r--r-- 1 root root 736200277 Jul 11 10:23 deberta_onnx\n", + "-rw-r--r-- 1 root root 2464616 Jul 11 10:23 deberta_spp\n", + "drwxr-xr-x 2 root root 4096 Jul 11 10:22 metadata\n" + ] + } + ], + "source": [ + "! 
ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny DeBERTa model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "\n", + "document_assembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "tokenizer = Tokenizer()\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"token\")\n", + "\n", + "deberta_loaded = DeBertaEmbeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"deberta\")\\\n", + "\n", + "pipeline = Pipeline(\n", + " stages = [\n", + " document_assembler,\n", + " tokenizer,\n", + " deberta_loaded\n", + " ])\n", + "\n", + "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n", + "model = pipeline.fit(data)\n", + "result = model.transform(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| embeddings|\n", + "+--------------------+\n", + "|[0.38130012, -0.5...|\n", + "|[0.44632936, -0.2...|\n", + "|[0.51096636, 0.08...|\n", + "|[-0.35451564, -0....|\n", + "|[0.39058837, 0.43...|\n", + "|[-0.49727106, 0.9...|\n", + "|[0.16452815, -0.4...|\n", + "|[0.17555067, -0.2...|\n", + "|[-0.024700366, 0....|\n", + "|[-0.3068737, 0.37...|\n", + "|[0.58636177, 0.59...|\n", + "|[-0.6334504, 0.09...|\n", + "|[-0.6704677, -0.3...|\n", + "|[-0.046930477, -0...|\n", + "|[0.15983628, -0.4...|\n", + "|[0.4974299, -0.17...|\n", + "|[1.1059247, 
0.549...|\n", + "|[-0.19082792, -0....|\n", + "|[-0.17161375, -0....|\n", + "|[0.71347564, 0.23...|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "result.selectExpr(\"explode(deberta.embeddings) as embeddings\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of DeBERTa models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBERT.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBERT.ipynb new file mode 100644 index 00000000000000..678d8840ab629b --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBERT.ipynb @@ -0,0 +1,2229 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBERT.ipynb)\n", + "\n", + "# Import ONNX DistilBERT models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for DistilBERT from HuggingFace and they have to be in `Fill Mask` category. Meaning, you cannot use DistilBERT models trained/fine-tuned on a specific task such as token/sequence classification." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and its dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m62.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m252.3/252.3 kB\u001b[0m \u001b[31m19.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m22.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m76.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m35.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m86.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m17.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m486.2/486.2 kB\u001b[0m \u001b[31m39.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m81.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m66.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m55.4 
MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.5/212.5 kB\u001b[0m \u001b[31m17.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.3/134.3 kB\u001b[0m \u001b[31m12.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for optimum (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [distilbert-base-cased](https://huggingface.co/distilbert-base-cased) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model.\n", + "- In addition to the DistilBERT model, we also need to save the `DistilBertTokenizer`. 
This is the same for every model, these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6ea03cb3d64f493fb46fd1289aa69ae5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading (…)lve/main/config.json: 0%| | 0.00/411 [00:00=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [roberta-base](https://huggingface.co/roberta-base) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model.\n", + "- In addition to the RoBERTa model, we also need to save the tokenizer. This is the same for every model, these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Framework not specified. 
Using pt to export to ONNX.\n", + "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias']\n", + "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Using framework PyTorch: 2.0.1+cu118\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============\n", + "verbose: False, log level: Level.ERROR\n", + "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", + "\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", + "\n", + "MODEL_NAME = \"roberta-base\"\n", + "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(EXPORT_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create directory for assets\n", + "!mkdir {EXPORT_PATH}/assets\n", + "\n", + "# let's make sure we sort the vocabs based on their ids first\n", + "vocabs = ort_model.preprocessors[0].get_vocab()\n", + "vocabs = sorted(vocabs, key=vocabs.get)\n", + "\n", + "# let's save the vocab as txt file\n", + "with open(f'{EXPORT_PATH}/vocab.txt', 'w') 
as f:\n", + " for item in vocabs:\n", + " f.write(\"%s\\n\" % item)\n", + "\n", + "# let's copy both vocab.txt and merges.txt files to saved_model/1/assets\n", + "!cp {EXPORT_PATH}/vocab.txt {EXPORT_PATH}/assets\n", + "!cp {EXPORT_PATH}/merges.txt {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 488536\n", + "drwxr-xr-x 2 root root 4096 Jul 11 10:38 assets\n", + "-rw-r--r-- 1 root root 644 Jul 11 10:37 config.json\n", + "-rw-r--r-- 1 root root 456318 Jul 11 10:37 merges.txt\n", + "-rw-r--r-- 1 root root 496467018 Jul 11 10:37 model.onnx\n", + "-rw-r--r-- 1 root root 280 Jul 11 10:37 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 351 Jul 11 10:37 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 2108619 Jul 11 10:37 tokenizer.json\n", + "-rw-r--r-- 1 root root 798293 Jul 11 10:37 vocab.json\n", + "-rw-r--r-- 1 root root 407065 Jul 11 10:38 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 848\n", + "-rw-r--r-- 1 root root 456318 Jul 11 10:38 merges.txt\n", + "-rw-r--r-- 1 root root 407065 Jul 11 10:38 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save RoBERTa in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 
3.2.3 and Spark NLP 5.0.0\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.0.0\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m498.7/498.7 kB\u001b[0m \u001b[31m30.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m17.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `BertEmbeddings` which allows us to load the ONNX model\n", + "- Most params will be set automatically. They can also be set later after loading the model in `BertEmbeddings` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- `setStorageRef` is very important. 
When you are training a task like NER or any Text Classification, we use this reference to bind the trained model to these specific embeddings so you won't load different embeddings by mistake and see terrible results 😊\n", + "- It's up to you what you put in `setStorageRef` but it cannot be changed later on. We usually use the name of the model to be clear, but you can get creative if you want!\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original ONNX model\n", + "roberta = RoBertaEmbeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"roberta\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setStorageRef('roberta-base')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "roberta.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your ONNX RoBERTa model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 484920\n", + "drwxr-xr-x 4 root root 4096 Jul 11 10:43 fields\n", + "drwxr-xr-x 2 root root 4096 Jul 11 10:43 metadata\n", + "-rw-r--r-- 1 root root 496542911 Jul 11 10:43 roberta_onnx\n" + ] + } + ], + "source": [ + "! ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBERTa model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "\n", + "document_assembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "tokenizer = Tokenizer()\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"token\")\n", + "\n", + "roberta_loaded = RoBertaEmbeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"roberta\")\\\n", + "\n", + "pipeline = Pipeline(\n", + " stages = [\n", + " document_assembler,\n", + " tokenizer,\n", + " roberta_loaded\n", + " ])\n", + "\n", + "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n", + "model = pipeline.fit(data)\n", + "result = model.transform(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + 
"| embeddings|\n", + "+--------------------+\n", + "|[0.38130012, -0.5...|\n", + "|[0.44632936, -0.2...|\n", + "|[0.51096636, 0.08...|\n", + "|[-0.35451564, -0....|\n", + "|[0.39058837, 0.43...|\n", + "|[-0.49727106, 0.9...|\n", + "|[0.16452815, -0.4...|\n", + "|[0.17555067, -0.2...|\n", + "|[-0.024700366, 0....|\n", + "|[-0.3068737, 0.37...|\n", + "|[0.58636177, 0.59...|\n", + "|[-0.6334504, 0.09...|\n", + "|[-0.6704677, -0.3...|\n", + "|[-0.046930477, -0...|\n", + "|[0.15983628, -0.4...|\n", + "|[0.4974299, -0.17...|\n", + "|[1.1059247, 0.549...|\n", + "|[-0.19082792, -0....|\n", + "|[-0.17161375, -0....|\n", + "|[0.71347564, 0.23...|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "result.selectExpr(\"explode(roberta.embeddings) as embeddings\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of RoBERTa models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 4620feb440d4d1310a532a10384af18d5e088746 Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Tue, 18 Jul 2023 06:22:31 -0500 Subject: [PATCH 4/8] SPARKNLP-869 Adding threshold to properties for python module (#13890) --- python/sparknlp/common/properties.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/python/sparknlp/common/properties.py b/python/sparknlp/common/properties.py index 3d4e3ab7c5a61f..49f0655ac70e28 100644 --- a/python/sparknlp/common/properties.py +++ b/python/sparknlp/common/properties.py @@ -78,6 +78,11 @@ class HasClassifierActivationProperties: "Whether to calculate logits via Multiclass(softmax) or Multilabel(sigmoid). Default is False i.e. 
Multiclass", typeConverter=TypeConverters.toBoolean) + threshold = Param(Params._dummy(), + "threshold", + "Choose the threshold to determine which logits are considered to be positive or negative", + typeConverter=TypeConverters.toFloat) + def setActivation(self, value): """Sets whether to calculate logits via Softmax or Sigmoid. Default is Softmax @@ -126,6 +131,22 @@ def getMultilabel(self): """ return self.getOrDefault(self.multilabel) + def setThreshold(self, value): + """Set the threshold to determine which logits are considered to be positive or negative. + (Default: `0.5`). The value should be between 0.0 and 1.0. Changing the threshold value + will affect the resulting labels and can be used to adjust the balance between precision and + recall in the classification process. + + Parameters + ---------- + value : float + The threshold to determine which logits are considered to be positive or negative. + (Default: `0.5`). The value should be between 0.0 and 1.0. Changing the threshold value + will affect the resulting labels and can be used to adjust the balance between precision and + recall in the classification process. 
+ """ + return self._set(threshold=value) + class HasEmbeddingsProperties(Params): dimension = Param(Params._dummy(), From b84d9c946d1bf572f2b74547c770e7ecfcfdff14 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Tue, 18 Jul 2023 11:51:25 +0000 Subject: [PATCH 5/8] removing .lift from the project --- .lift/ignoreFiles | 341 ---------------------------------------------- 1 file changed, 341 deletions(-) delete mode 100644 .lift/ignoreFiles diff --git a/.lift/ignoreFiles b/.lift/ignoreFiles deleted file mode 100644 index 34b53a7362a0a5..00000000000000 --- a/.lift/ignoreFiles +++ /dev/null @@ -1,341 +0,0 @@ - -# Created by https://www.gitignore.io/api/sbt,java,scala,python,eclipse,intellij,intellij+all - -**/docs/** -docs/** -**/*.min.js -**/*.js -**/*.py -python/** -**/python/** - -### Eclipse ### - -.metadata -bin/ -tmp/ -*.tmp -*.bak -*.swp -*~.nib -local.properties -.settings/ -.loadpath -.recommenders -PubMed* -*cache_pretrained* -*.crc -*.sst -_SUCCESS* -*stages* -*auxdata* -# External tool builders -.externalToolBuilders/ - -# Locally stored "Eclipse launch configurations" -*.launch - -# PyDev specific (Python IDE for Eclipse) -*.pydevproject - -# CDT-specific (C/C++ Development Tooling) -.cproject - -# Java annotation processor (APT) -.factorypath - -# PDT-specific (PHP Development Tools) -.buildpath - -# sbteclipse plugin -.target - -# Tern plugin -.tern-project - -# TeXlipse plugin -.texlipse - -# STS (Spring Tool Suite) -.springBeans - -# Code Recommenders -.recommenders/ - -# Scala IDE specific (Scala & Java development for Eclipse) -.cache-main -.scala_dependencies -.worksheet - -### Eclipse Patch ### -# Eclipse Core -.project - -# JDT-specific (Eclipse Java Development Tools) -.classpath - -### Intellij ### -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff: -.idea/**/workspace.xml 
-.idea/**/tasks.xml -.idea/dictionaries - -# Sensitive or high-churn files: -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.xml -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml - -# Gradle: -.idea/**/gradle.xml -.idea/**/libraries - -# CMake -cmake-build-debug/ - -# Mongo Explorer plugin: -.idea/**/mongoSettings.xml - -## File-based project format: -*.iws - -## Plugin-specific files: - -# IntelliJ -/out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -### Intellij Patch ### -# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 - -*.iml -# modules.xml -# .idea/misc.xml -# *.ipr - -# Sonarlint plugin -.idea/sonarlint - -### Intellij+all ### -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff: - -# Sensitive or high-churn files: - -# Gradle: - -# CMake - -# Mongo Explorer plugin: - -## File-based project format: - -## Plugin-specific files: - -# IntelliJ - -# mpeltonen/sbt-idea plugin - -# JIRA plugin - -# Cursive Clojure plugin - -# Crashlytics plugin (for Android Studio and IntelliJ) - -### Intellij+all Patch ### -# Ignores the whole idea folder -# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 - -.idea/ - -### Java ### -# Compiled class file -*.class - -# Log file -*.log - -# BlueJ files -*.ctxt - -# Mobile Tools for Java (J2ME) -.mtj.tmp/ - -# Package Files # -*.jar -*.war -*.ear -*.zip -*.tar.gz -*.rar - -# virtual machine crash logs, see 
http://www.java.com/en/download/help/error_hotspot.xml -hs_err_pid* - -### Python ### -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -python/lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ - -# Translations -*.mo -*.pot - -# Django stuff: -local_settings.py - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ -docs/vendor/ - -# Frontend -docs/_frontend/node_modules -docs/_frontend/static - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ - -### SBT ### -# Simple Build Tool -# http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control - -dist/* -lib_managed/ -src_managed/ -project/boot/ -project/plugins/project/ -.history -.lib/ - -### Scala ### - -# End of https://www.gitignore.io/api/sbt,java,scala,python,eclipse,intellij,intellij+all - -### Local ### -tmp_pipeline/ -tmp_symspell/ -test-output-tmp/ -spark-warehouse/ -/python/python.iml -test_crf_pipeline/ -test_*_pipeline/ -*metastore_db* -python/src/ -python/tensorflow/bert/models/** -**/.DS_Store 
-**/tmp_* -docs/_site/** -docs/.sass-cache/** -tst_shortcut_sd/ -src/*/resources/*.classes -/word_segmenter_metrics/ -/special_class.ser -.bsp/sbt.json -python/docs/_build/** -python/docs/reference/_autosummary/** From b8928fc7d8a2e43dd33c15785eb5c8ee82415c68 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Tue, 18 Jul 2023 16:21:47 +0200 Subject: [PATCH 6/8] Bump version to 5.0.1 [run doc] --- README.md | 88 +++++++++---------- build.sbt | 2 +- docs/_layouts/landing.html | 2 +- docs/en/concepts.md | 2 +- docs/en/examples.md | 4 +- docs/en/hardware_acceleration.md | 2 +- docs/en/install.md | 54 ++++++------ docs/en/spark_nlp.md | 2 +- examples/docker/README.md | 4 +- python/README.md | 88 +++++++++---------- python/docs/conf.py | 2 +- python/setup.py | 2 +- python/sparknlp/__init__.py | 4 +- scripts/colab_setup.sh | 2 +- scripts/kaggle_setup.sh | 2 +- scripts/sagemaker_setup.sh | 2 +- .../scala/com/johnsnowlabs/nlp/SparkNLP.scala | 2 +- .../scala/com/johnsnowlabs/util/Build.scala | 2 +- 18 files changed, 133 insertions(+), 133 deletions(-) diff --git a/README.md b/README.md index a002cf66f1c285..57c4ce0793a35a 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 5.0.0 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.1 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. 
The minimum following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -183,7 +183,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.0 pyspark==3.3.1 +$ pip install spark-nlp==5.0.1 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -228,7 +228,7 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *5.0.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x +Spark NLP *5.0.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x | Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -267,7 +267,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 5.0.0 has been tested and is compatible with the following runtimes: +Spark NLP 5.0.1 has been tested and is compatible with the following runtimes: **CPU:** @@ -325,7 +325,7 @@ Spark NLP 5.0.0 has been tested and is compatible with the following runtimes: ## EMR Support -Spark NLP 5.0.0 has been tested and is compatible with the following EMR releases: +Spark NLP 5.0.1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -369,11 +369,11 @@ Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 
-pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` The `spark-nlp` has been published to @@ -382,11 +382,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1 ``` @@ -396,11 +396,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.1 ``` @@ -410,11 +410,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.1 ``` @@ -428,7 +428,7 @@ set in 
your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` ## Scala @@ -446,7 +446,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp_2.12 - 5.0.0 + 5.0.1 ``` @@ -457,7 +457,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.0.0 + 5.0.1 ``` @@ -468,7 +468,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.0.0 + 5.0.1 ``` @@ -479,7 +479,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.0.0 + 5.0.1 ``` @@ -489,28 +489,28 @@ coordinates: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.1" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.1" ``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.1" ``` **spark-nlp-silicon:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.1" ``` Maven @@ -532,7 +532,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==5.0.0 +pip install spark-nlp==5.0.1 ``` Conda: @@ -561,7 +561,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") 
.config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1") .getOrCreate() ``` @@ -632,7 +632,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -643,7 +643,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==5.0.0 +pip install spark-nlp==5.0.1 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -671,7 +671,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.0 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.0.1 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -688,7 +688,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -715,7 +715,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.1 ``` [Spark NLP quick start on Google 
Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -738,7 +738,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.1 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -757,9 +757,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==5.0.0` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.0.1` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! 
@@ -810,7 +810,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1" } }] ``` @@ -819,7 +819,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 5.0.0" \ +--name "Spark NLP 5.0.1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -883,7 +883,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. 
@@ -922,7 +922,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1") .getOrCreate() ``` @@ -936,7 +936,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` **pyspark:** @@ -949,7 +949,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` **Databricks:** @@ -1221,7 +1221,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.0.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.1.jar") .getOrCreate() ``` @@ -1230,7 +1230,7 @@ spark = SparkSession.builder version (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.0.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/build.sbt b/build.sbt index c9e37ecd4a699e..2fdac1c421cc55 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "5.0.0" +version := "5.0.1" (ThisBuild / scalaVersion) := scalaVer diff --git a/docs/_layouts/landing.html b/docs/_layouts/landing.html index ad8d347e9edffb..654d6642bf3c8d 100755 --- a/docs/_layouts/landing.html +++ b/docs/_layouts/landing.html @@ -201,7 +201,7 @@

{{ _section.title }}

{% highlight bash %} # Using PyPI - $ pip install spark-nlp==4.4.4 + $ pip install spark-nlp==5.0.1 # Using Anaconda/Conda $ conda install -c johnsnowlabs spark-nlp diff --git a/docs/en/concepts.md b/docs/en/concepts.md index fa58cbf7181663..07d5c17121d3a7 100644 --- a/docs/en/concepts.md +++ b/docs/en/concepts.md @@ -62,7 +62,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.0 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.0.1 pyspark==3.3.1 jupyter $ jupyter notebook ``` diff --git a/docs/en/examples.md b/docs/en/examples.md index 4a321bb4fb0bdb..bf097b741e9ff1 100644 --- a/docs/en/examples.md +++ b/docs/en/examples.md @@ -16,7 +16,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp -$ pip install spark-nlp==5.0.0 pyspark==3.3.1 +$ pip install spark-nlp==5.0.1 pyspark==3.3.1 ``` ## Google Colab Notebook @@ -36,7 +36,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!bash colab.sh -p 3.2.3 -s 5.0.0 +!bash colab.sh -p 3.2.3 -s 5.0.1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. 
diff --git a/docs/en/hardware_acceleration.md b/docs/en/hardware_acceleration.md index 82b6ffc3040a81..1bc6ada008fdb8 100644 --- a/docs/en/hardware_acceleration.md +++ b/docs/en/hardware_acceleration.md @@ -49,7 +49,7 @@ Since the new Transformer models such as BERT for Word and Sentence embeddings a | DeBERTa Large | +477%(5.8x) | | Longformer Base | +52%(1.5x) | -Spark NLP 5.0.0 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/docs/en/install.md b/docs/en/install.md index 0190c1d2047272..78445a811a67a8 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -15,22 +15,22 @@ sidebar: ```bash # Install Spark NLP from PyPI -pip install spark-nlp==5.0.0 +pip install spark-nlp==5.0.1 # Install Spark NLP from Anacodna/Conda conda install -c johnsnowlabs spark-nlp # Load Spark NLP with Spark Shell -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 # Load Spark NLP with PySpark -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 # Load Spark NLP with Spark Submit -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 # Load Spark NLP as external JAR after compiling and building Spark NLP by `sbt assembly` -spark-shell --jars spark-nlp-assembly-5.0.0.jar +spark-shell --jars spark-nlp-assembly-5.0.1.jar ``` ## Python @@ -49,7 +49,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp -$ pip install spark-nlp==5.0.0 pyspark==3.3.1 +$ pip install spark-nlp==5.0.1 pyspark==3.3.1 ``` Of course you will need to have jupyter 
installed in your system: @@ -76,7 +76,7 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0")\ + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1")\ .getOrCreate() ``` @@ -91,7 +91,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp_2.12 - 5.0.0 + 5.0.1 ``` @@ -102,7 +102,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.0.0 + 5.0.1 ``` @@ -113,7 +113,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.0.0 + 5.0.1 ``` @@ -124,7 +124,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.0.0 + 5.0.1 ``` @@ -134,28 +134,28 @@ spark = SparkSession.builder \ ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.1" ``` **spark-nlp-gpu:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.1" ``` **spark-nlp-silicon:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.1" ``` **spark-nlp-aarch64:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.1" ``` Maven Central: 
[https://mvnrepository.com/artifact/com.johnsnowlabs.nlp](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp) @@ -233,7 +233,7 @@ maven coordinates like these: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.0.0 + 5.0.1 ``` @@ -241,7 +241,7 @@ or in case of sbt: ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.1" ``` If everything went well, you can now start Spark NLP with the `m1` flag set to `true`: @@ -274,7 +274,7 @@ spark = sparknlp.start(apple_silicon=True) ## Installation for Linux Aarch64 Systems -Starting from version 5.0.0, Spark NLP supports Linux systems running on an aarch64 +Starting from version 5.0.1, Spark NLP supports Linux systems running on an aarch64 processor architecture. The necessary dependencies have been built on Ubuntu 16.04, so a recent system with an environment of at least that will be needed. @@ -318,7 +318,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0 +!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. @@ -337,7 +337,7 @@ Run the following code in Kaggle Kernel and start using spark-nlp right away. 
## Databricks Support -Spark NLP 5.0.0 has been tested and is compatible with the following runtimes: +Spark NLP 5.0.1 has been tested and is compatible with the following runtimes: **CPU:** @@ -403,7 +403,7 @@ NOTE: Spark NLP 4.0.x is based on TensorFlow 2.7.x which is compatible with CUDA 3.1. Install New -> PyPI -> `spark-nlp` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -419,7 +419,7 @@ Note: You can import these notebooks by using their URLs. ## EMR Support -Spark NLP 5.0.0 has been tested and is compatible with the following EMR releases: +Spark NLP 5.0.1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -477,7 +477,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1" } } ] @@ -487,7 +487,7 @@ A sample of AWS CLI to launch EMR cluster: ```sh aws emr create-cluster \ ---name "Spark NLP 5.0.0" \ +--name "Spark NLP 5.0.1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -741,7 +741,7 @@ We recommend using `conda` to manage your Python environment on Windows. Now you can use the downloaded binary by navigating to `%SPARK_HOME%\bin` and running -Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0*. 
+Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1*. @@ -767,12 +767,12 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.0.jar")\ + .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.1.jar")\ .getOrCreate() ``` - You can download provided Fat JARs from each [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases), please pay attention to pick the one that suits your environment depending on the device (CPU/GPU) and Apache Spark version (3.x) -- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.0.jar`) +- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/docs/en/spark_nlp.md b/docs/en/spark_nlp.md index 5d0a07bc7bd110..a5791610509f45 100644 --- a/docs/en/spark_nlp.md +++ b/docs/en/spark_nlp.md @@ -25,7 +25,7 @@ Spark NLP is built on top of **Apache Spark 3.x**. 
For using Spark NLP you need: **GPU (optional):** -Spark NLP 5.0.0 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/examples/docker/README.md b/examples/docker/README.md index bdf00be0a55508..4102ca6facddf3 100644 --- a/examples/docker/README.md +++ b/examples/docker/README.md @@ -73,7 +73,7 @@ docker run -it --name sparknlp-container \ --conf "spark.serializer"="org.apache.spark.serializer.KryoSerializer" \ --conf "spark.kryoserializer.buffer.max"="2000M" \ --conf "spark.driver.maxResultSize"="0" \ - --packages "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0" + --packages "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1" ``` To run the shell with GPU support, we use the image from [Jupyter Notebook with GPU @@ -91,5 +91,5 @@ docker run -it --name sparknlp-container \ --conf "spark.serializer"="org.apache.spark.serializer.KryoSerializer" \ --conf "spark.kryoserializer.buffer.max"="2000M" \ --conf "spark.driver.maxResultSize"="0" \ - --packages "com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0" + --packages "com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1" ``` diff --git a/python/README.md b/python/README.md index a002cf66f1c285..57c4ce0793a35a 100644 --- a/python/README.md +++ b/python/README.md @@ -167,7 +167,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 5.0.0 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.1 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. 
The minimum following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -183,7 +183,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.0 pyspark==3.3.1 +$ pip install spark-nlp==5.0.1 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -228,7 +228,7 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *5.0.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x +Spark NLP *5.0.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x | Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -267,7 +267,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 5.0.0 has been tested and is compatible with the following runtimes: +Spark NLP 5.0.1 has been tested and is compatible with the following runtimes: **CPU:** @@ -325,7 +325,7 @@ Spark NLP 5.0.0 has been tested and is compatible with the following runtimes: ## EMR Support -Spark NLP 5.0.0 has been tested and is compatible with the following EMR releases: +Spark NLP 5.0.1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -369,11 +369,11 @@ Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 
-pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` The `spark-nlp` has been published to @@ -382,11 +382,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1 ``` @@ -396,11 +396,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.1 ``` @@ -410,11 +410,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.1 ``` @@ -428,7 +428,7 @@ set in 
your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` ## Scala @@ -446,7 +446,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp_2.12 - 5.0.0 + 5.0.1 ``` @@ -457,7 +457,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.0.0 + 5.0.1 ``` @@ -468,7 +468,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.0.0 + 5.0.1 ``` @@ -479,7 +479,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.0.0 + 5.0.1 ``` @@ -489,28 +489,28 @@ coordinates: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.1" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.1" ``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.1" ``` **spark-nlp-silicon:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.1" ``` Maven @@ -532,7 +532,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==5.0.0 +pip install spark-nlp==5.0.1 ``` Conda: @@ -561,7 +561,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") 
.config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1") .getOrCreate() ``` @@ -632,7 +632,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -643,7 +643,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==5.0.0 +pip install spark-nlp==5.0.1 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -671,7 +671,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.0 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.0.1 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -688,7 +688,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -715,7 +715,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.1 ``` [Spark NLP quick start on Google 
Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -738,7 +738,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.1 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -757,9 +757,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==5.0.0` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.0.1` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! 
@@ -810,7 +810,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1" } }] ``` @@ -819,7 +819,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 5.0.0" \ +--name "Spark NLP 5.0.1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -883,7 +883,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. 
@@ -922,7 +922,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1") .getOrCreate() ``` @@ -936,7 +936,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` **pyspark:** @@ -949,7 +949,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 ``` **Databricks:** @@ -1221,7 +1221,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.0.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.1.jar") .getOrCreate() ``` @@ -1230,7 +1230,7 @@ spark = SparkSession.builder version (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.0.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/python/docs/conf.py b/python/docs/conf.py index faddea1aa5c952..0466935bd8fd5c 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -23,7 +23,7 @@ author = "John Snow Labs" # The full version, including alpha/beta/rc tags -release = "5.0.0" +release = "5.0.1" pyspark_version = "3.2.3" # -- General configuration --------------------------------------------------- diff --git a/python/setup.py b/python/setup.py index a4c0594f6ce900..b9a652fc533069 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,7 @@ # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='5.0.0', # Required + version='5.0.1', # Required # This is a one-line description or tagline of what your project does. This # corresponds to the 'Summary' metadata field: diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index 5685670314fcb8..f3a6423ba28940 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -128,7 +128,7 @@ def start(gpu=False, The initiated Spark session. """ - current_version = "5.0.0" + current_version = "5.0.1" if params is None: params = {} @@ -309,4 +309,4 @@ def version(): str The current Spark NLP version. 
""" - return '5.0.0' + return '5.0.1' diff --git a/scripts/colab_setup.sh b/scripts/colab_setup.sh index 09d14bb884778a..b8b4b3f91c983d 100644 --- a/scripts/colab_setup.sh +++ b/scripts/colab_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.0.0" +SPARKNLP="5.0.1" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/kaggle_setup.sh b/scripts/kaggle_setup.sh index 30c3041c84018b..3b198d009c7250 100644 --- a/scripts/kaggle_setup.sh +++ b/scripts/kaggle_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.0.0" +SPARKNLP="5.0.1" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/sagemaker_setup.sh b/scripts/sagemaker_setup.sh index a8839055c6387f..fb7318a212ac08 100644 --- a/scripts/sagemaker_setup.sh +++ b/scripts/sagemaker_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash # Default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.0.0" +SPARKNLP="5.0.1" PYSPARK="3.2.3" echo "Setup SageMaker for PySpark $PYSPARK and Spark NLP $SPARKNLP" diff --git a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala index 73dbedef2002cc..5746ae9d1e07fc 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala @@ -20,7 +20,7 @@ import org.apache.spark.sql.SparkSession object SparkNLP { - val currentVersion = "5.0.0" + val currentVersion = "5.0.1" val MavenSpark3 = s"com.johnsnowlabs.nlp:spark-nlp_2.12:$currentVersion" val MavenGpuSpark3 = s"com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:$currentVersion" val MavenSparkSilicon = s"com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:$currentVersion" diff --git a/src/main/scala/com/johnsnowlabs/util/Build.scala b/src/main/scala/com/johnsnowlabs/util/Build.scala index 0e1254219d5164..38c1c2fae724a1 100644 --- a/src/main/scala/com/johnsnowlabs/util/Build.scala +++ b/src/main/scala/com/johnsnowlabs/util/Build.scala @@ 
-17,5 +17,5 @@ package com.johnsnowlabs.util object Build { - val version: String = "5.0.0" + val version: String = "5.0.1" } From 4130ee899303cf31463e7555c030ebd4a1a9059f Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 18 Jul 2023 14:32:41 +0000 Subject: [PATCH 7/8] Update Scala and Python APIs --- docs/api/com/index.html | 8 +++---- .../johnsnowlabs/client/CredentialParams.html | 8 +++---- .../client/aws/AWSAnonymousCredentials.html | 8 +++---- .../client/aws/AWSBasicCredentials.html | 8 +++---- .../client/aws/AWSCredentialsProvider.html | 8 +++---- .../johnsnowlabs/client/aws/AWSGateway.html | 8 +++---- .../client/aws/AWSProfileCredentials.html | 8 +++---- .../client/aws/AWSTokenCredentials.html | 8 +++---- .../johnsnowlabs/client/aws/Credentials.html | 8 +++---- .../com/johnsnowlabs/client/aws/index.html | 8 +++---- .../johnsnowlabs/client/gcp/GCPGateway.html | 8 +++---- .../com/johnsnowlabs/client/gcp/index.html | 8 +++---- docs/api/com/johnsnowlabs/client/index.html | 8 +++---- .../johnsnowlabs/collections/SearchTrie$.html | 8 +++---- .../johnsnowlabs/collections/SearchTrie.html | 8 +++---- .../collections/StorageSearchTrie$.html | 8 +++---- .../collections/StorageSearchTrie.html | 8 +++---- .../com/johnsnowlabs/collections/index.html | 8 +++---- docs/api/com/johnsnowlabs/index.html | 8 +++---- docs/api/com/johnsnowlabs/ml/ai/DeBerta.html | 8 +++---- .../ml/ai/MergeTokenStrategy$.html | 8 +++---- docs/api/com/johnsnowlabs/ml/ai/index.html | 8 +++---- .../ml/ai/util/Generation/Generate.html | 8 +++---- .../ml/ai/util/Generation/Logit/Logit.html | 8 +++---- .../Logit/LogitProcess/LogitProcessor.html | 8 +++---- .../LogitProcess/MinLengthLogitProcessor.html | 8 +++---- .../NoRepeatNgramsLogitProcessor.html | 8 +++---- .../RepetitionPenaltyLogitProcessor.html | 8 +++---- .../Generation/Logit/LogitProcess/index.html | 8 +++---- .../Generation/Logit/LogitProcessorList.html | 8 +++---- .../Logit/LogitWarper/LogitWarper.html | 8 +++---- 
.../LogitWarper/TemperatureLogitWarper.html | 8 +++---- .../Logit/LogitWarper/TopKLogitWarper.html | 8 +++---- .../Logit/LogitWarper/TopPLogitWarper.html | 8 +++---- .../Generation/Logit/LogitWarper/index.html | 8 +++---- .../ml/ai/util/Generation/Logit/index.html | 8 +++---- .../Generation/Search/BeamHypotheses.html | 8 +++---- .../ai/util/Generation/Search/BeamScorer.html | 8 +++---- .../Generation/Search/BeamSearchScorer.html | 8 +++---- .../ml/ai/util/Generation/Search/index.html | 8 +++---- .../ml/ai/util/Generation/index.html | 8 +++---- .../com/johnsnowlabs/ml/ai/util/index.html | 8 +++---- docs/api/com/johnsnowlabs/ml/crf/Attr.html | 8 +++---- .../com/johnsnowlabs/ml/crf/AttrFeature.html | 8 +++---- .../api/com/johnsnowlabs/ml/crf/AttrStat.html | 8 +++---- .../com/johnsnowlabs/ml/crf/CrfDataset.html | 8 +++---- .../com/johnsnowlabs/ml/crf/CrfParams.html | 8 +++---- .../johnsnowlabs/ml/crf/DatasetEncoder.html | 8 +++---- .../johnsnowlabs/ml/crf/DatasetMetadata.html | 8 +++---- .../johnsnowlabs/ml/crf/DatasetReader$.html | 8 +++---- .../johnsnowlabs/ml/crf/EdgeCalculator$.html | 8 +++---- .../com/johnsnowlabs/ml/crf/FbCalculator.html | 8 +++---- .../api/com/johnsnowlabs/ml/crf/Instance.html | 8 +++---- .../johnsnowlabs/ml/crf/InstanceLabels.html | 8 +++---- .../johnsnowlabs/ml/crf/L2DecayStrategy.html | 8 +++---- .../johnsnowlabs/ml/crf/LinearChainCrf.html | 8 +++---- .../ml/crf/LinearChainCrfModel.html | 8 +++---- .../ml/crf/SerializedDatasetMetadata.html | 8 +++---- .../ml/crf/SerializedLinearChainCrfModel.html | 8 +++---- .../ml/crf/SparseArray$$SeqWrapper.html | 8 +++---- .../com/johnsnowlabs/ml/crf/SparseArray$.html | 8 +++---- .../com/johnsnowlabs/ml/crf/SparseArray.html | 8 +++---- .../ml/crf/TextSentenceAttrs.html | 8 +++---- .../ml/crf/TextSentenceLabels.html | 8 +++---- .../com/johnsnowlabs/ml/crf/Transition.html | 8 +++---- .../com/johnsnowlabs/ml/crf/VectorMath$.html | 8 +++---- .../com/johnsnowlabs/ml/crf/WordAttrs.html | 8 +++---- 
docs/api/com/johnsnowlabs/ml/crf/index.html | 8 +++---- docs/api/com/johnsnowlabs/ml/index.html | 8 +++---- .../johnsnowlabs/ml/onnx/OnnxWrapper$.html | 8 +++---- .../com/johnsnowlabs/ml/onnx/OnnxWrapper.html | 8 +++---- .../johnsnowlabs/ml/onnx/ReadOnnxModel.html | 8 +++---- .../johnsnowlabs/ml/onnx/WriteOnnxModel.html | 8 +++---- docs/api/com/johnsnowlabs/ml/onnx/index.html | 8 +++---- .../tensorflow/ClassifierDatasetEncoder.html | 8 +++---- .../ClassifierDatasetEncoderParams.html | 8 +++---- .../ml/tensorflow/DatasetEncoderParams.html | 8 +++---- .../johnsnowlabs/ml/tensorflow/Logging.html | 8 +++---- .../ml/tensorflow/ModelSignature.html | 8 +++---- .../johnsnowlabs/ml/tensorflow/NerBatch$.html | 8 +++---- .../johnsnowlabs/ml/tensorflow/NerBatch.html | 8 +++---- .../ml/tensorflow/NerDatasetEncoder.html | 8 +++---- .../ml/tensorflow/ReadTensorflowModel.html | 8 +++---- .../ml/tensorflow/SentenceGrouper.html | 8 +++---- .../ml/tensorflow/TensorResources$.html | 8 +++---- .../ml/tensorflow/TensorResources.html | 8 +++---- .../ml/tensorflow/TensorflowClassifier.html | 8 +++---- .../ml/tensorflow/TensorflowWrapper$.html | 8 +++---- .../ml/tensorflow/TensorflowWrapper.html | 8 +++---- .../johnsnowlabs/ml/tensorflow/Variables.html | 8 +++---- .../ml/tensorflow/WriteTensorflowModel.html | 8 +++---- .../com/johnsnowlabs/ml/tensorflow/index.html | 8 +++---- .../sentencepiece/ReadSentencePieceModel.html | 8 +++---- .../sentencepiece/SentencePieceException.html | 8 +++---- .../sentencepiece/SentencePieceProcessor.html | 8 +++---- .../sentencepiece/SentencePieceWrapper$.html | 8 +++---- .../WriteSentencePieceModel.html | 8 +++---- .../ml/tensorflow/sentencepiece/index.html | 8 +++---- ...delSignatureConstants$$AttentionMask$.html | 8 +++---- ...lSignatureConstants$$AttentionMaskV1$.html | 8 +++---- ...SignatureConstants$$AudioValuesInput$.html | 8 +++---- ...s$$CachedDecoderEncoderAttentionMask$.html | 8 +++---- ...stants$$CachedDecoderEncoderInputIds$.html | 8 +++---- 
...eConstants$$CachedDecoderInputCache1$.html | 8 +++---- ...eConstants$$CachedDecoderInputCache2$.html | 8 +++---- ...tureConstants$$CachedDecoderInputIds$.html | 8 +++---- ...natureConstants$$CachedEncoderOutput$.html | 8 +++---- ...gnatureConstants$$CachedLogitsOutput$.html | 8 +++---- ...delSignatureConstants$$CachedOutPut2$.html | 8 +++---- ...delSignatureConstants$$CachedOutput1$.html | 8 +++---- .../sign/ModelSignatureConstants$$DType$.html | 8 +++---- ...atureConstants$$DecoderAttentionMask$.html | 8 +++---- ...nstants$$DecoderEncoderAttentionMask$.html | 8 +++---- ...ureConstants$$DecoderEncoderInputIds$.html | 8 +++---- ...lSignatureConstants$$DecoderInputIds$.html | 8 +++---- ...delSignatureConstants$$DecoderOutput$.html | 8 +++---- .../ModelSignatureConstants$$DimCount$.html | 8 +++---- ...atureConstants$$EncoderAttentionMask$.html | 8 +++---- ...gnatureConstants$$EncoderContextMask$.html | 8 +++---- ...lSignatureConstants$$EncoderInputIds$.html | 8 +++---- ...delSignatureConstants$$EncoderOutput$.html | 8 +++---- ...lSignatureConstants$$EndLogitsOutput$.html | 8 +++---- ...ignatureConstants$$InitCachedOutPut2$.html | 8 +++---- ...ignatureConstants$$InitCachedOutput1$.html | 8 +++---- ...nts$$InitDecoderEncoderAttentionMask$.html | 8 +++---- ...onstants$$InitDecoderEncoderInputIds$.html | 8 +++---- ...natureConstants$$InitDecoderInputIds$.html | 8 +++---- ...SignatureConstants$$InitLogitsOutput$.html | 8 +++---- .../ModelSignatureConstants$$InputIds$.html | 8 +++---- .../ModelSignatureConstants$$InputIdsV1$.html | 8 +++---- ...lSignatureConstants$$LastHiddenState$.html | 8 +++---- ...ignatureConstants$$LastHiddenStateV1$.html | 8 +++---- ...odelSignatureConstants$$LogitsOutput$.html | 8 +++---- .../sign/ModelSignatureConstants$$Name$.html | 8 +++---- ...SignatureConstants$$PixelValuesInput$.html | 8 +++---- ...odelSignatureConstants$$PoolerOutput$.html | 8 +++---- ...elSignatureConstants$$PoolerOutputV1$.html | 8 +++---- 
...elSignatureConstants$$SerializedSize$.html | 8 +++---- ...odelSignatureConstants$$ShapeDimList$.html | 8 +++---- ...ignatureConstants$$StartLogitsOutput$.html | 8 +++---- ...lSignatureConstants$$TFInfoDescriptor.html | 8 +++---- ...lSignatureConstants$$TFInfoNameMapper.html | 8 +++---- ...stants$$TapasLogitsAggregationOutput$.html | 8 +++---- ...ignatureConstants$$TapasLogitsOutput$.html | 8 +++---- ...odelSignatureConstants$$TokenTypeIds$.html | 8 +++---- ...elSignatureConstants$$TokenTypeIdsV1$.html | 8 +++---- .../sign/ModelSignatureConstants$.html | 8 +++---- .../sign/ModelSignatureManager$.html | 8 +++---- .../ml/tensorflow/sign/index.html | 8 +++---- .../ml/util/LoadExternalModel$.html | 8 +++---- .../com/johnsnowlabs/ml/util/ModelArch$.html | 8 +++---- .../com/johnsnowlabs/ml/util/ModelEngine.html | 8 +++---- docs/api/com/johnsnowlabs/ml/util/ONNX$.html | 8 +++---- .../com/johnsnowlabs/ml/util/PyTorch$.html | 8 +++---- .../com/johnsnowlabs/ml/util/TensorFlow$.html | 8 +++---- .../com/johnsnowlabs/ml/util/Unknown$.html | 8 +++---- docs/api/com/johnsnowlabs/ml/util/index.html | 8 +++---- .../johnsnowlabs/nlp/ActivationFunction$.html | 8 +++---- .../nlp/Annotation$$AnnotationContainer.html | 8 +++---- ...nnotation$$extractors$$AnnotationData.html | 8 +++---- .../nlp/Annotation$$extractors$.html | 8 +++---- .../api/com/johnsnowlabs/nlp/Annotation$.html | 8 +++---- docs/api/com/johnsnowlabs/nlp/Annotation.html | 8 +++---- .../AnnotationAudio$$AnnotationContainer.html | 8 +++---- .../nlp/AnnotationAudio$$AudioFields.html | 8 +++---- .../johnsnowlabs/nlp/AnnotationAudio$.html | 8 +++---- .../com/johnsnowlabs/nlp/AnnotationAudio.html | 8 +++---- .../AnnotationImage$$AnnotationContainer.html | 8 +++---- .../nlp/AnnotationImage$$ImageFields.html | 8 +++---- .../johnsnowlabs/nlp/AnnotationImage$.html | 8 +++---- .../com/johnsnowlabs/nlp/AnnotationImage.html | 8 +++---- .../johnsnowlabs/nlp/AnnotatorApproach.html | 8 +++---- 
.../com/johnsnowlabs/nlp/AnnotatorModel.html | 8 +++---- .../com/johnsnowlabs/nlp/AnnotatorType$.html | 8 +++---- .../com/johnsnowlabs/nlp/AudioAssembler$.html | 8 +++---- .../com/johnsnowlabs/nlp/AudioAssembler.html | 8 +++---- docs/api/com/johnsnowlabs/nlp/CanBeLazy.html | 8 +++---- docs/api/com/johnsnowlabs/nlp/Doc2Chunk$.html | 8 +++---- docs/api/com/johnsnowlabs/nlp/Doc2Chunk.html | 8 +++---- .../johnsnowlabs/nlp/DocumentAssembler$.html | 8 +++---- .../johnsnowlabs/nlp/DocumentAssembler.html | 8 +++---- .../johnsnowlabs/nlp/EmbeddingsFinisher$.html | 8 +++---- .../johnsnowlabs/nlp/EmbeddingsFinisher.html | 8 +++---- .../com/johnsnowlabs/nlp/FeaturesReader.html | 8 +++---- .../com/johnsnowlabs/nlp/FeaturesWriter.html | 8 +++---- docs/api/com/johnsnowlabs/nlp/Finisher$.html | 8 +++---- docs/api/com/johnsnowlabs/nlp/Finisher.html | 8 +++---- .../com/johnsnowlabs/nlp/GraphFinisher.html | 8 +++---- .../nlp/HasAudioFeatureProperties.html | 8 +++---- .../johnsnowlabs/nlp/HasBatchedAnnotate.html | 8 +++---- .../nlp/HasBatchedAnnotateAudio.html | 8 +++---- .../nlp/HasBatchedAnnotateImage.html | 8 +++---- .../nlp/HasCandidateLabelsProperties.html | 8 +++---- .../nlp/HasCaseSensitiveProperties.html | 8 +++---- .../HasClassifierActivationProperties.html | 8 +++---- .../nlp/HasEnableCachingProperties.html | 8 +++---- docs/api/com/johnsnowlabs/nlp/HasEngine.html | 8 +++---- .../api/com/johnsnowlabs/nlp/HasFeatures.html | 8 +++---- .../nlp/HasImageFeatureProperties.html | 8 +++---- .../nlp/HasInputAnnotationCols.html | 8 +++---- .../nlp/HasMultipleInputAnnotationCols.html | 8 +++---- .../nlp/HasOutputAnnotationCol.html | 8 +++---- .../nlp/HasOutputAnnotatorType.html | 8 +++---- .../com/johnsnowlabs/nlp/HasPretrained.html | 8 +++---- .../HasProtectedParams$ProtectedParam.html | 8 +++---- .../johnsnowlabs/nlp/HasProtectedParams.html | 8 +++---- .../com/johnsnowlabs/nlp/HasRecursiveFit.html | 8 +++---- .../nlp/HasRecursiveTransform.html | 8 +++---- 
.../johnsnowlabs/nlp/HasSimpleAnnotate.html | 8 +++---- .../api/com/johnsnowlabs/nlp/IAnnotation.html | 8 +++---- .../com/johnsnowlabs/nlp/ImageAssembler$.html | 8 +++---- .../com/johnsnowlabs/nlp/ImageAssembler.html | 8 +++---- .../com/johnsnowlabs/nlp/JavaAnnotation.html | 8 +++---- .../com/johnsnowlabs/nlp/LightPipeline.html | 8 +++---- .../nlp/MultiDocumentAssembler$.html | 8 +++---- .../nlp/MultiDocumentAssembler.html | 8 +++---- .../nlp/ParamsAndFeaturesReadable.html | 8 +++---- .../nlp/ParamsAndFeaturesWritable.html | 8 +++---- .../com/johnsnowlabs/nlp/RawAnnotator.html | 8 +++---- .../johnsnowlabs/nlp/RecursivePipeline.html | 8 +++---- .../nlp/RecursivePipelineModel.html | 8 +++---- docs/api/com/johnsnowlabs/nlp/SparkNLP$.html | 8 +++---- .../com/johnsnowlabs/nlp/TableAssembler$.html | 8 +++---- .../com/johnsnowlabs/nlp/TableAssembler.html | 8 +++---- .../com/johnsnowlabs/nlp/TokenAssembler$.html | 8 +++---- .../com/johnsnowlabs/nlp/TokenAssembler.html | 8 +++---- .../nlp/annotators/Chunk2Doc$.html | 8 +++---- .../nlp/annotators/Chunk2Doc.html | 8 +++---- .../nlp/annotators/ChunkTokenizer$.html | 8 +++---- .../nlp/annotators/ChunkTokenizer.html | 8 +++---- .../nlp/annotators/ChunkTokenizerModel$.html | 8 +++---- .../nlp/annotators/ChunkTokenizerModel.html | 8 +++---- .../johnsnowlabs/nlp/annotators/Chunker$.html | 8 +++---- .../johnsnowlabs/nlp/annotators/Chunker.html | 8 +++---- .../nlp/annotators/Date2Chunk$.html | 8 +++---- .../nlp/annotators/Date2Chunk.html | 8 +++---- .../nlp/annotators/DateMatcher$.html | 8 +++---- .../nlp/annotators/DateMatcher.html | 8 +++---- .../nlp/annotators/DateMatcherTranslator.html | 8 +++---- .../DateMatcherTranslatorPolicy.html | 8 +++---- .../nlp/annotators/DateMatcherUtils.html | 8 +++---- .../nlp/annotators/DocumentNormalizer$.html | 8 +++---- .../nlp/annotators/DocumentNormalizer.html | 8 +++---- .../nlp/annotators/EnglishStemmer$.html | 8 +++---- .../nlp/annotators/GraphExtraction.html | 8 +++---- 
.../nlp/annotators/Lemmatizer$.html | 8 +++---- .../nlp/annotators/Lemmatizer.html | 8 +++---- .../nlp/annotators/LemmatizerModel$.html | 8 +++---- .../nlp/annotators/LemmatizerModel.html | 8 +++---- .../nlp/annotators/LookAroundManager$.html | 8 +++---- .../nlp/annotators/MultiDateMatcher$.html | 8 +++---- .../nlp/annotators/MultiDateMatcher.html | 8 +++---- .../nlp/annotators/MultiDatePolicy$.html | 8 +++---- .../nlp/annotators/NGramGenerator$.html | 8 +++---- .../nlp/annotators/NGramGenerator.html | 8 +++---- .../nlp/annotators/Normalizer$.html | 8 +++---- .../nlp/annotators/Normalizer.html | 8 +++---- .../nlp/annotators/NormalizerModel$.html | 8 +++---- ...alizerModel$TokenizerAndNormalizerMap.html | 8 +++---- .../nlp/annotators/NormalizerModel.html | 8 +++---- .../annotators/PretrainedAnnotations$.html | 8 +++---- .../ReadablePretrainedLemmatizer.html | 8 +++---- ...adablePretrainedStopWordsCleanerModel.html | 8 +++---- .../ReadablePretrainedTextMatcher.html | 8 +++---- .../ReadablePretrainedTokenizer.html | 8 +++---- .../nlp/annotators/RecursiveTokenizer.html | 8 +++---- .../annotators/RecursiveTokenizerModel$.html | 8 +++---- .../annotators/RecursiveTokenizerModel.html | 8 +++---- .../nlp/annotators/RegexMatcher$.html | 8 +++---- .../nlp/annotators/RegexMatcher.html | 8 +++---- .../nlp/annotators/RegexMatcherModel$.html | 8 +++---- .../nlp/annotators/RegexMatcherModel.html | 8 +++---- .../nlp/annotators/RegexTokenizer$.html | 8 +++---- .../nlp/annotators/RegexTokenizer.html | 8 +++---- .../nlp/annotators/SingleDatePolicy$.html | 8 +++---- .../johnsnowlabs/nlp/annotators/Stemmer$.html | 8 +++---- .../johnsnowlabs/nlp/annotators/Stemmer.html | 8 +++---- .../nlp/annotators/StopWordsCleaner$.html | 8 +++---- .../nlp/annotators/StopWordsCleaner.html | 8 +++---- .../nlp/annotators/TextMatcher$.html | 8 +++---- .../nlp/annotators/TextMatcher.html | 8 +++---- .../nlp/annotators/TextMatcherModel$.html | 8 +++---- .../nlp/annotators/TextMatcherModel.html | 8 +++---- 
.../nlp/annotators/Token2Chunk$.html | 8 +++---- .../nlp/annotators/Token2Chunk.html | 8 +++---- .../nlp/annotators/Tokenizer$.html | 8 +++---- .../nlp/annotators/Tokenizer.html | 8 +++---- .../nlp/annotators/TokenizerModel$.html | 8 +++---- .../nlp/annotators/TokenizerModel.html | 8 +++---- .../nlp/annotators/audio/HubertForCTC$.html | 8 +++---- .../nlp/annotators/audio/HubertForCTC.html | 8 +++---- .../audio/ReadHubertForAudioDLModel.html | 8 +++---- .../audio/ReadWav2Vec2ForAudioDLModel.html | 8 +++---- ...ReadablePretrainedHubertForAudioModel.html | 8 +++---- ...adablePretrainedWav2Vec2ForAudioModel.html | 8 +++---- .../nlp/annotators/audio/Wav2Vec2ForCTC$.html | 8 +++---- .../nlp/annotators/audio/Wav2Vec2ForCTC.html | 8 +++---- .../nlp/annotators/audio/index.html | 8 +++---- .../nlp/annotators/btm/BigTextMatcher$.html | 8 +++---- .../nlp/annotators/btm/BigTextMatcher.html | 8 +++---- .../annotators/btm/BigTextMatcherModel$.html | 8 +++---- .../annotators/btm/BigTextMatcherModel.html | 8 +++---- .../btm/ReadablePretrainedBigTextMatcher.html | 8 +++---- .../nlp/annotators/btm/TMEdgesReadWriter.html | 8 +++---- .../nlp/annotators/btm/TMEdgesReader.html | 8 +++---- .../nlp/annotators/btm/TMNodesReader.html | 8 +++---- .../nlp/annotators/btm/TMNodesWriter.html | 8 +++---- .../nlp/annotators/btm/TMVocabReadWriter.html | 8 +++---- .../nlp/annotators/btm/TMVocabReader.html | 8 +++---- .../nlp/annotators/btm/TrieNode.html | 8 +++---- .../nlp/annotators/btm/index.html | 8 +++---- .../dl/AlbertForQuestionAnswering$.html | 8 +++---- .../dl/AlbertForQuestionAnswering.html | 8 +++---- .../dl/AlbertForSequenceClassification$.html | 8 +++---- .../dl/AlbertForSequenceClassification.html | 8 +++---- .../dl/AlbertForTokenClassification$.html | 8 +++---- .../dl/AlbertForTokenClassification.html | 8 +++---- .../dl/BertForQuestionAnswering$.html | 8 +++---- .../dl/BertForQuestionAnswering.html | 8 +++---- .../dl/BertForSequenceClassification$.html | 8 +++---- 
.../dl/BertForSequenceClassification.html | 8 +++---- .../dl/BertForTokenClassification$.html | 8 +++---- .../dl/BertForTokenClassification.html | 8 +++---- .../dl/BertForZeroShotClassification$.html | 8 +++---- .../dl/BertForZeroShotClassification.html | 8 +++---- .../dl/CamemBertForQuestionAnswering$.html | 8 +++---- .../dl/CamemBertForQuestionAnswering.html | 8 +++---- .../CamemBertForSequenceClassification$.html | 8 +++---- .../CamemBertForSequenceClassification.html | 8 +++---- .../dl/CamemBertForTokenClassification$.html | 8 +++---- .../dl/CamemBertForTokenClassification.html | 8 +++---- .../classifier/dl/ClassifierDLApproach$.html | 8 +++---- .../classifier/dl/ClassifierDLApproach.html | 8 +++---- .../classifier/dl/ClassifierDLModel$.html | 8 +++---- .../classifier/dl/ClassifierDLModel.html | 8 +++---- .../classifier/dl/ClassifierEncoder.html | 8 +++---- .../classifier/dl/ClassifierMetrics.html | 8 +++---- .../dl/DeBertaForQuestionAnswering$.html | 8 +++---- .../dl/DeBertaForQuestionAnswering.html | 8 +++---- .../dl/DeBertaForSequenceClassification$.html | 8 +++---- .../dl/DeBertaForSequenceClassification.html | 8 +++---- .../dl/DeBertaForTokenClassification$.html | 8 +++---- .../dl/DeBertaForTokenClassification.html | 8 +++---- .../dl/DistilBertForQuestionAnswering$.html | 8 +++---- .../dl/DistilBertForQuestionAnswering.html | 8 +++---- .../DistilBertForSequenceClassification$.html | 8 +++---- .../DistilBertForSequenceClassification.html | 8 +++---- .../dl/DistilBertForTokenClassification$.html | 8 +++---- .../dl/DistilBertForTokenClassification.html | 8 +++---- .../DistilBertForZeroShotClassification$.html | 8 +++---- .../DistilBertForZeroShotClassification.html | 8 +++---- .../dl/LongformerForQuestionAnswering$.html | 8 +++---- .../dl/LongformerForQuestionAnswering.html | 8 +++---- .../LongformerForSequenceClassification$.html | 8 +++---- .../LongformerForSequenceClassification.html | 8 +++---- .../dl/LongformerForTokenClassification$.html | 8 +++---- 
.../dl/LongformerForTokenClassification.html | 8 +++---- .../dl/MultiClassifierDLApproach.html | 8 +++---- .../dl/MultiClassifierDLModel$.html | 8 +++---- .../classifier/dl/MultiClassifierDLModel.html | 8 +++---- ...ReadAlbertForQuestionAnsweringDLModel.html | 8 +++---- .../dl/ReadAlbertForSequenceDLModel.html | 8 +++---- .../dl/ReadAlbertForTokenDLModel.html | 8 +++---- .../ReadBertForQuestionAnsweringDLModel.html | 8 +++---- .../dl/ReadBertForSequenceDLModel.html | 8 +++---- .../dl/ReadBertForTokenDLModel.html | 8 +++---- .../dl/ReadBertForZeroShotDLModel.html | 8 +++---- .../dl/ReadCamemBertForQADLModel.html | 8 +++---- .../dl/ReadCamemBertForSequenceDLModel.html | 8 +++---- .../dl/ReadCamemBertForTokenDLModel.html | 8 +++---- .../dl/ReadClassifierDLTensorflowModel.html | 8 +++---- ...eadDeBertaForQuestionAnsweringDLModel.html | 8 +++---- .../dl/ReadDeBertaForSequenceDLModel.html | 8 +++---- .../dl/ReadDeBertaForTokenDLModel.html | 8 +++---- ...DistilBertForQuestionAnsweringDLModel.html | 8 +++---- .../dl/ReadDistilBertForSequenceDLModel.html | 8 +++---- .../dl/ReadDistilBertForTokenDLModel.html | 8 +++---- .../dl/ReadDistilBertForZeroShotDLModel.html | 8 +++---- ...LongformerForQuestionAnsweringDLModel.html | 8 +++---- .../dl/ReadLongformerForSequenceDLModel.html | 8 +++---- .../dl/ReadLongformerForTokenDLModel.html | 8 +++---- .../ReadMultiClassifierDLTensorflowModel.html | 8 +++---- ...eadRoBertaForQuestionAnsweringDLModel.html | 8 +++---- .../dl/ReadRoBertaForSequenceDLModel.html | 8 +++---- .../dl/ReadRoBertaForTokenDLModel.html | 8 +++---- .../dl/ReadRoBertaForZeroShotDLModel.html | 8 +++---- .../dl/ReadSentimentDLTensorflowModel.html | 8 +++---- .../ReadTapasForQuestionAnsweringDLModel.html | 8 +++---- ...XlmRoBertaForQuestionAnsweringDLModel.html | 8 +++---- .../dl/ReadXlmRoBertaForSequenceDLModel.html | 8 +++---- .../dl/ReadXlmRoBertaForTokenDLModel.html | 8 +++---- .../dl/ReadXlnetForSequenceDLModel.html | 8 +++---- .../dl/ReadXlnetForTokenDLModel.html 
| 8 +++---- .../ReadablePretrainedAlbertForQAModel.html | 8 +++---- ...dablePretrainedAlbertForSequenceModel.html | 8 +++---- ...ReadablePretrainedAlbertForTokenModel.html | 8 +++---- .../dl/ReadablePretrainedBertForQAModel.html | 8 +++---- ...eadablePretrainedBertForSequenceModel.html | 8 +++---- .../ReadablePretrainedBertForTokenModel.html | 8 +++---- ...eadablePretrainedBertForZeroShotModel.html | 8 +++---- ...ReadablePretrainedCamemBertForQAModel.html | 8 +++---- ...lePretrainedCamemBertForSequenceModel.html | 8 +++---- ...dablePretrainedCamemBertForTokenModel.html | 8 +++---- .../dl/ReadablePretrainedClassifierDL.html | 8 +++---- .../ReadablePretrainedDeBertaForQAModel.html | 8 +++---- ...ablePretrainedDeBertaForSequenceModel.html | 8 +++---- ...eadablePretrainedDeBertaForTokenModel.html | 8 +++---- ...eadablePretrainedDistilBertForQAModel.html | 8 +++---- ...ePretrainedDistilBertForSequenceModel.html | 8 +++---- ...ablePretrainedDistilBertForTokenModel.html | 8 +++---- ...ePretrainedDistilBertForZeroShotModel.html | 8 +++---- ...eadablePretrainedLongformerForQAModel.html | 8 +++---- ...ePretrainedLongformerForSequenceModel.html | 8 +++---- ...ablePretrainedLongformerForTokenModel.html | 8 +++---- .../ReadablePretrainedMultiClassifierDL.html | 8 +++---- .../ReadablePretrainedRoBertaForQAModel.html | 8 +++---- ...ablePretrainedRoBertaForSequenceModel.html | 8 +++---- ...eadablePretrainedRoBertaForTokenModel.html | 8 +++---- ...ablePretrainedRoBertaForZeroShotModel.html | 8 +++---- .../dl/ReadablePretrainedSentimentDL.html | 8 +++---- .../dl/ReadablePretrainedTapasForQAModel.html | 8 +++---- ...eadablePretrainedXlmRoBertaForQAModel.html | 8 +++---- ...ePretrainedXlmRoBertaForSequenceModel.html | 8 +++---- ...ablePretrainedXlmRoBertaForTokenModel.html | 8 +++---- ...adablePretrainedXlnetForSequenceModel.html | 8 +++---- .../ReadablePretrainedXlnetForTokenModel.html | 8 +++---- .../dl/RoBertaForQuestionAnswering$.html | 8 +++---- 
.../dl/RoBertaForQuestionAnswering.html | 8 +++---- .../dl/RoBertaForSequenceClassification$.html | 8 +++---- .../dl/RoBertaForSequenceClassification.html | 8 +++---- .../dl/RoBertaForTokenClassification$.html | 8 +++---- .../dl/RoBertaForTokenClassification.html | 8 +++---- .../dl/RoBertaForZeroShotClassification$.html | 8 +++---- .../dl/RoBertaForZeroShotClassification.html | 8 +++---- .../classifier/dl/SentimentApproach$.html | 8 +++---- .../classifier/dl/SentimentDLApproach.html | 8 +++---- .../classifier/dl/SentimentDLModel$.html | 8 +++---- .../classifier/dl/SentimentDLModel.html | 8 +++---- .../dl/TapasForQuestionAnswering$.html | 8 +++---- .../dl/TapasForQuestionAnswering.html | 8 +++---- .../dl/XlmRoBertaForQuestionAnswering$.html | 8 +++---- .../dl/XlmRoBertaForQuestionAnswering.html | 8 +++---- .../XlmRoBertaForSequenceClassification$.html | 8 +++---- .../XlmRoBertaForSequenceClassification.html | 8 +++---- .../dl/XlmRoBertaForTokenClassification$.html | 8 +++---- .../dl/XlmRoBertaForTokenClassification.html | 8 +++---- .../dl/XlnetForSequenceClassification$.html | 8 +++---- .../dl/XlnetForSequenceClassification.html | 8 +++---- .../dl/XlnetForTokenClassification$.html | 8 +++---- .../dl/XlnetForTokenClassification.html | 8 +++---- .../nlp/annotators/classifier/dl/index.html | 8 +++---- .../nlp/annotators/classifier/index.html | 8 +++---- .../nlp/annotators/common/Annotated$.html | 8 +++---- .../nlp/annotators/common/Annotated.html | 8 +++---- .../nlp/annotators/common/ChunkSplit$.html | 8 +++---- .../nlp/annotators/common/ConllSentence.html | 8 +++---- .../DatasetHelpers$$DataFrameHelper.html | 8 +++---- .../annotators/common/DatasetHelpers$.html | 8 +++---- .../annotators/common/DependencyParsed$.html | 8 +++---- .../common/DependencyParsedSentence.html | 8 +++---- .../common/EmbeddingsWithSentence$.html | 8 +++---- .../annotators/common/IndexedTaggedWord.html | 8 +++---- .../nlp/annotators/common/IndexedToken.html | 8 +++---- 
.../nlp/annotators/common/InfixToken$.html | 8 +++---- .../nlp/annotators/common/InfixToken.html | 8 +++---- .../LabeledDependency$$DependencyInfo.html | 8 +++---- .../annotators/common/LabeledDependency$.html | 8 +++---- .../nlp/annotators/common/NerTagged$.html | 8 +++---- .../nlp/annotators/common/PosTagged$.html | 8 +++---- .../nlp/annotators/common/PrefixedToken$.html | 8 +++---- .../nlp/annotators/common/PrefixedToken.html | 8 +++---- .../common/PreprocessingParser.html | 8 +++---- .../nlp/annotators/common/Sentence$.html | 8 +++---- .../nlp/annotators/common/Sentence.html | 8 +++---- .../nlp/annotators/common/SentenceSplit$.html | 8 +++---- .../nlp/annotators/common/SuffixedToken$.html | 8 +++---- .../nlp/annotators/common/SuffixedToken.html | 8 +++---- .../nlp/annotators/common/TableData$.html | 8 +++---- .../nlp/annotators/common/TableData.html | 8 +++---- .../nlp/annotators/common/Tagged.html | 8 +++---- .../annotators/common/TaggedSentence$.html | 8 +++---- .../nlp/annotators/common/TaggedSentence.html | 8 +++---- .../nlp/annotators/common/TaggedWord.html | 8 +++---- .../nlp/annotators/common/TokenPiece.html | 8 +++---- .../common/TokenPieceEmbeddings$.html | 8 +++---- .../common/TokenPieceEmbeddings.html | 8 +++---- .../annotators/common/TokenizedSentence.html | 8 +++---- .../common/TokenizedWithSentence$.html | 8 +++---- .../annotators/common/WordWithDependency.html | 8 +++---- .../common/WordpieceEmbeddingsSentence$.html | 8 +++---- .../common/WordpieceEmbeddingsSentence.html | 8 +++---- .../common/WordpieceTokenized$.html | 8 +++---- .../common/WordpieceTokenizedSentence.html | 8 +++---- .../nlp/annotators/common/index.html | 8 +++---- .../ReadSpanBertCorefTensorflowModel.html | 8 +++---- .../ReadablePretrainedSpanBertCorefModel.html | 8 +++---- .../annotators/coref/SpanBertCorefModel$.html | 8 +++---- .../annotators/coref/SpanBertCorefModel.html | 8 +++---- .../nlp/annotators/coref/index.html | 8 +++---- .../cv/ConvNextForImageClassification$.html | 
8 +++---- .../cv/ConvNextForImageClassification.html | 8 +++---- .../cv/ReadConvNextForImageDLModel.html | 8 +++---- .../cv/ReadSwinForImageDLModel.html | 8 +++---- .../annotators/cv/ReadViTForImageDLModel.html | 8 +++---- ...adablePretrainedConvNextForImageModel.html | 8 +++---- .../ReadablePretrainedSwinForImageModel.html | 8 +++---- .../ReadablePretrainedViTForImageModel.html | 8 +++---- .../cv/SwinForImageClassification$.html | 8 +++---- .../cv/SwinForImageClassification.html | 8 +++---- .../cv/ViTForImageClassification$.html | 8 +++---- .../cv/ViTForImageClassification.html | 8 +++---- .../johnsnowlabs/nlp/annotators/cv/index.html | 8 +++---- .../er/AhoCorasickAutomaton$Node.html | 8 +++---- .../annotators/er/AhoCorasickAutomaton.html | 8 +++---- .../nlp/annotators/er/EntityPattern.html | 8 +++---- .../annotators/er/EntityRulerApproach.html | 8 +++---- .../annotators/er/EntityRulerFeatures.html | 8 +++---- .../nlp/annotators/er/EntityRulerModel$.html | 8 +++---- .../nlp/annotators/er/EntityRulerModel.html | 8 +++---- .../nlp/annotators/er/EntityRulerUtil$.html | 8 +++---- .../annotators/er/FlattenEntityPattern.html | 8 +++---- .../nlp/annotators/er/PatternsReadWriter.html | 8 +++---- .../nlp/annotators/er/PatternsReader.html | 8 +++---- .../er/ReadablePretrainedEntityRuler.html | 8 +++---- .../er/RegexPatternsReadWriter.html | 8 +++---- .../annotators/er/RegexPatternsReader.html | 8 +++---- .../johnsnowlabs/nlp/annotators/er/index.html | 8 +++---- .../johnsnowlabs/nlp/annotators/index.html | 8 +++---- .../nlp/annotators/keyword/index.html | 8 +++---- .../keyword/yake/YakeKeywordExtraction$.html | 8 +++---- .../keyword/yake/YakeKeywordExtraction.html | 8 +++---- .../annotators/keyword/yake/YakeParams.html | 8 +++---- .../nlp/annotators/keyword/yake/index.html | 8 +++---- .../annotators/keyword/yake/util/Token.html | 8 +++---- .../keyword/yake/util/Utilities$.html | 8 +++---- .../annotators/keyword/yake/util/index.html | 8 +++---- 
.../annotators/ld/dl/LanguageDetectorDL$.html | 8 +++---- .../annotators/ld/dl/LanguageDetectorDL.html | 8 +++---- ...ReadLanguageDetectorDLTensorflowModel.html | 8 +++---- ...ablePretrainedLanguageDetectorDLModel.html | 8 +++---- .../nlp/annotators/ld/dl/index.html | 8 +++---- .../johnsnowlabs/nlp/annotators/ld/index.html | 8 +++---- .../nlp/annotators/ner/ModelMetrics$.html | 8 +++---- .../nlp/annotators/ner/NamedEntity.html | 8 +++---- .../nlp/annotators/ner/NerApproach.html | 8 +++---- .../nlp/annotators/ner/NerConverter$.html | 8 +++---- .../nlp/annotators/ner/NerConverter.html | 8 +++---- .../nlp/annotators/ner/NerOverwriter$.html | 8 +++---- .../nlp/annotators/ner/NerOverwriter.html | 8 +++---- .../nlp/annotators/ner/NerTagsEncoding$.html | 8 +++---- .../nlp/annotators/ner/Verbose$.html | 8 +++---- .../ner/crf/DictionaryFeatures$.html | 8 +++---- .../ner/crf/DictionaryFeatures.html | 8 +++---- .../ner/crf/FeatureGenerator$TokenType$.html | 8 +++---- .../annotators/ner/crf/FeatureGenerator.html | 8 +++---- .../annotators/ner/crf/NerCrfApproach$.html | 8 +++---- .../annotators/ner/crf/NerCrfApproach.html | 8 +++---- .../nlp/annotators/ner/crf/NerCrfModel$.html | 8 +++---- .../nlp/annotators/ner/crf/NerCrfModel.html | 8 +++---- .../ner/crf/ReadablePretrainedNerCrf.html | 8 +++---- .../nlp/annotators/ner/crf/index.html | 8 +++---- .../nlp/annotators/ner/dl/LoadsContrib$.html | 8 +++---- .../nlp/annotators/ner/dl/NerDLApproach$.html | 8 +++---- .../nlp/annotators/ner/dl/NerDLApproach.html | 8 +++---- .../nlp/annotators/ner/dl/NerDLModel$.html | 8 +++---- .../nlp/annotators/ner/dl/NerDLModel.html | 8 +++---- .../ner/dl/NerDLModelPythonReader$.html | 8 +++---- .../ner/dl/ReadZeroShotNerDLModel.html | 8 +++---- .../ner/dl/ReadablePretrainedNerDL.html | 8 +++---- .../ner/dl/ReadablePretrainedZeroShotNer.html | 8 +++---- .../nlp/annotators/ner/dl/ReadsNERGraph.html | 8 +++---- .../annotators/ner/dl/WithGraphResolver.html | 8 +++---- 
.../annotators/ner/dl/ZeroShotNerModel$.html | 8 +++---- .../annotators/ner/dl/ZeroShotNerModel.html | 8 +++---- .../nlp/annotators/ner/dl/index.html | 8 +++---- .../nlp/annotators/ner/index.html | 8 +++---- ...lizableFormat$$SerializableDateFormat.html | 8 +++---- .../AnnotatorParam$SerializableFormat$.html | 8 +++---- .../nlp/annotators/param/AnnotatorParam.html | 8 +++---- .../annotators/param/EvaluationDLParams.html | 8 +++---- .../param/ExternalResourceParam.html | 8 +++---- .../param/SerializedAnnotatorComponent.html | 8 +++---- .../param/WritableAnnotatorComponent.html | 8 +++---- .../nlp/annotators/param/index.html | 8 +++---- .../parser/dep/DependencyParserApproach$.html | 8 +++---- .../parser/dep/DependencyParserApproach.html | 8 +++---- .../parser/dep/DependencyParserModel$.html | 8 +++---- .../parser/dep/DependencyParserModel.html | 8 +++---- .../GreedyTransition/DependencyMaker$.html | 8 +++---- .../DependencyMaker$CurrentState.html | 8 +++---- .../DependencyMaker$ParseState.html | 8 +++---- .../dep/GreedyTransition/DependencyMaker.html | 8 +++---- .../GreedyTransitionApproach$.html | 8 +++---- .../parser/dep/GreedyTransition/index.html | 8 +++---- .../GreedyTransition/package$$Feature.html | 8 +++---- .../GreedyTransition/package$$WordData.html | 8 +++---- .../parser/dep/Perceptron$WeightLearner.html | 8 +++---- .../nlp/annotators/parser/dep/Perceptron.html | 8 +++---- .../dep/ReadablePretrainedDependency.html | 8 +++---- .../annotators/parser/dep/TagDictionary$.html | 8 +++---- .../nlp/annotators/parser/dep/Tagger$.html | 8 +++---- .../nlp/annotators/parser/dep/Tagger.html | 8 +++---- .../nlp/annotators/parser/dep/index.html | 8 +++---- .../nlp/annotators/parser/index.html | 8 +++---- .../annotators/parser/typdep/ConllData.html | 8 +++---- .../parser/typdep/DependencyArcList.html | 8 +++---- .../parser/typdep/DependencyInstance.html | 8 +++---- .../parser/typdep/DependencyPipe.html | 8 +++---- .../parser/typdep/LocalFeatureData.html | 8 +++---- 
.../parser/typdep/LowRankTensor.html | 8 +++---- .../nlp/annotators/parser/typdep/Options.html | 8 +++---- .../annotators/parser/typdep/Parameters.html | 8 +++---- .../parser/typdep/PredictionParameters.html | 8 +++---- .../ReadablePretrainedTypedDependency.html | 8 +++---- .../parser/typdep/TrainDependencies.html | 8 +++---- .../annotators/parser/typdep/TrainFile.html | 8 +++---- .../parser/typdep/TypedDependencyParser.html | 8 +++---- .../TypedDependencyParserApproach$.html | 8 +++---- .../typdep/TypedDependencyParserApproach.html | 8 +++---- .../typdep/TypedDependencyParserModel$.html | 8 +++---- .../typdep/TypedDependencyParserModel.html | 8 +++---- .../typdep/feature/FeatureTemplate.html | 8 +++---- .../feature/SyntacticFeatureFactory.html | 8 +++---- .../parser/typdep/feature/index.html | 8 +++---- .../nlp/annotators/parser/typdep/index.html | 8 +++---- .../parser/typdep/io/Conll09Reader.html | 8 +++---- .../parser/typdep/io/ConllUReader.html | 8 +++---- .../parser/typdep/io/ConllWriter.html | 8 +++---- .../parser/typdep/io/DependencyReader.html | 8 +++---- .../annotators/parser/typdep/io/index.html | 8 +++---- .../parser/typdep/util/Alphabet.html | 8 +++---- .../parser/typdep/util/Collector.html | 8 +++---- .../parser/typdep/util/DependencyLabel.html | 8 +++---- .../parser/typdep/util/Dictionary.html | 8 +++---- .../parser/typdep/util/DictionarySet.html | 8 +++---- .../parser/typdep/util/FeatureVector.html | 8 +++---- .../parser/typdep/util/ScoreCollector.html | 8 +++---- .../annotators/parser/typdep/util/Utils.html | 8 +++---- .../annotators/parser/typdep/util/index.html | 8 +++---- .../nlp/annotators/pos/index.html | 8 +++---- .../pos/perceptron/AveragedPerceptron.html | 8 +++---- .../pos/perceptron/PerceptronApproach$.html | 8 +++---- .../pos/perceptron/PerceptronApproach.html | 8 +++---- .../PerceptronApproachDistributed$.html | 8 +++---- .../PerceptronApproachDistributed.html | 8 +++---- .../pos/perceptron/PerceptronModel$.html | 8 +++---- 
.../pos/perceptron/PerceptronModel.html | 8 +++---- .../perceptron/PerceptronPredictionUtils.html | 8 +++---- .../perceptron/PerceptronTrainingUtils.html | 8 +++---- .../pos/perceptron/PerceptronUtils.html | 8 +++---- .../ReadablePretrainedPerceptron.html | 8 +++---- .../StringMapStringDoubleAccumulator.html | 8 +++---- .../perceptron/TrainingPerceptronLegacy.html | 8 +++---- .../TupleKeyLongDoubleMapAccumulator.html | 8 +++---- .../nlp/annotators/pos/perceptron/index.html | 8 +++---- .../sbd/SentenceDetectorParams.html | 8 +++---- .../nlp/annotators/sbd/index.html | 8 +++---- .../sbd/pragmatic/CustomPragmaticMethod.html | 8 +++---- .../sbd/pragmatic/DefaultPragmaticMethod.html | 8 +++---- .../sbd/pragmatic/MixedPragmaticMethod.html | 8 +++---- .../pragmatic/PragmaticContentFormatter$.html | 8 +++---- .../pragmatic/PragmaticContentFormatter.html | 8 +++---- .../sbd/pragmatic/PragmaticDictionaries$.html | 8 +++---- .../sbd/pragmatic/PragmaticMethod.html | 8 +++---- .../pragmatic/PragmaticSentenceExtractor.html | 8 +++---- .../sbd/pragmatic/PragmaticSymbols$.html | 8 +++---- .../annotators/sbd/pragmatic/RuleSymbols.html | 8 +++---- .../sbd/pragmatic/SentenceDetector$.html | 8 +++---- .../sbd/pragmatic/SentenceDetector.html | 8 +++---- .../nlp/annotators/sbd/pragmatic/index.html | 8 +++---- .../nlp/annotators/sda/index.html | 8 +++---- .../sda/pragmatic/PragmaticScorer.html | 8 +++---- .../sda/pragmatic/SentimentDetector$.html | 8 +++---- .../sda/pragmatic/SentimentDetector.html | 8 +++---- .../pragmatic/SentimentDetectorModel$.html | 8 +++---- .../sda/pragmatic/SentimentDetectorModel.html | 8 +++---- .../nlp/annotators/sda/pragmatic/index.html | 8 +++---- .../sda/vivekn/ReadablePretrainedVivekn.html | 8 +++---- .../sda/vivekn/ViveknSentimentApproach.html | 8 +++---- .../sda/vivekn/ViveknSentimentModel$.html | 8 +++---- .../sda/vivekn/ViveknSentimentModel.html | 8 +++---- .../sda/vivekn/ViveknSentimentUtils.html | 8 +++---- .../nlp/annotators/sda/vivekn/index.html | 8 
+++---- .../sentence_detector_dl/Metrics.html | 8 +++---- .../ReadablePretrainedSentenceDetectorDL.html | 8 +++---- .../ReadsSentenceDetectorDLGraph.html | 8 +++---- .../SentenceDetectorDLApproach.html | 8 +++---- .../SentenceDetectorDLEncoder$.html | 8 +++---- .../SentenceDetectorDLEncoder.html | 8 +++---- .../SentenceDetectorDLEncoderParam.html | 8 +++---- .../SentenceDetectorDLModel$.html | 8 +++---- .../SentenceDetectorDLModel.html | 8 +++---- .../sentence_detector_dl/index.html | 8 +++---- .../annotators/seq2seq/BartTransformer$.html | 8 +++---- .../annotators/seq2seq/BartTransformer.html | 8 +++---- .../annotators/seq2seq/GPT2Transformer$.html | 8 +++---- .../annotators/seq2seq/GPT2Transformer.html | 8 +++---- .../seq2seq/MarianTransformer$.html | 8 +++---- .../annotators/seq2seq/MarianTransformer.html | 8 +++---- .../seq2seq/ReadBartTransformerDLModel.html | 8 +++---- .../seq2seq/ReadGPT2TransformerDLModel.html | 8 +++---- .../seq2seq/ReadMarianMTDLModel.html | 8 +++---- .../seq2seq/ReadT5TransformerDLModel.html | 8 +++---- ...eadablePretrainedBartTransformerModel.html | 8 +++---- ...eadablePretrainedGPT2TransformerModel.html | 8 +++---- .../ReadablePretrainedMarianMTModel.html | 8 +++---- .../ReadablePretrainedT5TransformerModel.html | 8 +++---- .../annotators/seq2seq/T5Transformer$.html | 8 +++---- .../nlp/annotators/seq2seq/T5Transformer.html | 8 +++---- .../nlp/annotators/seq2seq/index.html | 8 +++---- .../DocumentSimilarityRankerApproach$.html | 8 +++---- .../DocumentSimilarityRankerApproach.html | 8 +++---- .../DocumentSimilarityRankerModel$.html | 8 +++---- .../DocumentSimilarityRankerModel.html | 8 +++---- .../similarity/IndexedNeighbors.html | 8 +++---- .../IndexedNeighborsWithDistance.html | 8 +++---- .../similarity/NeighborAnnotation.html | 8 +++---- .../similarity/NeighborsResultSet.html | 8 +++---- .../ReadableDocumentSimilarityRanker.html | 8 +++---- .../nlp/annotators/similarity/index.html | 8 +++---- .../spell/context/CandidateStrategy$.html 
| 8 +++---- ...ntextSpellCheckerApproach$ArrayHelper.html | 8 +++---- .../context/ContextSpellCheckerApproach.html | 8 +++---- .../context/ContextSpellCheckerModel$.html | 8 +++---- .../ContextSpellCheckerModel$StringTools.html | 8 +++---- .../context/ContextSpellCheckerModel.html | 8 +++---- .../spell/context/HasTransducerFeatures.html | 8 +++---- .../spell/context/LangModelSentence.html | 8 +++---- .../ReadablePretrainedContextSpell.html | 8 +++---- .../context/ReadsLanguageModelGraph.html | 8 +++---- .../spell/context/WeightedLevenshtein.html | 8 +++---- .../nlp/annotators/spell/context/index.html | 8 +++---- .../spell/context/parser/AgeToken.html | 8 +++---- .../spell/context/parser/DateToken.html | 8 +++---- .../context/parser/GenericRegexParser.html | 8 +++---- .../context/parser/GenericVocabParser.html | 8 +++---- .../spell/context/parser/LocationClass.html | 8 +++---- .../spell/context/parser/MainVocab.html | 8 +++---- .../spell/context/parser/MedicationClass.html | 8 +++---- .../spell/context/parser/NamesClass.html | 8 +++---- .../spell/context/parser/NumberToken.html | 8 +++---- .../spell/context/parser/RegexParser.html | 8 +++---- .../context/parser/SerializableClass.html | 8 +++---- .../context/parser/SpecialClassParser.html | 8 +++---- .../context/parser/TransducerSeqFeature.html | 8 +++---- .../spell/context/parser/UnitToken.html | 8 +++---- .../spell/context/parser/VocabParser.html | 8 +++---- .../spell/context/parser/index.html | 8 +++---- .../nlp/annotators/spell/index.html | 8 +++---- .../spell/norvig/NorvigSweetingApproach$.html | 8 +++---- .../spell/norvig/NorvigSweetingApproach.html | 8 +++---- .../spell/norvig/NorvigSweetingModel$.html | 8 +++---- .../spell/norvig/NorvigSweetingModel.html | 8 +++---- .../spell/norvig/NorvigSweetingParams.html | 8 +++---- .../norvig/ReadablePretrainedNorvig.html | 8 +++---- .../nlp/annotators/spell/norvig/index.html | 8 +++---- .../ReadablePretrainedSymmetric.html | 8 +++---- 
.../symmetric/SymmetricDeleteApproach$.html | 8 +++---- .../symmetric/SymmetricDeleteApproach.html | 8 +++---- .../symmetric/SymmetricDeleteModel$.html | 8 +++---- .../SymmetricDeleteModel$SuggestedWord.html | 8 +++---- .../spell/symmetric/SymmetricDeleteModel.html | 8 +++---- .../symmetric/SymmetricDeleteParams.html | 8 +++---- .../nlp/annotators/spell/symmetric/index.html | 8 +++---- .../nlp/annotators/spell/util/Utilities$.html | 8 +++---- .../nlp/annotators/spell/util/index.html | 8 +++---- .../nlp/annotators/tapas/TapasCellDate$.html | 8 +++---- .../nlp/annotators/tapas/TapasCellDate.html | 8 +++---- .../nlp/annotators/tapas/TapasCellValue$.html | 8 +++---- .../nlp/annotators/tapas/TapasCellValue.html | 8 +++---- .../nlp/annotators/tapas/TapasEncoder.html | 8 +++---- .../nlp/annotators/tapas/TapasInputData.html | 8 +++---- .../tapas/TapasNumericRelation$.html | 8 +++---- .../tapas/TapasNumericValueSpan$.html | 8 +++---- .../tapas/TapasNumericValueSpan.html | 8 +++---- .../nlp/annotators/tapas/index.html | 8 +++---- .../tokenizer/bpe/BartTokenizer.html | 8 +++---- .../tokenizer/bpe/BpeTokenizer$.html | 8 +++---- .../tokenizer/bpe/Gpt2Tokenizer.html | 8 +++---- .../tokenizer/bpe/RobertaTokenizer.html | 8 +++---- .../tokenizer/bpe/SpecialToken.html | 8 +++---- .../nlp/annotators/tokenizer/bpe/index.html | 8 +++---- .../nlp/annotators/tokenizer/index.html | 8 +++---- .../ws/ReadablePretrainedWordSegmenter.html | 8 +++---- .../nlp/annotators/ws/TagsType$.html | 8 +++---- .../annotators/ws/WordSegmenterApproach$.html | 8 +++---- .../annotators/ws/WordSegmenterApproach.html | 8 +++---- .../annotators/ws/WordSegmenterModel$.html | 8 +++---- .../nlp/annotators/ws/WordSegmenterModel.html | 8 +++---- .../johnsnowlabs/nlp/annotators/ws/index.html | 8 +++---- .../nlp/embeddings/AlbertEmbeddings$.html | 8 +++---- .../nlp/embeddings/AlbertEmbeddings.html | 8 +++---- .../nlp/embeddings/BertEmbeddings$.html | 8 +++---- .../nlp/embeddings/BertEmbeddings.html | 8 +++---- 
.../embeddings/BertSentenceEmbeddings$.html | 8 +++---- .../embeddings/BertSentenceEmbeddings.html | 8 +++---- .../nlp/embeddings/CamemBertEmbeddings$.html | 8 +++---- .../nlp/embeddings/CamemBertEmbeddings.html | 8 +++---- .../nlp/embeddings/ChunkEmbeddings$.html | 8 +++---- .../nlp/embeddings/ChunkEmbeddings.html | 8 +++---- .../nlp/embeddings/DeBertaEmbeddings$.html | 8 +++---- .../nlp/embeddings/DeBertaEmbeddings.html | 8 +++---- .../nlp/embeddings/DistilBertEmbeddings$.html | 8 +++---- .../nlp/embeddings/DistilBertEmbeddings.html | 8 +++---- .../nlp/embeddings/Doc2VecApproach$.html | 8 +++---- .../nlp/embeddings/Doc2VecApproach.html | 8 +++---- .../nlp/embeddings/Doc2VecModel$.html | 8 +++---- .../nlp/embeddings/Doc2VecModel.html | 8 +++---- .../nlp/embeddings/E5Embeddings$.html | 8 +++---- .../nlp/embeddings/E5Embeddings.html | 8 +++---- .../nlp/embeddings/ElmoEmbeddings$.html | 8 +++---- .../nlp/embeddings/ElmoEmbeddings.html | 8 +++---- .../EmbeddingsCoverage$CoverageResult.html | 8 +++---- .../nlp/embeddings/EmbeddingsCoverage.html | 8 +++---- .../embeddings/HasEmbeddingsProperties.html | 8 +++---- .../nlp/embeddings/InstructorEmbeddings$.html | 8 +++---- .../nlp/embeddings/InstructorEmbeddings.html | 8 +++---- .../nlp/embeddings/LongformerEmbeddings$.html | 8 +++---- .../nlp/embeddings/LongformerEmbeddings.html | 8 +++---- .../PoolingStrategy$$AnnotatorType$.html | 8 +++---- .../nlp/embeddings/PoolingStrategy$.html | 8 +++---- .../nlp/embeddings/ReadAlbertDLModel.html | 8 +++---- .../nlp/embeddings/ReadBertDLModel.html | 8 +++---- .../embeddings/ReadBertSentenceDLModel.html | 8 +++---- .../nlp/embeddings/ReadCamemBertDLModel.html | 8 +++---- .../nlp/embeddings/ReadDeBertaDLModel.html | 8 +++---- .../nlp/embeddings/ReadDistilBertDLModel.html | 8 +++---- .../nlp/embeddings/ReadE5DLModel.html | 8 +++---- .../nlp/embeddings/ReadElmoDLModel.html | 8 +++---- .../nlp/embeddings/ReadInstructorDLModel.html | 8 +++---- .../nlp/embeddings/ReadLongformerDLModel.html 
| 8 +++---- .../nlp/embeddings/ReadRobertaDLModel.html | 8 +++---- .../ReadRobertaSentenceDLModel.html | 8 +++---- .../nlp/embeddings/ReadUSEDLModel.html | 8 +++---- .../nlp/embeddings/ReadXlmRobertaDLModel.html | 8 +++---- .../ReadXlmRobertaSentenceDLModel.html | 8 +++---- .../nlp/embeddings/ReadXlnetDLModel.html | 8 +++---- .../ReadablePretrainedAlbertModel.html | 8 +++---- .../ReadablePretrainedBertModel.html | 8 +++---- .../ReadablePretrainedBertSentenceModel.html | 8 +++---- .../ReadablePretrainedCamemBertModel.html | 8 +++---- .../ReadablePretrainedDeBertaModel.html | 8 +++---- .../ReadablePretrainedDistilBertModel.html | 8 +++---- .../embeddings/ReadablePretrainedDoc2Vec.html | 8 +++---- .../embeddings/ReadablePretrainedE5Model.html | 8 +++---- .../ReadablePretrainedElmoModel.html | 8 +++---- .../ReadablePretrainedInstructorModel.html | 8 +++---- .../ReadablePretrainedLongformerModel.html | 8 +++---- .../ReadablePretrainedRobertaModel.html | 8 +++---- ...eadablePretrainedRobertaSentenceModel.html | 8 +++---- .../ReadablePretrainedUSEModel.html | 8 +++---- .../ReadablePretrainedWord2Vec.html | 8 +++---- .../ReadablePretrainedWordEmbeddings.html | 8 +++---- .../ReadablePretrainedXlmRobertaModel.html | 8 +++---- ...ablePretrainedXlmRobertaSentenceModel.html | 8 +++---- .../ReadablePretrainedXlnetModel.html | 8 +++---- .../nlp/embeddings/ReadsFromBytes.html | 8 +++---- .../nlp/embeddings/RoBertaEmbeddings$.html | 8 +++---- .../nlp/embeddings/RoBertaEmbeddings.html | 8 +++---- .../RoBertaSentenceEmbeddings$.html | 8 +++---- .../embeddings/RoBertaSentenceEmbeddings.html | 8 +++---- .../nlp/embeddings/SentenceEmbeddings$.html | 8 +++---- .../nlp/embeddings/SentenceEmbeddings.html | 8 +++---- .../embeddings/UniversalSentenceEncoder$.html | 8 +++---- .../embeddings/UniversalSentenceEncoder.html | 8 +++---- .../nlp/embeddings/Word2VecApproach$.html | 8 +++---- .../nlp/embeddings/Word2VecApproach.html | 8 +++---- .../nlp/embeddings/Word2VecModel$.html | 8 +++---- 
.../nlp/embeddings/Word2VecModel.html | 8 +++---- .../nlp/embeddings/WordEmbeddings$.html | 8 +++---- .../nlp/embeddings/WordEmbeddings.html | 8 +++---- .../WordEmbeddingsBinaryIndexer$.html | 8 +++---- .../nlp/embeddings/WordEmbeddingsModel$.html | 8 +++---- .../nlp/embeddings/WordEmbeddingsModel.html | 8 +++---- .../nlp/embeddings/WordEmbeddingsReader.html | 8 +++---- .../WordEmbeddingsTextIndexer$.html | 8 +++---- .../nlp/embeddings/WordEmbeddingsWriter.html | 8 +++---- .../nlp/embeddings/XlmRoBertaEmbeddings$.html | 8 +++---- .../nlp/embeddings/XlmRoBertaEmbeddings.html | 8 +++---- .../XlmRoBertaSentenceEmbeddings$.html | 8 +++---- .../XlmRoBertaSentenceEmbeddings.html | 8 +++---- .../nlp/embeddings/XlnetEmbeddings$.html | 8 +++---- .../nlp/embeddings/XlnetEmbeddings.html | 8 +++---- .../johnsnowlabs/nlp/embeddings/index.html | 8 +++---- .../DocumentSimilarityRankerFinisher$.html | 8 +++---- .../DocumentSimilarityRankerFinisher.html | 8 +++---- .../com/johnsnowlabs/nlp/finisher/index.html | 8 +++---- .../nlp/functions$$EachAnnotations.html | 8 +++---- .../nlp/functions$$ExplodeAnnotations.html | 8 +++---- .../nlp/functions$$FilterAnnotations.html | 8 +++---- .../nlp/functions$$MapAnnotations.html | 8 +++---- docs/api/com/johnsnowlabs/nlp/functions$.html | 8 +++---- docs/api/com/johnsnowlabs/nlp/index.html | 8 +++---- .../nlp/pretrained/PretrainedPipeline$.html | 8 +++---- .../nlp/pretrained/PretrainedPipeline.html | 8 +++---- .../pretrained/PythonResourceDownloader$.html | 8 +++---- .../nlp/pretrained/RepositoryMetadata.html | 8 +++---- .../nlp/pretrained/ResourceDownloader$.html | 8 +++---- .../nlp/pretrained/ResourceDownloader.html | 8 +++---- .../nlp/pretrained/ResourceMetadata$.html | 8 +++---- .../nlp/pretrained/ResourceMetadata.html | 8 +++---- .../nlp/pretrained/ResourceRequest.html | 8 +++---- .../nlp/pretrained/ResourceType$.html | 8 +++---- .../nlp/pretrained/S3ResourceDownloader.html | 8 +++---- .../johnsnowlabs/nlp/pretrained/index.html | 8 +++---- 
.../com/johnsnowlabs/nlp/recursive/index.html | 8 +++---- .../nlp/recursive/package$$Recursive.html | 8 +++---- .../recursive/package$$RecursiveModel.html | 8 +++---- .../nlp/serialization/ArrayFeature.html | 8 +++---- .../nlp/serialization/Feature.html | 8 +++---- .../nlp/serialization/MapFeature.html | 8 +++---- .../SerializedExternalResource.html | 8 +++---- .../nlp/serialization/SetFeature.html | 8 +++---- .../nlp/serialization/StructFeature.html | 8 +++---- .../nlp/serialization/TransducerFeature.html | 8 +++---- .../johnsnowlabs/nlp/serialization/index.html | 8 +++---- .../com/johnsnowlabs/nlp/training/CoNLL.html | 8 +++---- .../nlp/training/CoNLL2003NerReader.html | 8 +++---- .../nlp/training/CoNLLDocument.html | 8 +++---- .../CoNLLHelper$$CoNLLSentenceCols.html | 8 +++---- .../training/CoNLLHelper$$CoNLLTokenCols.html | 8 +++---- .../nlp/training/CoNLLHelper$.html | 8 +++---- .../com/johnsnowlabs/nlp/training/CoNLLU.html | 8 +++---- .../nlp/training/CoNLLUCols$.html | 8 +++---- .../nlp/training/CoNLLUDocument.html | 8 +++---- .../com/johnsnowlabs/nlp/training/POS.html | 8 +++---- .../johnsnowlabs/nlp/training/PubTator.html | 8 +++---- .../nlp/training/SpacyToAnnotation.html | 8 +++---- .../com/johnsnowlabs/nlp/training/index.html | 8 +++---- .../johnsnowlabs/nlp/util/FinisherUtil$.html | 8 +++---- .../johnsnowlabs/nlp/util/GraphBuilder.html | 8 +++---- .../nlp/util/LfuCache$CachedItem.html | 8 +++---- .../nlp/util/LfuCache$DoubleLinked.html | 8 +++---- .../nlp/util/LfuCache$FrequencyList.html | 8 +++---- .../com/johnsnowlabs/nlp/util/LfuCache.html | 8 +++---- .../nlp/util/LruMap$KeyPriority.html | 8 +++---- .../nlp/util/LruMap$KeyPriorityOrdering$.html | 8 +++---- .../api/com/johnsnowlabs/nlp/util/LruMap.html | 8 +++---- .../nlp/util/SparkNlpConfigKeys$.html | 8 +++---- docs/api/com/johnsnowlabs/nlp/util/index.html | 8 +++---- .../nlp/util/io/ExternalResource$.html | 8 +++---- .../nlp/util/io/ExternalResource.html | 8 +++---- 
.../nlp/util/io/MatchStrategy$.html | 8 +++---- .../nlp/util/io/OutputHelper$.html | 8 +++---- .../com/johnsnowlabs/nlp/util/io/ReadAs$.html | 8 +++---- .../util/io/ResourceHelper$$SourceStream.html | 8 +++---- .../nlp/util/io/ResourceHelper$.html | 8 +++---- .../com/johnsnowlabs/nlp/util/io/index.html | 8 +++---- .../nlp/util/regex/RegexRule.html | 8 +++---- .../util/regex/RuleFactory$$RuleMatch.html | 8 +++---- .../nlp/util/regex/RuleFactory$.html | 8 +++---- .../nlp/util/regex/RuleFactory.html | 8 +++---- .../nlp/util/regex/TransformStrategy$.html | 8 +++---- .../johnsnowlabs/nlp/util/regex/index.html | 8 +++---- .../com/johnsnowlabs/storage/BytesKey.html | 8 +++---- .../com/johnsnowlabs/storage/Database$.html | 8 +++---- .../com/johnsnowlabs/storage/Database.html | 8 +++---- .../johnsnowlabs/storage/HasConnection.html | 8 +++---- .../com/johnsnowlabs/storage/HasStorage.html | 8 +++---- .../johnsnowlabs/storage/HasStorageModel.html | 8 +++---- .../storage/HasStorageOptions.html | 8 +++---- .../storage/HasStorageReader.html | 8 +++---- .../johnsnowlabs/storage/HasStorageRef$.html | 8 +++---- .../johnsnowlabs/storage/HasStorageRef.html | 8 +++---- .../storage/RocksDBConnection$.html | 8 +++---- .../storage/RocksDBConnection.html | 8 +++---- .../storage/StorageBatchWriter.html | 8 +++---- .../johnsnowlabs/storage/StorageFormat.html | 8 +++---- .../johnsnowlabs/storage/StorageHelper$.html | 8 +++---- .../johnsnowlabs/storage/StorageLocator$.html | 8 +++---- .../johnsnowlabs/storage/StorageLocator.html | 8 +++---- .../storage/StorageReadWriter.html | 8 +++---- .../johnsnowlabs/storage/StorageReadable.html | 8 +++---- .../johnsnowlabs/storage/StorageReader.html | 8 +++---- .../johnsnowlabs/storage/StorageWriter.html | 8 +++---- docs/api/com/johnsnowlabs/storage/index.html | 8 +++---- .../api/com/johnsnowlabs/util/Benchmark$.html | 8 +++---- docs/api/com/johnsnowlabs/util/Build$.html | 8 +++---- .../johnsnowlabs/util/CoNLLGenerator$.html | 8 +++---- 
.../com/johnsnowlabs/util/ConfigHelper$.html | 8 +++---- .../com/johnsnowlabs/util/ConfigLoader$.html | 8 +++---- .../com/johnsnowlabs/util/FileHelper$.html | 8 +++---- .../com/johnsnowlabs/util/JsonParser$.html | 8 +++---- .../johnsnowlabs/util/PipelineModels$.html | 8 +++---- .../johnsnowlabs/util/TrainingHelper$.html | 8 +++---- docs/api/com/johnsnowlabs/util/Version$.html | 8 +++---- docs/api/com/johnsnowlabs/util/Version.html | 8 +++---- .../johnsnowlabs/util/ZipArchiveUtil$.html | 8 +++---- docs/api/com/johnsnowlabs/util/index.html | 8 +++---- .../util/spark/LongMapAccumulator.html | 8 +++---- .../util/spark/MapAccumulator.html | 8 +++---- .../johnsnowlabs/util/spark/SparkUtil$.html | 8 +++---- .../com/johnsnowlabs/util/spark/index.html | 8 +++---- docs/api/index.html | 8 +++---- docs/api/python/.buildinfo | 2 +- docs/api/python/genindex.html | 2 +- docs/api/python/getting_started/index.html | 20 ++++++++-------- docs/api/python/index.html | 2 +- docs/api/python/modules/index.html | 2 +- docs/api/python/modules/sparknlp.html | 19 +++++++++++---- .../python/modules/sparknlp/annotation.html | 2 +- .../modules/sparknlp/annotation_audio.html | 2 +- .../modules/sparknlp/annotation_image.html | 2 +- .../annotator/audio/hubert_for_ctc.html | 2 +- .../annotator/audio/wav2vec2_for_ctc.html | 2 +- .../sparknlp/annotator/chunk2_doc.html | 2 +- .../modules/sparknlp/annotator/chunker.html | 2 +- .../albert_for_question_answering.html | 2 +- .../albert_for_sequence_classification.html | 2 +- .../albert_for_token_classification.html | 2 +- .../bert_for_question_answering.html | 2 +- .../bert_for_sequence_classification.html | 2 +- .../bert_for_token_classification.html | 2 +- .../bert_for_zero_shot_classification.html | 5 ++-- .../camembert_for_question_answering.html | 2 +- ...camembert_for_sequence_classification.html | 2 +- .../camembert_for_token_classification.html | 2 +- .../classifier_dl/classifier_dl.html | 2 +- .../deberta_for_question_answering.html | 2 +- 
.../deberta_for_sequence_classification.html | 2 +- .../deberta_for_token_classification.html | 2 +- .../distil_bert_for_question_answering.html | 2 +- ...stil_bert_for_sequence_classification.html | 2 +- .../distil_bert_for_token_classification.html | 2 +- ...til_bert_for_zero_shot_classification.html | 2 +- .../longformer_for_question_answering.html | 2 +- ...ongformer_for_sequence_classification.html | 2 +- .../longformer_for_token_classification.html | 2 +- .../classifier_dl/multi_classifier_dl.html | 2 +- ...rta_bert_for_zero_shot_classification.html | 2 +- .../roberta_for_question_answering.html | 2 +- .../roberta_for_sequence_classification.html | 2 +- .../roberta_for_token_classification.html | 2 +- .../annotator/classifier_dl/sentiment_dl.html | 2 +- .../tapas_for_question_answering.html | 2 +- .../xlm_roberta_for_question_answering.html | 2 +- ...m_roberta_for_sequence_classification.html | 2 +- .../xlm_roberta_for_token_classification.html | 2 +- .../xlnet_for_sequence_classification.html | 2 +- .../xlnet_for_token_classification.html | 2 +- .../annotator/coref/spanbert_coref.html | 2 +- .../cv/convnext_for_image_classification.html | 2 +- .../cv/swin_for_image_classification.html | 2 +- .../cv/vit_for_image_classification.html | 2 +- .../sparknlp/annotator/date2_chunk.html | 2 +- .../dependency/dependency_parser.html | 2 +- .../dependency/typed_dependency_parser.html | 2 +- .../annotator/document_normalizer.html | 2 +- .../embeddings/albert_embeddings.html | 2 +- .../annotator/embeddings/bert_embeddings.html | 2 +- .../embeddings/bert_sentence_embeddings.html | 2 +- .../embeddings/camembert_embeddings.html | 2 +- .../embeddings/chunk_embeddings.html | 2 +- .../embeddings/deberta_embeddings.html | 2 +- .../embeddings/distil_bert_embeddings.html | 2 +- .../annotator/embeddings/doc2vec.html | 2 +- .../annotator/embeddings/e5_embeddings.html | 2 +- .../annotator/embeddings/elmo_embeddings.html | 2 +- .../embeddings/instructor_embeddings.html | 2 +- 
.../embeddings/longformer_embeddings.html | 2 +- .../embeddings/roberta_embeddings.html | 2 +- .../roberta_sentence_embeddings.html | 2 +- .../embeddings/sentence_embeddings.html | 2 +- .../universal_sentence_encoder.html | 2 +- .../annotator/embeddings/word2vec.html | 2 +- .../annotator/embeddings/word_embeddings.html | 2 +- .../embeddings/xlm_roberta_embeddings.html | 2 +- .../xlm_roberta_sentence_embeddings.html | 2 +- .../embeddings/xlnet_embeddings.html | 2 +- .../sparknlp/annotator/er/entity_ruler.html | 2 +- .../sparknlp/annotator/graph_extraction.html | 2 +- .../yake_keyword_extraction.html | 2 +- .../annotator/ld_dl/language_detector_dl.html | 2 +- .../sparknlp/annotator/lemmatizer.html | 2 +- .../annotator/matcher/big_text_matcher.html | 2 +- .../annotator/matcher/date_matcher.html | 2 +- .../annotator/matcher/multi_date_matcher.html | 2 +- .../annotator/matcher/regex_matcher.html | 2 +- .../annotator/matcher/text_matcher.html | 2 +- .../sparknlp/annotator/n_gram_generator.html | 2 +- .../sparknlp/annotator/ner/ner_approach.html | 2 +- .../sparknlp/annotator/ner/ner_converter.html | 2 +- .../sparknlp/annotator/ner/ner_crf.html | 2 +- .../sparknlp/annotator/ner/ner_dl.html | 2 +- .../annotator/ner/ner_overwriter.html | 2 +- .../annotator/ner/zero_shot_ner_model.html | 2 +- .../sparknlp/annotator/normalizer.html | 2 +- .../annotator/param/classifier_encoder.html | 2 +- .../annotator/param/evaluation_dl_params.html | 2 +- .../sparknlp/annotator/pos/perceptron.html | 2 +- .../annotator/sentence/sentence_detector.html | 2 +- .../sentence/sentence_detector_dl.html | 2 +- .../sentiment/sentiment_detector.html | 2 +- .../annotator/sentiment/vivekn_sentiment.html | 2 +- .../annotator/seq2seq/bart_transformer.html | 2 +- .../annotator/seq2seq/gpt2_transformer.html | 2 +- .../annotator/seq2seq/marian_transformer.html | 2 +- .../annotator/seq2seq/t5_transformer.html | 2 +- .../document_similarity_ranker.html | 2 +- .../spell_check/context_spell_checker.html | 2 +- 
.../spell_check/norvig_sweeting.html | 2 +- .../spell_check/symmetric_delete.html | 2 +- .../modules/sparknlp/annotator/stemmer.html | 2 +- .../annotator/stop_words_cleaner.html | 2 +- .../annotator/tf_ner_dl_graph_builder.html | 2 +- .../annotator/token/chunk_tokenizer.html | 2 +- .../annotator/token/recursive_tokenizer.html | 2 +- .../annotator/token/regex_tokenizer.html | 2 +- .../sparknlp/annotator/token/tokenizer.html | 2 +- .../sparknlp/annotator/ws/word_segmenter.html | 2 +- .../sparknlp/base/audio_assembler.html | 2 +- .../modules/sparknlp/base/doc2_chunk.html | 2 +- .../sparknlp/base/document_assembler.html | 2 +- .../sparknlp/base/embeddings_finisher.html | 2 +- .../modules/sparknlp/base/finisher.html | 2 +- .../modules/sparknlp/base/graph_finisher.html | 2 +- .../sparknlp/base/has_recursive_fit.html | 2 +- .../base/has_recursive_transform.html | 2 +- .../sparknlp/base/image_assembler.html | 2 +- .../modules/sparknlp/base/light_pipeline.html | 2 +- .../base/multi_document_assembler.html | 2 +- .../sparknlp/base/recursive_pipeline.html | 2 +- .../sparknlp/base/table_assembler.html | 2 +- .../modules/sparknlp/base/token2_chunk.html | 2 +- .../sparknlp/base/token_assembler.html | 2 +- .../sparknlp/common/annotator_approach.html | 2 +- .../sparknlp/common/annotator_model.html | 2 +- .../sparknlp/common/annotator_properties.html | 2 +- .../sparknlp/common/match_strategy.html | 2 +- .../modules/sparknlp/common/properties.html | 23 ++++++++++++++++++- .../modules/sparknlp/common/read_as.html | 2 +- .../common/recursive_annotator_approach.html | 2 +- .../python/modules/sparknlp/common/utils.html | 2 +- .../python/modules/sparknlp/functions.html | 2 +- .../sparknlp/internal/annotator_java_ml.html | 2 +- .../internal/annotator_transformer.html | 2 +- .../internal/extended_java_wrapper.html | 2 +- .../internal/params_getters_setters.html | 2 +- .../modules/sparknlp/internal/recursive.html | 2 +- .../modules/sparknlp/logging/comet.html | 2 +- 
.../pretrained/pretrained_pipeline.html | 2 +- .../pretrained/resource_downloader.html | 2 +- .../modules/sparknlp/training/conll.html | 2 +- .../modules/sparknlp/training/conllu.html | 2 +- .../python/modules/sparknlp/training/pos.html | 2 +- .../modules/sparknlp/training/pub_tator.html | 2 +- .../training/spacy_to_annotation.html | 2 +- docs/api/python/py-modindex.html | 2 +- .../sparknlp/annotation/index.html | 2 +- .../sparknlp/annotation_audio/index.html | 2 +- .../sparknlp/annotation_image/index.html | 2 +- .../annotator/audio/hubert_for_ctc/index.html | 2 +- .../sparknlp/annotator/audio/index.html | 2 +- .../audio/wav2vec2_for_ctc/index.html | 2 +- .../sparknlp/annotator/chunk2_doc/index.html | 2 +- .../sparknlp/annotator/chunker/index.html | 2 +- .../albert_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../bert_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../bert_for_token_classification/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../classifier_dl/classifier_dl/index.html | 2 +- .../deberta_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../annotator/classifier_dl/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../multi_classifier_dl/index.html | 2 +- .../index.html | 2 +- .../roberta_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../classifier_dl/sentiment_dl/index.html | 2 +- .../tapas_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../xlnet_for_token_classification/index.html | 2 +- .../sparknlp/annotator/coref/index.html | 2 +- .../annotator/coref/spanbert_coref/index.html | 2 +- .../index.html | 2 +- .../sparknlp/annotator/cv/index.html | 2 +- 
.../swin_for_image_classification/index.html | 2 +- .../vit_for_image_classification/index.html | 2 +- .../sparknlp/annotator/date2_chunk/index.html | 2 +- .../dependency/dependency_parser/index.html | 2 +- .../sparknlp/annotator/dependency/index.html | 2 +- .../typed_dependency_parser/index.html | 2 +- .../annotator/document_normalizer/index.html | 2 +- .../embeddings/albert_embeddings/index.html | 2 +- .../embeddings/bert_embeddings/index.html | 2 +- .../bert_sentence_embeddings/index.html | 2 +- .../camembert_embeddings/index.html | 2 +- .../embeddings/chunk_embeddings/index.html | 2 +- .../embeddings/deberta_embeddings/index.html | 2 +- .../distil_bert_embeddings/index.html | 2 +- .../annotator/embeddings/doc2vec/index.html | 2 +- .../embeddings/e5_embeddings/index.html | 2 +- .../embeddings/elmo_embeddings/index.html | 2 +- .../sparknlp/annotator/embeddings/index.html | 2 +- .../instructor_embeddings/index.html | 2 +- .../longformer_embeddings/index.html | 2 +- .../embeddings/roberta_embeddings/index.html | 2 +- .../roberta_sentence_embeddings/index.html | 2 +- .../embeddings/sentence_embeddings/index.html | 2 +- .../universal_sentence_encoder/index.html | 2 +- .../annotator/embeddings/word2vec/index.html | 2 +- .../embeddings/word_embeddings/index.html | 2 +- .../xlm_roberta_embeddings/index.html | 2 +- .../index.html | 2 +- .../embeddings/xlnet_embeddings/index.html | 2 +- .../annotator/er/entity_ruler/index.html | 2 +- .../sparknlp/annotator/er/index.html | 2 +- .../annotator/graph_extraction/index.html | 2 +- .../autosummary/sparknlp/annotator/index.html | 2 +- .../annotator/keyword_extraction/index.html | 2 +- .../yake_keyword_extraction/index.html | 2 +- .../sparknlp/annotator/ld_dl/index.html | 2 +- .../ld_dl/language_detector_dl/index.html | 2 +- .../sparknlp/annotator/lemmatizer/index.html | 2 +- .../matcher/big_text_matcher/index.html | 2 +- .../annotator/matcher/date_matcher/index.html | 2 +- .../sparknlp/annotator/matcher/index.html | 2 +- 
.../matcher/multi_date_matcher/index.html | 2 +- .../matcher/regex_matcher/index.html | 2 +- .../annotator/matcher/text_matcher/index.html | 2 +- .../annotator/n_gram_generator/index.html | 2 +- .../sparknlp/annotator/ner/index.html | 2 +- .../annotator/ner/ner_approach/index.html | 2 +- .../annotator/ner/ner_converter/index.html | 2 +- .../sparknlp/annotator/ner/ner_crf/index.html | 2 +- .../sparknlp/annotator/ner/ner_dl/index.html | 2 +- .../annotator/ner/ner_overwriter/index.html | 2 +- .../ner/zero_shot_ner_model/index.html | 2 +- .../sparknlp/annotator/normalizer/index.html | 2 +- .../param/classifier_encoder/index.html | 2 +- .../param/evaluation_dl_params/index.html | 2 +- .../sparknlp/annotator/param/index.html | 2 +- .../sparknlp/annotator/pos/index.html | 2 +- .../annotator/pos/perceptron/index.html | 2 +- .../sparknlp/annotator/sentence/index.html | 2 +- .../sentence/sentence_detector/index.html | 2 +- .../sentence/sentence_detector_dl/index.html | 2 +- .../sparknlp/annotator/sentiment/index.html | 2 +- .../sentiment/sentiment_detector/index.html | 2 +- .../sentiment/vivekn_sentiment/index.html | 2 +- .../seq2seq/bart_transformer/index.html | 2 +- .../seq2seq/gpt2_transformer/index.html | 2 +- .../sparknlp/annotator/seq2seq/index.html | 2 +- .../seq2seq/marian_transformer/index.html | 2 +- .../seq2seq/t5_transformer/index.html | 2 +- .../document_similarity_ranker/index.html | 2 +- .../sparknlp/annotator/similarity/index.html | 2 +- .../context_spell_checker/index.html | 2 +- .../sparknlp/annotator/spell_check/index.html | 2 +- .../spell_check/norvig_sweeting/index.html | 2 +- .../spell_check/symmetric_delete/index.html | 2 +- .../sparknlp/annotator/stemmer/index.html | 2 +- .../annotator/stop_words_cleaner/index.html | 2 +- .../tf_ner_dl_graph_builder/index.html | 2 +- .../token/chunk_tokenizer/index.html | 2 +- .../sparknlp/annotator/token/index.html | 2 +- .../token/recursive_tokenizer/index.html | 2 +- .../token/regex_tokenizer/index.html | 2 +- 
.../annotator/token/tokenizer/index.html | 2 +- .../sparknlp/annotator/ws/index.html | 2 +- .../annotator/ws/word_segmenter/index.html | 2 +- .../sparknlp/base/audio_assembler/index.html | 2 +- .../sparknlp/base/doc2_chunk/index.html | 2 +- .../base/document_assembler/index.html | 2 +- .../base/embeddings_finisher/index.html | 2 +- .../sparknlp/base/finisher/index.html | 2 +- .../sparknlp/base/graph_finisher/index.html | 2 +- .../base/has_recursive_fit/index.html | 2 +- .../base/has_recursive_transform/index.html | 2 +- .../sparknlp/base/image_assembler/index.html | 2 +- .../autosummary/sparknlp/base/index.html | 2 +- .../sparknlp/base/light_pipeline/index.html | 2 +- .../base/multi_document_assembler/index.html | 2 +- .../base/recursive_pipeline/index.html | 2 +- .../sparknlp/base/table_assembler/index.html | 2 +- .../sparknlp/base/token2_chunk/index.html | 2 +- .../sparknlp/base/token_assembler/index.html | 2 +- .../common/annotator_approach/index.html | 2 +- .../common/annotator_model/index.html | 2 +- .../common/annotator_properties/index.html | 2 +- .../sparknlp/common/annotator_type/index.html | 2 +- .../common/coverage_result/index.html | 2 +- .../autosummary/sparknlp/common/index.html | 2 +- .../sparknlp/common/match_strategy/index.html | 2 +- .../sparknlp/common/properties/index.html | 2 +- .../sparknlp/common/read_as/index.html | 2 +- .../recursive_annotator_approach/index.html | 2 +- .../sparknlp/common/storage/index.html | 2 +- .../sparknlp/common/utils/index.html | 2 +- .../autosummary/sparknlp/functions/index.html | 2 +- .../reference/autosummary/sparknlp/index.html | 2 +- .../internal/annotator_java_ml/index.html | 2 +- .../internal/annotator_transformer/index.html | 2 +- .../internal/extended_java_wrapper/index.html | 2 +- .../autosummary/sparknlp/internal/index.html | 2 +- .../params_getters_setters/index.html | 2 +- .../sparknlp/internal/recursive/index.html | 2 +- .../sparknlp/logging/comet/index.html | 2 +- 
.../autosummary/sparknlp/logging/index.html | 2 +- .../sparknlp/pretrained/index.html | 2 +- .../pretrained/pretrained_pipeline/index.html | 2 +- .../pretrained/resource_downloader/index.html | 2 +- .../sparknlp/pretrained/utils/index.html | 2 +- .../sparknlp/training/conll/index.html | 2 +- .../sparknlp/training/conllu/index.html | 2 +- .../autosummary/sparknlp/training/index.html | 2 +- .../sparknlp/training/pos/index.html | 2 +- .../sparknlp/training/pub_tator/index.html | 2 +- .../training/spacy_to_annotation/index.html | 2 +- .../sparknlp/training/tfgraphs/index.html | 2 +- .../sparknlp/upload_to_hub/index.html | 2 +- .../autosummary/sparknlp/util/index.html | 2 +- docs/api/python/reference/index.html | 2 +- docs/api/python/search.html | 2 +- docs/api/python/searchindex.js | 2 +- .../python/static/documentation_options.js | 2 +- docs/api/python/third_party/Comet.html | 2 +- docs/api/python/third_party/MLflow.html | 2 +- docs/api/python/third_party/index.html | 2 +- docs/api/python/user_guide/annotation.html | 2 +- docs/api/python/user_guide/annotators.html | 2 +- .../python/user_guide/custom_pipelines.html | 2 +- docs/api/python/user_guide/helpers.html | 2 +- docs/api/python/user_guide/index.html | 2 +- .../python/user_guide/light_pipelines.html | 2 +- .../user_guide/pretrained_pipelines.html | 2 +- docs/api/python/user_guide/training.html | 2 +- 1345 files changed, 4373 insertions(+), 4340 deletions(-) diff --git a/docs/api/com/index.html b/docs/api/com/index.html index f541dae7aaccae..0e735e306ba9fe 100644 --- a/docs/api/com/index.html +++ b/docs/api/com/index.html @@ -3,9 +3,9 @@ - Spark NLP 5.0.0 ScalaDoc - com - - + Spark NLP 5.0.1 ScalaDoc - com + + @@ -28,7 +28,7 @@