diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst
index ebd4667d544616..f18b66915fc3ce 100644
--- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst
+++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst
@@ -218,6 +218,114 @@ Specify generation_config to use grouped beam search:
cout << pipe.generate("The Sun is yellow because", config);
}
+Efficient Text Generation via Speculative Decoding
+##################################################
+
+Speculative decoding (also known as assisted generation) speeds up token generation
+by running a smaller draft model alongside the main model.
+The draft model predicts the next K tokens one by one in an autoregressive manner,
+while the main model validates these predictions and corrects them if necessary.
+
+Each predicted token is compared, and when the draft and main models disagree,
+the token produced by the main model is kept and the remaining draft predictions
+are discarded. The draft model then continues from this corrected token and
+proposes the next K tokens, repeating the cycle.
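+
+The accept/reject step can be sketched in a few lines of toy Python (illustration
+only, not the GenAI implementation; ``main_next`` and ``draft_next`` stand in for
+the two models, and in the real pipeline the K verifications are performed in a
+single forward pass of the main model):
+
+.. code-block:: python
+
+   def speculative_cycle(main_next, draft_next, tokens, k=5):
+       # 1. The draft model proposes k tokens, one by one, autoregressively.
+       draft = []
+       for _ in range(k):
+           draft.append(draft_next(tokens + draft))
+
+       # 2. The main model checks every proposal. The longest matching prefix is
+       #    accepted; at the first mismatch the main model's own token is kept
+       #    and the remaining draft tokens are discarded.
+       accepted = []
+       for proposal in draft:
+           verified = main_next(tokens + accepted)
+           accepted.append(verified)
+           if verified != proposal:
+               break
+       return tokens + accepted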
+
+This approach reduces the number of infer requests to the main model,
+which increases generation speed. Its use in the pipeline is
+shown in the code samples below:
+
+.. tab-set::
+
+ .. tab-item:: Python
+ :sync: py
+
+ .. code-block:: python
+
+ import openvino_genai
+ import queue
+ import threading
+
+ def streamer(subword):
+ print(subword, end='', flush=True)
+ return False
+
+ def infer(model_dir: str, draft_model_dir: str, prompt: str):
+ main_device = 'CPU' # GPU can be used as well.
+ draft_device = 'CPU'
+
+ scheduler_config = openvino_genai.SchedulerConfig()
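+ # cache_size is the KV cache size, in GB, used by the continuous batching scheduler.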
+ scheduler_config.cache_size = 2
+
+ draft_model = openvino_genai.draft_model(draft_model_dir, draft_device)
+
+ pipe = openvino_genai.LLMPipeline(model_dir, main_device, scheduler_config=scheduler_config, draft_model=draft_model)
+
+ config = openvino_genai.GenerationConfig()
+ config.max_new_tokens = 100
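+ # Number of candidate tokens the draft model generates before each verification step by the main model.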
+ config.num_assistant_tokens = 5
+
+ pipe.generate(prompt, config, streamer)
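+
+ # Example invocation (directory names are placeholders for models exported to OpenVINO IR):
+ # infer("main_model_dir", "draft_model_dir", "The Sun is yellow because")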
+
+
+ For more information, refer to the
+ `Python sample `__.
+
+
+ .. tab-item:: C++
+ :sync: cpp
+
+ .. code-block:: cpp
+
+ #include <iostream>
+
+ #include "openvino/genai/llm_pipeline.hpp"
+
+ int main(int argc, char* argv[]) try {
+ if (4 != argc) {
+ throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> <DRAFT_MODEL_DIR> '<PROMPT>'");
+ }
+
+ ov::genai::GenerationConfig config;
+ config.max_new_tokens = 100;
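+ // Number of candidate tokens the draft model generates before each verification step by the main model.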
+ config.num_assistant_tokens = 5;
+
+ std::string main_model_path = argv[1];
+ std::string draft_model_path = argv[2];
+ std::string prompt = argv[3];
+
+ std::string main_device = "CPU", draft_device = "CPU";
+
+ ov::genai::SchedulerConfig scheduler_config;
+ scheduler_config.cache_size = 5;
+
+ ov::genai::LLMPipeline pipe(
+ main_model_path,
+ main_device,
+ ov::genai::draft_model(draft_model_path, draft_device),
+ ov::genai::scheduler_config(scheduler_config));
+
+ auto streamer = [](std::string subword) {
+ std::cout << subword << std::flush;
+ return false;
+ };
+
+ pipe.generate(prompt, config, streamer);
+ } catch (const std::exception& error) {
+ try {
+ std::cerr << error.what() << '\n';
+ } catch (const std::ios_base::failure&) {}
+ return EXIT_FAILURE;
+ } catch (...) {
+ try {
+ std::cerr << "Non-exception object thrown\n";
+ } catch (const std::ios_base::failure&) {}
+ return EXIT_FAILURE;
+ }
+
+
+ For more information, refer to the
+ `C++ sample `__
Comparing with Hugging Face Results
#######################################
diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst
index 6033bd8ed96106..245a2648aab491 100644
--- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst
+++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst
@@ -118,7 +118,7 @@ sample shows basic usage of the ``Text2ImagePipeline`` pipeline.
image_write("baseline.bmp", image)
For more information, refer to the
- `Python sample `__
+ `Python sample `__
.. tab-item:: C++
:sync: cpp
@@ -218,7 +218,7 @@ sample shows basic usage of the ``Text2ImagePipeline`` pipeline.
For more information, refer to the
- `C++ sample `__
+ `C++ sample `__
@@ -269,7 +269,7 @@ and use audio files in WAV format at a sampling rate of 16 kHz as input.
For more information, refer to the
- `Python sample `__.
+ `Python sample `__.
.. tab-item:: C++
:sync: cpp
@@ -322,7 +322,7 @@ and use audio files in WAV format at a sampling rate of 16 kHz as input.
For more information, refer to the
- `C++ sample `__.
+ `C++ sample `__.
Using GenAI in Chat Scenario
@@ -367,7 +367,7 @@ mark a conversation session, as shown in the samples below:
For more information, refer to the
- `Python sample `__.
+ `Python sample `__.
.. tab-item:: C++
:sync: cpp
@@ -415,7 +415,142 @@ mark a conversation session, as shown in the samples below:
For more information, refer to the
- `C++ sample `__
+ `C++ sample `__
+
+
+Using GenAI with Vision Language Models
+#######################################
+
+OpenVINO GenAI introduces ``openvino_genai.VLMPipeline`` for
+inference of multimodal text-generation Vision Language Models (VLMs).
+With a text prompt and an image as input, VLMPipeline can generate text using
+models such as LLaVA or MiniCPM-V. See the chat scenario presented
+in the samples below:
+
+.. tab-set::
+
+ .. tab-item:: Python
+ :sync: py
+
+ .. code-block:: python
+
+ import numpy as np
+ import openvino_genai
+ from PIL import Image
+ from openvino import Tensor
+ from pathlib import Path
+
+
+ def streamer(subword: str) -> bool:
+ print(subword, end='', flush=True)
+ return False  # Returning False keeps generation running.
+
+
+ def read_image(path: str) -> Tensor:
+ pic = Image.open(path).convert("RGB")
+ image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8)
+ return Tensor(image_data)
+
+
+ def read_images(path: str) -> list[Tensor]:
+ entry = Path(path)
+ if entry.is_dir():
+ return [read_image(str(file)) for file in sorted(entry.iterdir())]
+ return [read_image(path)]
+
+
+ def infer(model_dir: str, image_dir: str):
+ rgbs = read_images(image_dir)
+ device = 'CPU' # GPU can be used as well.
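+ # When running on GPU, caching compiled models (CACHE_DIR) speeds up pipeline creation on subsequent runs.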
+ enable_compile_cache = dict()
+ if "GPU" == device:
+ enable_compile_cache["CACHE_DIR"] = "vlm_cache"
+ pipe = openvino_genai.VLMPipeline(model_dir, device, **enable_compile_cache)
+
+ config = openvino_genai.GenerationConfig()
+ config.max_new_tokens = 100
+
+ pipe.start_chat()
+ prompt = input('question:\n')
+ pipe.generate(prompt, images=rgbs, generation_config=config, streamer=streamer)
+
+ while True:
+ try:
+ prompt = input("\n----------\n"
+ "question:\n")
+ except EOFError:
+ break
+ pipe.generate(prompt, generation_config=config, streamer=streamer)
+ pipe.finish_chat()
+
+
+ For more information, refer to the
+ `Python sample `__.
+
+ .. tab-item:: C++
+ :sync: cpp
+
+ .. code-block:: cpp
+
+ #include "load_image.hpp"
+ #include <openvino/genai/visual_language/pipeline.hpp>
+ #include <iostream>
+
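+ // The streamer returns true to stop generation; here it returns true only if writing to std::cout fails.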
+ bool print_subword(std::string&& subword) {
+ return !(std::cout << subword << std::flush);
+ }
+
+ int main(int argc, char* argv[]) try {
+ if (3 != argc) {
+ throw std::runtime_error(std::string{"Usage "} + argv[0] + " <MODEL_DIR> <IMAGE_FILE OR DIR_WITH_IMAGES>");
+ }
+
+ std::vector<ov::Tensor> rgbs = utils::load_images(argv[2]);
+
+ std::string device = "CPU"; // GPU can be used as well.
+ ov::AnyMap enable_compile_cache;
+ if ("GPU" == device) {
+ enable_compile_cache.insert({ov::cache_dir("vlm_cache")});
+ }
+ ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache);
+
+ ov::genai::GenerationConfig generation_config;
+ generation_config.max_new_tokens = 100;
+
+ std::string prompt;
+
+ pipe.start_chat();
+ std::cout << "question:\n";
+
+ std::getline(std::cin, prompt);
+ pipe.generate(prompt,
+ ov::genai::images(rgbs),
+ ov::genai::generation_config(generation_config),
+ ov::genai::streamer(print_subword));
+ std::cout << "\n----------\n"
+ "question:\n";
+ while (std::getline(std::cin, prompt)) {
+ pipe.generate(prompt,
+ ov::genai::generation_config(generation_config),
+ ov::genai::streamer(print_subword));
+ std::cout << "\n----------\n"
+ "question:\n";
+ }
+ pipe.finish_chat();
+ } catch (const std::exception& error) {
+ try {
+ std::cerr << error.what() << '\n';
+ } catch (const std::ios_base::failure&) {}
+ return EXIT_FAILURE;
+ } catch (...) {
+ try {
+ std::cerr << "Non-exception object thrown\n";
+ } catch (const std::ios_base::failure&) {}
+ return EXIT_FAILURE;
+ }
+
+
+ For more information, refer to the
+ `C++ sample `__
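+
+
+If a chat session is not needed, a single ``generate()`` call with one image is
+enough. Below is a minimal sketch, reusing the ``read_image`` and ``streamer``
+helpers from the Python sample above (the model directory and image file name
+are placeholders):
+
+.. code-block:: python
+
+   import openvino_genai
+
+   # Placeholder path to a VLM exported to OpenVINO IR.
+   pipe = openvino_genai.VLMPipeline("vlm_model_dir", "CPU")
+
+   config = openvino_genai.GenerationConfig()
+   config.max_new_tokens = 100
+
+   # read_image() and streamer() are the helpers defined in the sample above.
+   image = read_image("cat.png")
+   pipe.generate("Describe this image.", images=[image], generation_config=config, streamer=streamer)
+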
Additional Resources
#####################
@@ -423,4 +558,6 @@ Additional Resources
* :doc:`Install OpenVINO GenAI <../../../get-started/install-openvino/install-openvino-genai>`
* `OpenVINO GenAI Repo `__
* `OpenVINO GenAI Samples `__
+* A Jupyter notebook demonstrating
+ `Visual-language assistant with MiniCPM-V2 and OpenVINO `__
* `OpenVINO Tokenizers `__