From fd5b8a55d8f6fa106f4acabf7cd241e976f9fa3e Mon Sep 17 00:00:00 2001 From: siddhivelankar23 Date: Mon, 14 Oct 2024 14:27:17 +0000 Subject: [PATCH 01/17] add gaudi files --- gaudi_spawn.py | 65 ++++++ gaudi_utils/embeddings.py | 257 ++++++++++++++++++++++++ gaudi_utils/pipeline.py | 361 ++++++++++++++++++++++++++++++++++ gaudi_utils/test_deepspeed.py | 34 ++++ gaudi_utils/test_pipeline.py | 71 +++++++ 5 files changed, 788 insertions(+) create mode 100644 gaudi_spawn.py create mode 100644 gaudi_utils/embeddings.py create mode 100644 gaudi_utils/pipeline.py create mode 100644 gaudi_utils/test_deepspeed.py create mode 100644 gaudi_utils/test_pipeline.py diff --git a/gaudi_spawn.py b/gaudi_spawn.py new file mode 100644 index 00000000..142e5809 --- /dev/null +++ b/gaudi_spawn.py @@ -0,0 +1,65 @@ +import sys +from argparse import REMAINDER, ArgumentParser + +from optimum.habana.distributed import DistributedRunner + + +def parse_args(): + """ + Helper function parsing the command line options. + @retval ArgumentParser + """ + parser = ArgumentParser( + description=( + "Habana Gaudi distributed inference launch helper utility that will spawn up multiple distributed" + " processes." + ) + ) + + # Optional arguments for the launch helper + parser.add_argument("--world_size", type=int, default=1, help="Number of HPUs to use (1, 4 or 8)") + parser.add_argument("--hostfile", type=str, default=None, help="Path to the file where hosts are specified.") + parser.add_argument("--use_mpi", action="store_true", help="Use MPI for distributed inference") + parser.add_argument("--use_deepspeed", action="store_true", help="Use DeepSpeed for distributed inference") + + # positional + parser.add_argument( + "inference_script", + type=str, + help=( + "The full path to the single HPU inference " + "program/script to be launched in parallel, " + "followed by all the arguments for the " + "inference script." 
+ ), + ) + + # rest from the training program + parser.add_argument("inference_script_args", nargs=REMAINDER) + + return parser.parse_args() + + +def main(): + args = parse_args() + + # Patch sys.argv + sys.argv = [args.inference_script] + args.inference_script_args + # Handle the case where arguments contain whitespaces + argv = ['"{}"'.format(arg) if " " in arg and arg[0] != '"' and arg[-1] != '"' else arg for arg in sys.argv] + command_list = [" ".join(argv)] + + distributed_runner = DistributedRunner( + command_list=command_list, + world_size=args.world_size, + hostfile=args.hostfile, + use_mpi=False, + use_deepspeed=args.use_deepspeed, + ) + + ret_code = distributed_runner.run() + sys.exit(ret_code) + + +if __name__ == "__main__": + main() diff --git a/gaudi_utils/embeddings.py b/gaudi_utils/embeddings.py new file mode 100644 index 00000000..83db944a --- /dev/null +++ b/gaudi_utils/embeddings.py @@ -0,0 +1,257 @@ +import json +import os +from collections import OrderedDict + +import numpy as np +import torch +from InstructorEmbedding import INSTRUCTOR_Pooling, INSTRUCTOR_Transformer +from InstructorEmbedding.instructor import batch_to_device, import_from_string +from sentence_transformers import SentenceTransformer +from tqdm.autonotebook import trange + +from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings + + +class GaudiSentenceTransformer(SentenceTransformer): + """Child class that overrides the tokenize method from SentenceTransformer""" + + def __init__(self, model_name_or_path, embedding_input_size=-1, **kwargs): + super().__init__(model_name_or_path, **kwargs) + self.embedding_input_size = embedding_input_size + + def tokenize(self, texts): + """Override tokenize method from SentenceTransformer""" + return self._first_module().tokenizer( + texts, + max_length=self.max_seq_length + if (self.embedding_input_size == -1 or self.embedding_input_size > self.max_seq_length) + else self.embedding_input_size, + padding="max_length", + return_tensors="pt", + truncation=True, + ) + + +class GaudiHuggingFaceEmbeddings(HuggingFaceEmbeddings): + """Child class that uses a GaudiSentenceTransformer client""" + + def __init__(self, embedding_input_size=-1, **kwargs): + super().__init__(**kwargs) + self.client = GaudiSentenceTransformer( + self.model_name, + embedding_input_size=embedding_input_size, + cache_folder=self.cache_folder, + **self.model_kwargs, + ) + + +class GaudiINSTRUCTOR(GaudiSentenceTransformer): + """INSTRUCTOR class for running on Gaudis. 
Code taken from instructor-embedding repo""" + + def __init__(self, model_name_or_path, embedding_input_size=-1, **kwargs): + super().__init__(model_name_or_path, embedding_input_size=embedding_input_size, **kwargs) + + def smart_batching_collate(self, batch): + num_texts = len(batch[0].texts) + texts = [[] for _ in range(num_texts)] + labels = [] + + for example in batch: + for idx, text in enumerate(example.texts): + texts[idx].append(text) + + labels.append(example.label) + + labels = torch.tensor(labels) + + sentence_features = [] + for idx in range(num_texts): + assert isinstance(texts[idx][0], list) + assert len(texts[idx][0]) == 2, "The input should have both instruction and input text" + # if len(texts[idx][0])==3: + # print('component 3') + num = len(texts[idx]) + contexts = [] + concatenated_input_texts = [] + for local_idx in range(num): + assert len(texts[idx][local_idx]) == 2 + contexts.append(texts[idx][local_idx][0]) + concatenated_input_texts.append("".join(texts[idx][local_idx])) + assert isinstance(contexts[-1], str) + assert isinstance(concatenated_input_texts[-1], str) + tokenized = self.tokenize(concatenated_input_texts) + context_tok = self.tokenize(contexts) + tokenized["context_masks"] = torch.sum(context_tok["attention_mask"], dim=1) + tokenized["context_masks"] = tokenized["context_masks"] - 1 + for my_idx in range(len(tokenized["context_masks"])): + if tokenized["context_masks"][my_idx] <= 1: + tokenized["context_masks"][my_idx] = 0 + # text_types = [pair[-1] for pair in texts[idx]] + # assert all([tid==1 for tid in text_types]) or all([tid==0 for tid in text_types]) + # tokenized['text_type'] = text_types[0] + # elif len(texts[idx][0])==2: + # input_texts = [pair[0] for pair in texts[idx]] + # text_types = [pair[-1] for pair in texts[idx]] + # assert all([tid == 1 for tid in text_types]) or all([tid == 0 for tid in text_types]) + # tokenized = self.tokenize(input_texts) + # tokenized['text_type'] = text_types[0] + # else: + # raise ValueError('tokenization error') + sentence_features.append(tokenized) + + return sentence_features, labels + + def _load_sbert_model(self, model_path): + """ + Loads a full sentence-transformers model + """ + # Check if the config_sentence_transformers.json file exists (exists since v2 of the framework) + config_sentence_transformers_json_path = os.path.join(model_path, "config_sentence_transformers.json") + if os.path.exists(config_sentence_transformers_json_path): + with open(config_sentence_transformers_json_path) as fIn: + self._model_config = json.load(fIn) + + # Check if a readme exists + model_card_path = os.path.join(model_path, "README.md") + if os.path.exists(model_card_path): + try: + with open(model_card_path, encoding="utf8") as fIn: + self._model_card_text = fIn.read() + except Exception: + pass + + # Load the modules of sentence transformer + modules_json_path = os.path.join(model_path, "modules.json") + with open(modules_json_path) as fIn: + modules_config = json.load(fIn) + + modules = OrderedDict() + for module_config in modules_config: + if module_config["idx"] == 0: + print("load INSTRUCTOR_Transformer") + module_class = INSTRUCTOR_Transformer + elif module_config["idx"] == 1: + module_class = INSTRUCTOR_Pooling + else: + module_class = import_from_string(module_config["type"]) + module = module_class.load(os.path.join(model_path, module_config["path"])) + modules[module_config["name"]] = module + + return modules + + def encode( + self, + sentences, + batch_size: int = 32, + show_progress_bar: bool = None, + 
output_value: str = "sentence_embedding", + convert_to_numpy: bool = True, + convert_to_tensor: bool = False, + device: str = None, + normalize_embeddings: bool = False, + ): + """ + Computes sentence embeddings + + :param sentences: the sentences to embed + :param batch_size: the batch size used for the computation + :param show_progress_bar: Output a progress bar when encode sentences + :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values + :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. + :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy + :param device: Which torch.device to use for the computation + :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. + + :return: + By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. + """ + self.eval() + if show_progress_bar is None: + show_progress_bar = False + + if convert_to_tensor: + convert_to_numpy = False + + if output_value != "sentence_embedding": + convert_to_tensor = False + convert_to_numpy = False + + input_was_string = False + if isinstance(sentences, str) or not hasattr( + sentences, "__len__" + ): # Cast an individual sentence to a list with length 1 + sentences = [sentences] + input_was_string = True + + if device is None: + device = self._target_device + + self.to(device) + + all_embeddings = [] + if isinstance(sentences[0], list): + lengths = [] + for sen in sentences: + lengths.append(-self._text_length(sen[1])) + length_sorted_idx = np.argsort(lengths) + else: + length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) + sentences_sorted = [sentences[idx] for idx in length_sorted_idx] + + for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): + sentences_batch = sentences_sorted[start_index : start_index + batch_size] + features = self.tokenize(sentences_batch) + features = batch_to_device(features, device) + + with torch.no_grad(): + out_features = self.forward(features) + + if output_value == "token_embeddings": + embeddings = [] + for token_emb, attention in zip(out_features[output_value], out_features["attention_mask"]): + last_mask_id = len(attention) - 1 + while last_mask_id > 0 and attention[last_mask_id].item() == 0: + last_mask_id -= 1 + + embeddings.append(token_emb[0 : last_mask_id + 1]) + elif output_value is None: # Return all outputs + embeddings = [] + for sent_idx in range(len(out_features["sentence_embedding"])): + row = {name: out_features[name][sent_idx] for name in out_features} + embeddings.append(row) + else: # Sentence embeddings + embeddings = out_features[output_value] + embeddings = embeddings.detach() + if normalize_embeddings: + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # fixes for #522 and #487 to avoid oom problems on gpu with large datasets + if convert_to_numpy: + embeddings = embeddings.cpu() + + all_embeddings.extend(embeddings) + + all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] + + if convert_to_tensor: + all_embeddings = torch.stack(all_embeddings) + elif convert_to_numpy: + all_embeddings = 
np.asarray([emb.numpy() for emb in all_embeddings]) + + if input_was_string: + all_embeddings = all_embeddings[0] + + return all_embeddings + + +class GaudiHuggingFaceInstructEmbeddings(HuggingFaceInstructEmbeddings): + """Child class that uses a GaudiINSTRUCTOR client""" + + def __init__(self, embedding_input_size=-1, **kwargs): + super().__init__(**kwargs) + self.client = GaudiINSTRUCTOR( + self.model_name, + embedding_input_size=embedding_input_size, + cache_folder=self.cache_folder, + **self.model_kwargs, + ) diff --git a/gaudi_utils/pipeline.py b/gaudi_utils/pipeline.py new file mode 100644 index 00000000..8768c5f5 --- /dev/null +++ b/gaudi_utils/pipeline.py @@ -0,0 +1,361 @@ +import copy +import json +import os +from pathlib import Path +from typing import List + +import habana_frameworks.torch.hpu as torch_hpu +import torch +from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu +from habana_frameworks.torch.hpu import wrap_in_hpu_graph +from huggingface_hub import snapshot_download +from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi +from optimum.habana.utils import set_seed +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList +from transformers.utils import is_offline_mode + + +def get_repo_root(model_name_or_path, local_rank=-1, token=None): + """ + Downloads the specified model checkpoint and returns the repository where it was downloaded. + """ + if Path(model_name_or_path).is_dir(): + # If it is a local model, no need to download anything + return model_name_or_path + else: + # Checks if online or not + if is_offline_mode(): + if local_rank == 0: + print("Offline mode: forcing local_files_only=True") + + # Only download PyTorch weights by default + allow_patterns = ["*.bin"] + + # Download only on first process + if local_rank in [-1, 0]: + cache_dir = snapshot_download( + model_name_or_path, + local_files_only=is_offline_mode(), + cache_dir=os.getenv("TRANSFORMERS_CACHE", None), + allow_patterns=allow_patterns, + max_workers=16, + token=token, + ) + if local_rank == -1: + # If there is only one process, then the method is finished + return cache_dir + + # Make all processes wait so that other processes can get the checkpoint directly from cache + torch.distributed.barrier() + + return snapshot_download( + model_name_or_path, + local_files_only=is_offline_mode(), + cache_dir=os.getenv("TRANSFORMERS_CACHE", None), + allow_patterns=allow_patterns, + token=token, + ) + + +def get_checkpoint_files(model_name_or_path, local_rank): + """ + Gets the list of files for the specified model checkpoint. + """ + cached_repo_dir = get_repo_root(model_name_or_path, local_rank) + + # Extensions: .bin | .pt + # Creates a list of paths from all downloaded files in cache dir + file_list = [str(entry) for entry in Path(cached_repo_dir).rglob("*.[bp][it][n]") if entry.is_file()] + return file_list + + +def write_checkpoints_json(model_name_or_path, local_rank, checkpoints_json): + """ + Dumps metadata into a JSON file for DeepSpeed-inference. + """ + checkpoint_files = get_checkpoint_files(model_name_or_path, local_rank) + if local_rank == 0: + data = {"type": "ds_model", "checkpoints": checkpoint_files, "version": 1.0} + with open(checkpoints_json, "w") as fp: + json.dump(data, fp) + + +def model_on_meta(config): + """ + Checks if load the model to meta. 
+ """ + return config.model_type in ["bloom", "llama"] + + +def get_optimized_model_name(config): + from optimum.habana.transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES + + for model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES: + if model_type == config.model_type: + return model_type + + return None + + +def model_is_optimized(config): + """ + Checks if the given config belongs to a model in optimum/habana/transformers/models, which has a + new input token_idx. + """ + return get_optimized_model_name(config) is not None + + +def get_ds_injection_policy(config): + """ + Defines injection policies for model parallelism via DeepSpeed. + """ + model_type = get_optimized_model_name(config) + policy = {} + if model_type: + if model_type == "bloom": + from transformers.models.bloom.modeling_bloom import BloomBlock + + policy = {BloomBlock: ("self_attention.dense", "mlp.dense_4h_to_h")} + + if model_type == "opt": + from transformers.models.opt.modeling_opt import OPTDecoderLayer + + policy = {OPTDecoderLayer: ("self_attn.out_proj", ".fc2")} + + if model_type == "gpt2": + from transformers.models.gpt2.modeling_gpt2 import GPT2MLP + + policy = {GPT2MLP: ("attn.c_proj", "mlp.c_proj")} + + if model_type == "gptj": + from transformers.models.gptj.modeling_gptj import GPTJBlock + + policy = {GPTJBlock: ("attn.out_proj", "mlp.fc_out")} + + if model_type == "gpt_neox": + from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer + + policy = {GPTNeoXLayer: ("attention.dense", "mlp.dense_4h_to_h")} + + if model_type == "llama": + from transformers.models.llama.modeling_llama import LlamaDecoderLayer + + policy = {LlamaDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")} + + return policy + + +class CustomStoppingCriteria(StoppingCriteria): + """ " + A custom stopping criteria which stops text generation when a stop token is generated. + """ + + def __init__(self, stop_token_id): + super().__init__() + self.stop_token_id = stop_token_id + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): + return self.stop_token_id in input_ids[0] + + +class GaudiTextGenerationPipeline: + """ + An end-to-end text-generation pipeline that can used to initialize LangChain classes. It supports both single-hpu and multi-hpu inference. 
+ """ + + def __init__(self, model_name_or_path=None, **kwargs): + self.use_deepspeed = "deepspeed" in os.environ["_"] + + if self.use_deepspeed: + world_size, _, self.local_rank = initialize_distributed_hpu() + + import deepspeed + + # Initialize Deepspeed processes + deepspeed.init_distributed(dist_backend="hccl") + + self.task = "text-generation" + self.device = "hpu" + + # Tweak generation so that it runs faster on Gaudi + adapt_transformers_to_gaudi() + set_seed(27) + + self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + + model_dtype = torch.bfloat16 + + if self.use_deepspeed: + config = AutoConfig.from_pretrained(model_name_or_path) + is_optimized = model_is_optimized(config) + load_to_meta = model_on_meta(config) + + if load_to_meta: + # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load + with deepspeed.OnDevice(dtype=model_dtype, device="meta"): + model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) + else: + get_repo_root(model_name_or_path, local_rank=self.local_rank) + # placement on hpu if meta tensors are not supported + with deepspeed.OnDevice(dtype=model_dtype, device="hpu"): + model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=model_dtype) + model = model.eval() + + # Initialize the model + ds_inference_kwargs = {"dtype": model_dtype} + ds_inference_kwargs["tensor_parallel"] = {"tp_size": world_size} + ds_inference_kwargs["enable_cuda_graph"] = True + + if load_to_meta: + # model loaded to meta is managed differently + checkpoints_json = "checkpoints.json" + write_checkpoints_json(model_name_or_path, self.local_rank, checkpoints_json) + + # Make sure all devices/nodes have access to the model checkpoints + torch.distributed.barrier() + + ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config) + if load_to_meta: + ds_inference_kwargs["checkpoint"] = checkpoints_json + + model = deepspeed.init_inference(model, **ds_inference_kwargs) + model = model.module + else: + get_repo_root(model_name_or_path) + model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=model_dtype) + model = model.eval().to(self.device) + is_optimized = model_is_optimized(model.config) + model = wrap_in_hpu_graph(model) + + self.model = model + + # Used for padding input to fixed length + self.tokenizer.padding_side = "left" + self.max_padding_length = kwargs.get("max_padding_length", self.model.config.max_position_embeddings) + + # Define config params for llama models + if self.model.config.model_type == "llama": + self.model.generation_config.pad_token_id = 0 + self.model.generation_config.bos_token_id = 1 + self.model.generation_config.eos_token_id = 2 + self.tokenizer.bos_token_id = self.model.generation_config.bos_token_id + self.tokenizer.eos_token_id = self.model.generation_config.eos_token_id + self.tokenizer.pad_token_id = self.model.generation_config.pad_token_id + self.tokenizer.pad_token = self.tokenizer.decode(self.tokenizer.pad_token_id) + self.tokenizer.eos_token = self.tokenizer.decode(self.tokenizer.eos_token_id) + self.tokenizer.bos_token = self.tokenizer.decode(self.tokenizer.bos_token_id) + + # Applicable to models that do not have pad tokens + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id + + # Edit generation configuration based on input arguments + + self.generation_config = 
copy.deepcopy(self.model.generation_config) + self.generation_config.max_new_tokens = kwargs.get("max_new_tokens", 100) + self.generation_config.use_cache = kwargs.get("use_kv_cache", True) + self.generation_config.static_shapes = is_optimized + self.generation_config.do_sample = kwargs.get("do_sample", False) + self.generation_config.num_beams = kwargs.get("num_beams", 1) + self.generation_config.temperature = kwargs.get("temperature", 1.0) + self.generation_config.top_p = kwargs.get("top_p", 1.0) + self.generation_config.repetition_penalty = kwargs.get("repetition_penalty", 1.0) + self.generation_config.num_return_sequences = kwargs.get("num_return_sequences", 1) + self.generation_config.bad_words_ids = None + self.generation_config.force_words_ids = None + + # Define stopping criteria based on eos token id + self.stopping_criteria = StoppingCriteriaList([CustomStoppingCriteria(self.generation_config.eos_token_id)]) + + self._postprocess_params = {} + + if self.use_deepspeed: + torch.distributed.barrier() + + def __call__(self, prompt: List[str]): + model_inputs = self.tokenizer.encode_plus( + prompt[0], return_tensors="pt", max_length=self.max_padding_length, padding="max_length", truncation=True + ) + + for t in model_inputs: + if torch.is_tensor(model_inputs[t]): + model_inputs[t] = model_inputs[t].to(self.device) + + output = self.model.generate( + **model_inputs, + generation_config=self.generation_config, + lazy_mode=True, + hpu_graphs=True, + profiling_steps=0, + profiling_warmup_steps=0, + ignore_eos = False + ).cpu() + + output_text = self.tokenizer.decode(output[0], skip_special_tokens=True) + del output, model_inputs + + return [{"generated_text": output_text}] + + def get_process_rank(self): + """ + Function that returns process id during distributed inference. + """ + if self.use_deepspeed: + return self.local_rank + return -1 + + def compile_graph(self): + """ + Function to compile computation graphs and synchronize hpus. + """ + for _ in range(3): + self(["Here is my prompt"]) + torch_hpu.synchronize() + + +def main(): + pipe = GaudiTextGenerationPipeline( + model_name_or_path="meta-llama/Llama-2-7b-chat-hf", + max_new_tokens=100, + temperature=0.2, + top_p=0.95, + repetition_penalty=1.15, + do_sample=True, + ) + pipe.compile_graph() + + # Test model on different input prompts + print("Test 1: short prompt") + print(pipe(["Once upon a time"])) + print("Success!\n") + + print("Test 2: long prompt") + print( + pipe( + [ + "Antibiotics are a type of medication used to treat bacterial infections. They work by either killing the bacteria or preventing them from reproducing, allowing the body’s immune system to fight off the infection. Antibiotics are usually taken orally in the form of pills, capsules, or liquid solutions, or sometimes administered intravenously. They are not effective against viral infections, and using them" + ] + ) + ) + print("Success!\n") + + print("Test 3: qa prompt") + print( + pipe( + [ + """Answer the question based on the context below. If the question cannot be answered using the information provided answer with "I don't know". + +Context: Large Language Models (LLMs) are the latest models used in NLP. Their superior performance over smaller models has made them incredibly useful for developers building NLP enabled applications. These models can be accessed via Hugging Face's `transformers` library, via OpenAI using the `openai` library, and via Cohere using the `cohere` library. + +Question: Which libraries and model providers offer LLMs? 
+ +Answer: """ + ] + ) + ) + print("Success!") + + +if __name__ == "__main__": + main() diff --git a/gaudi_utils/test_deepspeed.py b/gaudi_utils/test_deepspeed.py new file mode 100644 index 00000000..e1584e18 --- /dev/null +++ b/gaudi_utils/test_deepspeed.py @@ -0,0 +1,34 @@ +import subprocess +import unittest + + +class TestTextGenPipelineDeepSpeed(unittest.TestCase): + @classmethod + def setUpClass(self): + """Overrides setUpClass from unittest to create artifacts for testing""" + self.base_command = ["python", "../gaudi_spawn.py", "--use_deepspeed", "--world_size"] + + def test_world_size_two(self): + """Test DeepSpeed with world size of 2""" + self.command = self.base_command + ["2", "pipeline.py"] + result = subprocess.run(self.command) + + self.assertEqual(result.returncode, 0) + + def test_world_size_four(self): + """Test DeepSpeed with world size of 4""" + self.command = self.base_command + ["4", "pipeline.py"] + result = subprocess.run(self.command) + + self.assertEqual(result.returncode, 0) + + def test_world_size_eight(self): + """Test DeepSpeed with world size of 8""" + self.command = self.base_command + ["8", "pipeline.py"] + result = subprocess.run(self.command) + + self.assertEqual(result.returncode, 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/gaudi_utils/test_pipeline.py b/gaudi_utils/test_pipeline.py new file mode 100644 index 00000000..9b569722 --- /dev/null +++ b/gaudi_utils/test_pipeline.py @@ -0,0 +1,71 @@ +import time +import unittest + +from pipeline import GaudiTextGenerationPipeline + + +class TestGaudiTextGenPipeline(unittest.TestCase): + @classmethod + def setUpClass(self): + """Overrides setUpClass from unittest to create artifacts for testing""" + self.max_new_tokens = 100 + self.pipe = GaudiTextGenerationPipeline( + model_name_or_path="meta-llama/Llama-2-7b-chat-hf", + max_new_tokens=self.max_new_tokens, + temperature=0.2, + top_p=0.95, + repetition_penalty=1.15, + do_sample=True, + ) + + # Inputs for testing + self.short_prompt = "Once upon a time" + self.long_prompt = "Antibiotics are a type of medication used to treat bacterial infections. They work by either killing the bacteria or preventing them from reproducing, allowing the body’s immune system to fight off the infection. Antibiotics are usually taken orally in the form of pills, capsules, or liquid solutions, or sometimes administered intravenously. They are not effective against viral infections, and using them" + self.qa_prompt = """Answer the question based on the context below. If the question cannot be answered using the information provided answer with "I don't know". + +Context: Large Language Models (LLMs) are the latest models used in NLP. Their superior performance over smaller models has made them incredibly useful for developers building NLP enabled applications. These models can be accessed via Hugging Face's `transformers` library, via OpenAI using the `openai` library, and via Cohere using the `cohere` library. + +Question: Which libraries and model providers offer LLMs? 
+ +Answer: """ + + def test_graph_compilation(self): + """Measure latency for graph compilation.""" + start_time = time.perf_counter() + self.pipe.compile_graph() + end_time = time.perf_counter() + print(f"Graph compilation latency: {end_time-start_time} seconds") + + def test_short_prompt_input(self): + """Test llm with short prompt and measure latency and throughput""" + start_time = time.perf_counter() + output = self.pipe([self.short_prompt]) + end_time = time.perf_counter() + print(f"Generated Text: {repr(output[0]['generated_text'])}") + print(f"Latency: {end_time-start_time} seconds") + throughput = self.max_new_tokens / (end_time - start_time) + print(f"Throughput (including tokenization): {throughput} tokens/second") + + def test_long_prompt_input(self): + """Test llm with long prompt and measure latency and thoughput""" + start_time = time.perf_counter() + output = self.pipe([self.long_prompt]) + end_time = time.perf_counter() + print(f"Generated Text: {repr(output[0]['generated_text'])}") + print(f"Latency: {end_time-start_time} seconds") + throughput = self.max_new_tokens / (end_time - start_time) + print(f"Throughput (including tokenization): {throughput} tokens/second") + + def test_qa_prompt_input(self): + """Test llm with question answering prompt and measure latency and throughput""" + start_time = time.perf_counter() + output = self.pipe([self.qa_prompt]) + end_time = time.perf_counter() + print(f"Generated Text: {repr(output[0]['generated_text'])}") + print(f"Latency: {end_time-start_time} seconds") + throughput = self.max_new_tokens / (end_time - start_time) + print(f"Throughput (including tokenization): {throughput} tokens/second") + + +if __name__ == "__main__": + unittest.main() From c56ffa826426a008d7fed465cefc049b00eaaa9d Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Wed, 16 Oct 2024 00:47:20 -0500 Subject: [PATCH 02/17] update load_model for gaudi --- run_localGPT.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/run_localGPT.py b/run_localGPT.py index 185c983c..591f6e9f 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -59,6 +59,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging): """ logging.info(f"Loading Model: {model_id}, on: {device_type}") logging.info("This action can take a few minutes!") + #process_rank = -1 if model_basename is not None: if ".gguf" in model_basename.lower(): @@ -80,7 +81,21 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging): # main_classes/text_generation#transformers.GenerationConfig.from_pretrained.returns # Create a pipeline for text generation - pipe = pipeline( + if device_type == "hpu": + from gaudi_utils.pipeline import GaudiTextGenerationPipeline + + pipe = GaudiTextGenerationPipeline( + model_name_or_path=model_id, + max_new_tokens=1000, + temperature=temperature, + top_p=top_p, + repetition_penalty=1.15, + do_sample=True, + ) + pipe.compile_graph() + #process_rank = pipe.get_process_rank() + else: + pipe = pipeline( "text-generation", model=model, tokenizer=tokenizer, From 28b8902e9ded02ab3ba8680781e766bd98a5b1d4 Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Wed, 16 Oct 2024 00:50:32 -0500 Subject: [PATCH 03/17] add embeddings for gaudi --- run_localGPT.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/run_localGPT.py b/run_localGPT.py index 591f6e9f..60e9e698 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -141,8 +141,16 @@ def retrieval_qa_pipline(device_type, use_history, 
promptTemplate_type="llama"): (2) Provides additional arguments for instructor and BGE models to improve results, pursuant to the instructions contained on their respective huggingface repository, project page or github repository. """ + if device_type == "hpu": + from gaudi_utils.embeddings import GaudiHuggingFaceEmbeddings - embeddings = get_embeddings(device_type) + embeddings = GaudiHuggingFaceEmbeddings( + embedding_input_size=EMBEDDING_INPUT_SIZE, + model_name=EMBEDDING_MODEL_NAME, + model_kwargs={"device": device_type}, + ) + else: + embeddings = get_embeddings(device_type) logging.info(f"Loaded embeddings from {EMBEDDING_MODEL_NAME}") From 97bf47ddf6eb9f60d292b57d835675459e86da60 Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Wed, 16 Oct 2024 00:55:01 -0500 Subject: [PATCH 04/17] add EMBEDDING_INPUT_SIZE --- run_localGPT.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index 60e9e698..40f0d198 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -36,6 +36,7 @@ MAX_NEW_TOKENS, MODELS_PATH, CHROMA_SETTINGS, + EMBEDDING_INPUT_SIZE, ) @@ -87,8 +88,8 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging): pipe = GaudiTextGenerationPipeline( model_name_or_path=model_id, max_new_tokens=1000, - temperature=temperature, - top_p=top_p, + temperature=0.2, + #top_p=top_p, repetition_penalty=1.15, do_sample=True, ) From ef1d16ec4b4e8af59faa9ef64f4be140cc1acc98 Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Wed, 16 Oct 2024 00:56:05 -0500 Subject: [PATCH 05/17] add EMBEDDING_INPUT_SIZE --- constants.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/constants.py b/constants.py index cf27985d..1b484584 100644 --- a/constants.py +++ b/constants.py @@ -60,6 +60,9 @@ # Default Instructor Model EMBEDDING_MODEL_NAME = "hkunlp/instructor-large" # Uses 1.5 GB of VRAM (High Accuracy with lower VRAM usage) +# Embedding input size for hpu +EMBEDDING_INPUT_SIZE = -1 + #### #### OTHER EMBEDDING MODEL OPTIONS #### From ad35a84bd7a752da4bffeda26e484200fea70be8 Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Wed, 16 Oct 2024 01:41:27 -0500 Subject: [PATCH 06/17] import nltk --- ingest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ingest.py b/ingest.py index 5e61627e..980dffa9 100644 --- a/ingest.py +++ b/ingest.py @@ -18,6 +18,9 @@ SOURCE_DIRECTORY, ) +import nltk +nltk.download('punkt_tab') +nltk.download('averaged_perceptron_tagger_eng') def file_log(logentry): file1 = open("file_ingest.log", "a") From f722d004fd46b7ef94eedb12ffc3a1c9212ac29c Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Tue, 22 Oct 2024 15:46:52 -0500 Subject: [PATCH 07/17] update embeddings --- gaudi_utils/embeddings.py | 281 +++++--------------------------------- 1 file changed, 32 insertions(+), 249 deletions(-) diff --git a/gaudi_utils/embeddings.py b/gaudi_utils/embeddings.py index 83db944a..7a2c3a18 100644 --- a/gaudi_utils/embeddings.py +++ b/gaudi_utils/embeddings.py @@ -1,257 +1,40 @@ -import json -import os -from collections import OrderedDict - -import numpy as np +import logging import torch -from InstructorEmbedding import INSTRUCTOR_Pooling, INSTRUCTOR_Transformer -from InstructorEmbedding.instructor import batch_to_device, import_from_string -from sentence_transformers import SentenceTransformer -from tqdm.autonotebook import trange - -from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings - -class GaudiSentenceTransformer(SentenceTransformer): - """Child class that overrides 
the tokenize method from SentenceTransformer""" +from langchain.embeddings import HuggingFaceEmbeddings +from habana_frameworks.torch.utils.library_loader import load_habana_module +from optimum.habana.sentence_transformers.modeling_utils import ( + adapt_sentence_transformers_to_gaudi, +) - def __init__(self, model_name_or_path, embedding_input_size=-1, **kwargs): - super().__init__(model_name_or_path, **kwargs) - self.embedding_input_size = embedding_input_size - - def tokenize(self, texts): - """Override tokenize method from SentenceTransformer""" - return self._first_module().tokenizer( - texts, - max_length=self.max_seq_length - if (self.embedding_input_size == -1 or self.embedding_input_size > self.max_seq_length) - else self.embedding_input_size, - padding="max_length", - return_tensors="pt", - truncation=True, - ) +from constants import EMBEDDING_MODEL_NAME -class GaudiHuggingFaceEmbeddings(HuggingFaceEmbeddings): - """Child class that uses a GaudiSentenceTransformer client""" +def load_embeddings(): + """Load HuggingFace Embeddings object onto Gaudi or CPU""" + load_habana_module() + if torch.hpu.is_available(): + logging.info("Loading embedding model on hpu") - def __init__(self, embedding_input_size=-1, **kwargs): - super().__init__(**kwargs) - self.client = GaudiSentenceTransformer( - self.model_name, - embedding_input_size=embedding_input_size, - cache_folder=self.cache_folder, - **self.model_kwargs, + adapt_sentence_transformers_to_gaudi() + embeddings = HuggingFaceEmbeddings( + model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": "hpu"} ) - - -class GaudiINSTRUCTOR(GaudiSentenceTransformer): - """INSTRUCTOR class for running on Gaudis. Code taken from instructor-embedding repo""" - - def __init__(self, model_name_or_path, embedding_input_size=-1, **kwargs): - super().__init__(model_name_or_path, embedding_input_size=embedding_input_size, **kwargs) - - def smart_batching_collate(self, batch): - num_texts = len(batch[0].texts) - texts = [[] for _ in range(num_texts)] - labels = [] - - for example in batch: - for idx, text in enumerate(example.texts): - texts[idx].append(text) - - labels.append(example.label) - - labels = torch.tensor(labels) - - sentence_features = [] - for idx in range(num_texts): - assert isinstance(texts[idx][0], list) - assert len(texts[idx][0]) == 2, "The input should have both instruction and input text" - # if len(texts[idx][0])==3: - # print('component 3') - num = len(texts[idx]) - contexts = [] - concatenated_input_texts = [] - for local_idx in range(num): - assert len(texts[idx][local_idx]) == 2 - contexts.append(texts[idx][local_idx][0]) - concatenated_input_texts.append("".join(texts[idx][local_idx])) - assert isinstance(contexts[-1], str) - assert isinstance(concatenated_input_texts[-1], str) - tokenized = self.tokenize(concatenated_input_texts) - context_tok = self.tokenize(contexts) - tokenized["context_masks"] = torch.sum(context_tok["attention_mask"], dim=1) - tokenized["context_masks"] = tokenized["context_masks"] - 1 - for my_idx in range(len(tokenized["context_masks"])): - if tokenized["context_masks"][my_idx] <= 1: - tokenized["context_masks"][my_idx] = 0 - # text_types = [pair[-1] for pair in texts[idx]] - # assert all([tid==1 for tid in text_types]) or all([tid==0 for tid in text_types]) - # tokenized['text_type'] = text_types[0] - # elif len(texts[idx][0])==2: - # input_texts = [pair[0] for pair in texts[idx]] - # text_types = [pair[-1] for pair in texts[idx]] - # assert all([tid == 1 for tid in text_types]) or all([tid == 0 for tid 
in text_types]) - # tokenized = self.tokenize(input_texts) - # tokenized['text_type'] = text_types[0] - # else: - # raise ValueError('tokenization error') - sentence_features.append(tokenized) - - return sentence_features, labels - - def _load_sbert_model(self, model_path): - """ - Loads a full sentence-transformers model - """ - # Check if the config_sentence_transformers.json file exists (exists since v2 of the framework) - config_sentence_transformers_json_path = os.path.join(model_path, "config_sentence_transformers.json") - if os.path.exists(config_sentence_transformers_json_path): - with open(config_sentence_transformers_json_path) as fIn: - self._model_config = json.load(fIn) - - # Check if a readme exists - model_card_path = os.path.join(model_path, "README.md") - if os.path.exists(model_card_path): - try: - with open(model_card_path, encoding="utf8") as fIn: - self._model_card_text = fIn.read() - except Exception: - pass - - # Load the modules of sentence transformer - modules_json_path = os.path.join(model_path, "modules.json") - with open(modules_json_path) as fIn: - modules_config = json.load(fIn) - - modules = OrderedDict() - for module_config in modules_config: - if module_config["idx"] == 0: - print("load INSTRUCTOR_Transformer") - module_class = INSTRUCTOR_Transformer - elif module_config["idx"] == 1: - module_class = INSTRUCTOR_Pooling - else: - module_class = import_from_string(module_config["type"]) - module = module_class.load(os.path.join(model_path, module_config["path"])) - modules[module_config["name"]] = module - - return modules - - def encode( - self, - sentences, - batch_size: int = 32, - show_progress_bar: bool = None, - output_value: str = "sentence_embedding", - convert_to_numpy: bool = True, - convert_to_tensor: bool = False, - device: str = None, - normalize_embeddings: bool = False, - ): - """ - Computes sentence embeddings - - :param sentences: the sentences to embed - :param batch_size: the batch size used for the computation - :param show_progress_bar: Output a progress bar when encode sentences - :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values - :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. - :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy - :param device: Which torch.device to use for the computation - :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. - - :return: - By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. 
- """ - self.eval() - if show_progress_bar is None: - show_progress_bar = False - - if convert_to_tensor: - convert_to_numpy = False - - if output_value != "sentence_embedding": - convert_to_tensor = False - convert_to_numpy = False - - input_was_string = False - if isinstance(sentences, str) or not hasattr( - sentences, "__len__" - ): # Cast an individual sentence to a list with length 1 - sentences = [sentences] - input_was_string = True - - if device is None: - device = self._target_device - - self.to(device) - - all_embeddings = [] - if isinstance(sentences[0], list): - lengths = [] - for sen in sentences: - lengths.append(-self._text_length(sen[1])) - length_sorted_idx = np.argsort(lengths) - else: - length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) - sentences_sorted = [sentences[idx] for idx in length_sorted_idx] - - for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): - sentences_batch = sentences_sorted[start_index : start_index + batch_size] - features = self.tokenize(sentences_batch) - features = batch_to_device(features, device) - - with torch.no_grad(): - out_features = self.forward(features) - - if output_value == "token_embeddings": - embeddings = [] - for token_emb, attention in zip(out_features[output_value], out_features["attention_mask"]): - last_mask_id = len(attention) - 1 - while last_mask_id > 0 and attention[last_mask_id].item() == 0: - last_mask_id -= 1 - - embeddings.append(token_emb[0 : last_mask_id + 1]) - elif output_value is None: # Return all outputs - embeddings = [] - for sent_idx in range(len(out_features["sentence_embedding"])): - row = {name: out_features[name][sent_idx] for name in out_features} - embeddings.append(row) - else: # Sentence embeddings - embeddings = out_features[output_value] - embeddings = embeddings.detach() - if normalize_embeddings: - embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) - - # fixes for #522 and #487 to avoid oom problems on gpu with large datasets - if convert_to_numpy: - embeddings = embeddings.cpu() - - all_embeddings.extend(embeddings) - - all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] - - if convert_to_tensor: - all_embeddings = torch.stack(all_embeddings) - elif convert_to_numpy: - all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) - - if input_was_string: - all_embeddings = all_embeddings[0] - - return all_embeddings - - -class GaudiHuggingFaceInstructEmbeddings(HuggingFaceInstructEmbeddings): - """Child class that uses a GaudiINSTRUCTOR client""" - - def __init__(self, embedding_input_size=-1, **kwargs): - super().__init__(**kwargs) - self.client = GaudiINSTRUCTOR( - self.model_name, - embedding_input_size=embedding_input_size, - cache_folder=self.cache_folder, - **self.model_kwargs, + else: + logging.info("Loading embedding model on cpu") + embeddings = HuggingFaceEmbeddings( + model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": "cpu"} ) + return embeddings + + +def calculate_similarity(model, response, expected_answer): + """Calculate similarity between response and expected answer using the model""" + response_embedding = model.client.encode(response, convert_to_tensor=True).squeeze() + expected_embedding = model.client.encode( + expected_answer, convert_to_tensor=True + ).squeeze() + similarity_score = torch.nn.functional.cosine_similarity( + response_embedding, expected_embedding, dim=0 + ) + return similarity_score.item() From 
d85bd07e5c3d8cf695c5a54a0f9ccff44eecfe89 Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Tue, 22 Oct 2024 15:47:25 -0500 Subject: [PATCH 08/17] update pipeline --- gaudi_utils/pipeline.py | 253 +++++----------------------------------- 1 file changed, 30 insertions(+), 223 deletions(-) diff --git a/gaudi_utils/pipeline.py b/gaudi_utils/pipeline.py index 8768c5f5..cd50f76f 100644 --- a/gaudi_utils/pipeline.py +++ b/gaudi_utils/pipeline.py @@ -1,17 +1,17 @@ import copy -import json import os +import torch from pathlib import Path from typing import List import habana_frameworks.torch.hpu as torch_hpu -import torch -from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu + from habana_frameworks.torch.hpu import wrap_in_hpu_graph from huggingface_hub import snapshot_download +from optimum.habana.transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi from optimum.habana.utils import set_seed -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList +from transformers import AutoModelForCausalLM, AutoTokenizer, TextGenerationPipeline from transformers.utils import is_offline_mode @@ -57,39 +57,7 @@ def get_repo_root(model_name_or_path, local_rank=-1, token=None): ) -def get_checkpoint_files(model_name_or_path, local_rank): - """ - Gets the list of files for the specified model checkpoint. - """ - cached_repo_dir = get_repo_root(model_name_or_path, local_rank) - - # Extensions: .bin | .pt - # Creates a list of paths from all downloaded files in cache dir - file_list = [str(entry) for entry in Path(cached_repo_dir).rglob("*.[bp][it][n]") if entry.is_file()] - return file_list - - -def write_checkpoints_json(model_name_or_path, local_rank, checkpoints_json): - """ - Dumps metadata into a JSON file for DeepSpeed-inference. - """ - checkpoint_files = get_checkpoint_files(model_name_or_path, local_rank) - if local_rank == 0: - data = {"type": "ds_model", "checkpoints": checkpoint_files, "version": 1.0} - with open(checkpoints_json, "w") as fp: - json.dump(data, fp) - - -def model_on_meta(config): - """ - Checks if load the model to meta. - """ - return config.model_type in ["bloom", "llama"] - - def get_optimized_model_name(config): - from optimum.habana.transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES - for model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES: if model_type == config.model_type: return model_type @@ -105,75 +73,11 @@ def model_is_optimized(config): return get_optimized_model_name(config) is not None -def get_ds_injection_policy(config): - """ - Defines injection policies for model parallelism via DeepSpeed. 
- """ - model_type = get_optimized_model_name(config) - policy = {} - if model_type: - if model_type == "bloom": - from transformers.models.bloom.modeling_bloom import BloomBlock - - policy = {BloomBlock: ("self_attention.dense", "mlp.dense_4h_to_h")} - - if model_type == "opt": - from transformers.models.opt.modeling_opt import OPTDecoderLayer - - policy = {OPTDecoderLayer: ("self_attn.out_proj", ".fc2")} - - if model_type == "gpt2": - from transformers.models.gpt2.modeling_gpt2 import GPT2MLP - - policy = {GPT2MLP: ("attn.c_proj", "mlp.c_proj")} - - if model_type == "gptj": - from transformers.models.gptj.modeling_gptj import GPTJBlock - - policy = {GPTJBlock: ("attn.out_proj", "mlp.fc_out")} - - if model_type == "gpt_neox": - from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer - - policy = {GPTNeoXLayer: ("attention.dense", "mlp.dense_4h_to_h")} - - if model_type == "llama": - from transformers.models.llama.modeling_llama import LlamaDecoderLayer - - policy = {LlamaDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")} - - return policy - - -class CustomStoppingCriteria(StoppingCriteria): - """ " - A custom stopping criteria which stops text generation when a stop token is generated. - """ - - def __init__(self, stop_token_id): - super().__init__() - self.stop_token_id = stop_token_id - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): - return self.stop_token_id in input_ids[0] - - -class GaudiTextGenerationPipeline: +class GaudiTextGenerationPipeline(TextGenerationPipeline): """ - An end-to-end text-generation pipeline that can used to initialize LangChain classes. It supports both single-hpu and multi-hpu inference. + An end-to-end text-generation pipeline that can used to initialize LangChain classes. 
""" - - def __init__(self, model_name_or_path=None, **kwargs): - self.use_deepspeed = "deepspeed" in os.environ["_"] - - if self.use_deepspeed: - world_size, _, self.local_rank = initialize_distributed_hpu() - - import deepspeed - - # Initialize Deepspeed processes - deepspeed.init_distributed(dist_backend="hccl") - + def __init__(self, model_name_or_path=None, revision="main", **kwargs): self.task = "text-generation" self.device = "hpu" @@ -181,60 +85,24 @@ def __init__(self, model_name_or_path=None, **kwargs): adapt_transformers_to_gaudi() set_seed(27) - self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - + # Initialize tokenizer and define datatype + self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, revision=revision) model_dtype = torch.bfloat16 - if self.use_deepspeed: - config = AutoConfig.from_pretrained(model_name_or_path) - is_optimized = model_is_optimized(config) - load_to_meta = model_on_meta(config) - - if load_to_meta: - # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load - with deepspeed.OnDevice(dtype=model_dtype, device="meta"): - model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) - else: - get_repo_root(model_name_or_path, local_rank=self.local_rank) - # placement on hpu if meta tensors are not supported - with deepspeed.OnDevice(dtype=model_dtype, device="hpu"): - model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=model_dtype) - model = model.eval() - - # Initialize the model - ds_inference_kwargs = {"dtype": model_dtype} - ds_inference_kwargs["tensor_parallel"] = {"tp_size": world_size} - ds_inference_kwargs["enable_cuda_graph"] = True - - if load_to_meta: - # model loaded to meta is managed differently - checkpoints_json = "checkpoints.json" - write_checkpoints_json(model_name_or_path, self.local_rank, checkpoints_json) - - # Make sure all devices/nodes have access to the model checkpoints - torch.distributed.barrier() - - ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config) - if load_to_meta: - ds_inference_kwargs["checkpoint"] = checkpoints_json - - model = deepspeed.init_inference(model, **ds_inference_kwargs) - model = model.module - else: - get_repo_root(model_name_or_path) - model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=model_dtype) - model = model.eval().to(self.device) - is_optimized = model_is_optimized(model.config) - model = wrap_in_hpu_graph(model) - + # Intialize model + get_repo_root(model_name_or_path) + model = AutoModelForCausalLM.from_pretrained(model_name_or_path, revision=revision, torch_dtype=model_dtype) + model = model.eval().to(self.device) + is_optimized = model_is_optimized(model.config) + model = wrap_in_hpu_graph(model) self.model = model # Used for padding input to fixed length self.tokenizer.padding_side = "left" self.max_padding_length = kwargs.get("max_padding_length", self.model.config.max_position_embeddings) - # Define config params for llama models - if self.model.config.model_type == "llama": + # Define config params for llama and mistral models + if self.model.config.model_type in ["llama", "mistral"]: self.model.generation_config.pad_token_id = 0 self.model.generation_config.bos_token_id = 1 self.model.generation_config.eos_token_id = 2 @@ -251,7 +119,6 @@ def __init__(self, model_name_or_path=None, **kwargs): self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id # Edit generation configuration based on input 
arguments - self.generation_config = copy.deepcopy(self.model.generation_config) self.generation_config.max_new_tokens = kwargs.get("max_new_tokens", 100) self.generation_config.use_cache = kwargs.get("use_kv_cache", True) @@ -264,47 +131,34 @@ def __init__(self, model_name_or_path=None, **kwargs): self.generation_config.num_return_sequences = kwargs.get("num_return_sequences", 1) self.generation_config.bad_words_ids = None self.generation_config.force_words_ids = None + self.generation_config.ignore_eos = False - # Define stopping criteria based on eos token id - self.stopping_criteria = StoppingCriteriaList([CustomStoppingCriteria(self.generation_config.eos_token_id)]) - + # Define empty post-process params dict as there is no postprocesing self._postprocess_params = {} - if self.use_deepspeed: - torch.distributed.barrier() + # Warm-up hpu and compile computation graphs + self.compile_graph() def __call__(self, prompt: List[str]): - model_inputs = self.tokenizer.encode_plus( - prompt[0], return_tensors="pt", max_length=self.max_padding_length, padding="max_length", truncation=True - ) + """ + __call__ method of pipeline class + """ + # Tokenize input string + model_inputs = self.tokenizer.encode_plus(prompt[0], return_tensors="pt", max_length=self.max_padding_length, padding="max_length", truncation=True) + # Move tensors to hpu for t in model_inputs: if torch.is_tensor(model_inputs[t]): model_inputs[t] = model_inputs[t].to(self.device) - output = self.model.generate( - **model_inputs, - generation_config=self.generation_config, - lazy_mode=True, - hpu_graphs=True, - profiling_steps=0, - profiling_warmup_steps=0, - ignore_eos = False - ).cpu() + # Call model's generate method + output = self.model.generate(**model_inputs, generation_config=self.generation_config, lazy_mode=True, hpu_graphs=True, profiling_steps=0, profiling_warmup_steps=0).cpu() + # Decode and return result output_text = self.tokenizer.decode(output[0], skip_special_tokens=True) del output, model_inputs - return [{"generated_text": output_text}] - def get_process_rank(self): - """ - Function that returns process id during distributed inference. - """ - if self.use_deepspeed: - return self.local_rank - return -1 - def compile_graph(self): """ Function to compile computation graphs and synchronize hpus. @@ -312,50 +166,3 @@ def compile_graph(self): for _ in range(3): self(["Here is my prompt"]) torch_hpu.synchronize() - - -def main(): - pipe = GaudiTextGenerationPipeline( - model_name_or_path="meta-llama/Llama-2-7b-chat-hf", - max_new_tokens=100, - temperature=0.2, - top_p=0.95, - repetition_penalty=1.15, - do_sample=True, - ) - pipe.compile_graph() - - # Test model on different input prompts - print("Test 1: short prompt") - print(pipe(["Once upon a time"])) - print("Success!\n") - - print("Test 2: long prompt") - print( - pipe( - [ - "Antibiotics are a type of medication used to treat bacterial infections. They work by either killing the bacteria or preventing them from reproducing, allowing the body’s immune system to fight off the infection. Antibiotics are usually taken orally in the form of pills, capsules, or liquid solutions, or sometimes administered intravenously. They are not effective against viral infections, and using them" - ] - ) - ) - print("Success!\n") - - print("Test 3: qa prompt") - print( - pipe( - [ - """Answer the question based on the context below. If the question cannot be answered using the information provided answer with "I don't know". 
- -Context: Large Language Models (LLMs) are the latest models used in NLP. Their superior performance over smaller models has made them incredibly useful for developers building NLP enabled applications. These models can be accessed via Hugging Face's `transformers` library, via OpenAI using the `openai` library, and via Cohere using the `cohere` library. - -Question: Which libraries and model providers offer LLMs? - -Answer: """ - ] - ) - ) - print("Success!") - - -if __name__ == "__main__": - main() From afe103b36fbe8b752871d3fcc6455b33d19d6cfe Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Tue, 22 Oct 2024 15:48:11 -0500 Subject: [PATCH 09/17] update run code --- run_localGPT.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/run_localGPT.py b/run_localGPT.py index 40f0d198..9239b5b7 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -89,9 +89,10 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging): model_name_or_path=model_id, max_new_tokens=1000, temperature=0.2, - #top_p=top_p, + top_p=0.95, repetition_penalty=1.15, do_sample=True, + max_padding_length=5000, ) pipe.compile_graph() #process_rank = pipe.get_process_rank() @@ -138,18 +139,18 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"): """ (1) Chooses an appropriate langchain library based on the enbedding model name. Matching code is contained within ingest.py. - + (2) Provides additional arguments for instructor and BGE models to improve results, pursuant to the instructions contained on their respective huggingface repository, project page or github repository. """ if device_type == "hpu": - from gaudi_utils.embeddings import GaudiHuggingFaceEmbeddings + from gaudi_utils.embeddings import load_embeddings - embeddings = GaudiHuggingFaceEmbeddings( - embedding_input_size=EMBEDDING_INPUT_SIZE, - model_name=EMBEDDING_MODEL_NAME, - model_kwargs={"device": device_type}, - ) + embeddings = load_embeddings() + #embedding_input_size=EMBEDDING_INPUT_SIZE, + #model_name=EMBEDDING_MODEL_NAME, + #model_kwargs={"device": device_type}, + #) else: embeddings = get_embeddings(device_type) From d659cfd9508e4fa74cc47e8c3d2d0fd51e6dc39f Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Tue, 22 Oct 2024 15:51:38 -0500 Subject: [PATCH 10/17] update model to mistral --- constants.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/constants.py b/constants.py index 1b484584..e0bcc4e9 100644 --- a/constants.py +++ b/constants.py @@ -29,7 +29,7 @@ ) # Context Window and Max New Tokens -CONTEXT_WINDOW_SIZE = 8096 +CONTEXT_WINDOW_SIZE = 2048 MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4) #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing @@ -109,8 +109,10 @@ # MODEL_ID = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" # MODEL_BASENAME = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf" +MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2" + # LLAMA 3 # use for Apple Silicon -MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +#MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" MODEL_BASENAME = None # LLAMA 3 # use for NVIDIA GPUs From f3ee3929155000a9cd2ef38ea905162328900108 Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Tue, 22 Oct 2024 15:52:54 -0500 Subject: [PATCH 11/17] add hpu --- load_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/load_models.py 
b/load_models.py index ad278166..07ae4edd 100644 --- a/load_models.py +++ b/load_models.py @@ -135,7 +135,7 @@ def load_full_model(model_id, model_basename, device_type, logging): - Additional settings are provided for NVIDIA GPUs, such as loading in 4-bit and setting the compute dtype. """ - if device_type.lower() in ["mps", "cpu"]: + if device_type.lower() in ["mps", "cpu", "hpu"]: logging.info("Using AutoModelForCausalLM") # tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/") # model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/") From 6d89108b4743741893285b62fc57bcedc779b247 Mon Sep 17 00:00:00 2001 From: siddhivelankar23 Date: Wed, 23 Oct 2024 19:26:01 +0000 Subject: [PATCH 12/17] add dockerfile for hpu --- Dockerfile_hpu | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 Dockerfile_hpu diff --git a/Dockerfile_hpu b/Dockerfile_hpu new file mode 100644 index 00000000..5d8f25cc --- /dev/null +++ b/Dockerfile_hpu @@ -0,0 +1,45 @@ +FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest + +ENV HABANA_VISIBLE_DEVICES=all +ENV OMPI_MCA_btl_vader_single_copy_mechanism=none +ENV PT_HPU_LAZY_ACC_PAR_MODE=0 +ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=1 + +# Install linux packages +ENV DEBIAN_FRONTEND="noninteractive" TZ=Etc/UTC +RUN apt-get update && apt-get install -y tzdata bash-completion python3-pip openssh-server \ + vim git iputils-ping net-tools protobuf-compiler curl bc gawk tmux \ + && rm -rf /var/lib/apt/lists/* + +# Add repo contents +ADD localGPT /root/localGPT +WORKDIR /root/localGPT + +# Install python packages +RUN pip install --upgrade pip \ + && pip install langchain-experimental==0.0.62 \ + && pip install langchain==0.0.329 \ + && pip install protobuf==3.20.2 \ + && pip install grpcio-tools \ + && pip install pymilvus==2.4.0 \ + && pip install chromadb==0.5.15 \ + && pip install llama-cpp-python==0.1.66 \ + && pip install pdfminer.six==20221105 \ + && pip install transformers==4.43.1 \ + && pip install optimum[habana]==1.13.1 \ + && pip install InstructorEmbedding==1.0.1 \ + && pip install sentence-transformers==3.0.1 \ + && pip install faiss-cpu==1.7.4 \ + && pip install huggingface_hub==0.16.4 \ + && pip install protobuf==3.20.2 \ + && pip install auto-gptq==0.2.2 \ + && pip install docx2txt unstructured unstructured[pdf] urllib3 accelerate \ + && pip install bitsandbytes \ + && pip install click flask requests openpyxl \ + && pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0 \ + && pip install python-multipart \ + && pip install fastapi \ + && pip install uvicorn \ + && pip install gptcache==0.1.43 \ + && pip install pypdf==4.3.1 \ + && pip install python-jose[cryptography] From 2cb80aeb589360c7f4a904b92d3e3a5c2233a0a5 Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Wed, 23 Oct 2024 14:27:17 -0500 Subject: [PATCH 13/17] delete gaudi_spawn.py --- gaudi_spawn.py | 65 -------------------------------------------------- 1 file changed, 65 deletions(-) delete mode 100644 gaudi_spawn.py diff --git a/gaudi_spawn.py b/gaudi_spawn.py deleted file mode 100644 index 142e5809..00000000 --- a/gaudi_spawn.py +++ /dev/null @@ -1,65 +0,0 @@ -import sys -from argparse import REMAINDER, ArgumentParser - -from optimum.habana.distributed import DistributedRunner - - -def parse_args(): - """ - Helper function parsing the command line options. 
- @retval ArgumentParser - """ - parser = ArgumentParser( - description=( - "Habana Gaudi distributed inference launch helper utility that will spawn up multiple distributed" - " processes." - ) - ) - - # Optional arguments for the launch helper - parser.add_argument("--world_size", type=int, default=1, help="Number of HPUs to use (1, 4 or 8)") - parser.add_argument("--hostfile", type=str, default=None, help="Path to the file where hosts are specified.") - parser.add_argument("--use_mpi", action="store_true", help="Use MPI for distributed inference") - parser.add_argument("--use_deepspeed", action="store_true", help="Use DeepSpeed for distributed inference") - - # positional - parser.add_argument( - "inference_script", - type=str, - help=( - "The full path to the single HPU inference " - "program/script to be launched in parallel, " - "followed by all the arguments for the " - "inference script." - ), - ) - - # rest from the training program - parser.add_argument("inference_script_args", nargs=REMAINDER) - - return parser.parse_args() - - -def main(): - args = parse_args() - - # Patch sys.argv - sys.argv = [args.inference_script] + args.inference_script_args - # Handle the case where arguments contain whitespaces - argv = ['"{}"'.format(arg) if " " in arg and arg[0] != '"' and arg[-1] != '"' else arg for arg in sys.argv] - command_list = [" ".join(argv)] - - distributed_runner = DistributedRunner( - command_list=command_list, - world_size=args.world_size, - hostfile=args.hostfile, - use_mpi=False, - use_deepspeed=args.use_deepspeed, - ) - - ret_code = distributed_runner.run() - sys.exit(ret_code) - - -if __name__ == "__main__": - main() From d5635adc385366be53bdd331b13587dbf95f436d Mon Sep 17 00:00:00 2001 From: siddhivelankar23 Date: Wed, 23 Oct 2024 19:32:15 +0000 Subject: [PATCH 14/17] cleanup --- constants.py | 3 -- gaudi_utils/test_deepspeed.py | 34 ----------------- gaudi_utils/test_pipeline.py | 71 ----------------------------------- run_localGPT.py | 11 +----- 4 files changed, 2 insertions(+), 117 deletions(-) delete mode 100644 gaudi_utils/test_deepspeed.py delete mode 100644 gaudi_utils/test_pipeline.py diff --git a/constants.py b/constants.py index e0bcc4e9..74a6c0ab 100644 --- a/constants.py +++ b/constants.py @@ -60,9 +60,6 @@ # Default Instructor Model EMBEDDING_MODEL_NAME = "hkunlp/instructor-large" # Uses 1.5 GB of VRAM (High Accuracy with lower VRAM usage) -# Embedding input size for hpu -EMBEDDING_INPUT_SIZE = -1 - #### #### OTHER EMBEDDING MODEL OPTIONS #### diff --git a/gaudi_utils/test_deepspeed.py b/gaudi_utils/test_deepspeed.py deleted file mode 100644 index e1584e18..00000000 --- a/gaudi_utils/test_deepspeed.py +++ /dev/null @@ -1,34 +0,0 @@ -import subprocess -import unittest - - -class TestTextGenPipelineDeepSpeed(unittest.TestCase): - @classmethod - def setUpClass(self): - """Overrides setUpClass from unittest to create artifacts for testing""" - self.base_command = ["python", "../gaudi_spawn.py", "--use_deepspeed", "--world_size"] - - def test_world_size_two(self): - """Test DeepSpeed with world size of 2""" - self.command = self.base_command + ["2", "pipeline.py"] - result = subprocess.run(self.command) - - self.assertEqual(result.returncode, 0) - - def test_world_size_four(self): - """Test DeepSpeed with world size of 4""" - self.command = self.base_command + ["4", "pipeline.py"] - result = subprocess.run(self.command) - - self.assertEqual(result.returncode, 0) - - def test_world_size_eight(self): - """Test DeepSpeed with world size of 8""" - self.command = 
self.base_command + ["8", "pipeline.py"] - result = subprocess.run(self.command) - - self.assertEqual(result.returncode, 0) - - -if __name__ == "__main__": - unittest.main() diff --git a/gaudi_utils/test_pipeline.py b/gaudi_utils/test_pipeline.py deleted file mode 100644 index 9b569722..00000000 --- a/gaudi_utils/test_pipeline.py +++ /dev/null @@ -1,71 +0,0 @@ -import time -import unittest - -from pipeline import GaudiTextGenerationPipeline - - -class TestGaudiTextGenPipeline(unittest.TestCase): - @classmethod - def setUpClass(self): - """Overrides setUpClass from unittest to create artifacts for testing""" - self.max_new_tokens = 100 - self.pipe = GaudiTextGenerationPipeline( - model_name_or_path="meta-llama/Llama-2-7b-chat-hf", - max_new_tokens=self.max_new_tokens, - temperature=0.2, - top_p=0.95, - repetition_penalty=1.15, - do_sample=True, - ) - - # Inputs for testing - self.short_prompt = "Once upon a time" - self.long_prompt = "Antibiotics are a type of medication used to treat bacterial infections. They work by either killing the bacteria or preventing them from reproducing, allowing the body’s immune system to fight off the infection. Antibiotics are usually taken orally in the form of pills, capsules, or liquid solutions, or sometimes administered intravenously. They are not effective against viral infections, and using them" - self.qa_prompt = """Answer the question based on the context below. If the question cannot be answered using the information provided answer with "I don't know". - -Context: Large Language Models (LLMs) are the latest models used in NLP. Their superior performance over smaller models has made them incredibly useful for developers building NLP enabled applications. These models can be accessed via Hugging Face's `transformers` library, via OpenAI using the `openai` library, and via Cohere using the `cohere` library. - -Question: Which libraries and model providers offer LLMs? 
- -Answer: """ - - def test_graph_compilation(self): - """Measure latency for graph compilation.""" - start_time = time.perf_counter() - self.pipe.compile_graph() - end_time = time.perf_counter() - print(f"Graph compilation latency: {end_time-start_time} seconds") - - def test_short_prompt_input(self): - """Test llm with short prompt and measure latency and throughput""" - start_time = time.perf_counter() - output = self.pipe([self.short_prompt]) - end_time = time.perf_counter() - print(f"Generated Text: {repr(output[0]['generated_text'])}") - print(f"Latency: {end_time-start_time} seconds") - throughput = self.max_new_tokens / (end_time - start_time) - print(f"Throughput (including tokenization): {throughput} tokens/second") - - def test_long_prompt_input(self): - """Test llm with long prompt and measure latency and thoughput""" - start_time = time.perf_counter() - output = self.pipe([self.long_prompt]) - end_time = time.perf_counter() - print(f"Generated Text: {repr(output[0]['generated_text'])}") - print(f"Latency: {end_time-start_time} seconds") - throughput = self.max_new_tokens / (end_time - start_time) - print(f"Throughput (including tokenization): {throughput} tokens/second") - - def test_qa_prompt_input(self): - """Test llm with question answering prompt and measure latency and throughput""" - start_time = time.perf_counter() - output = self.pipe([self.qa_prompt]) - end_time = time.perf_counter() - print(f"Generated Text: {repr(output[0]['generated_text'])}") - print(f"Latency: {end_time-start_time} seconds") - throughput = self.max_new_tokens / (end_time - start_time) - print(f"Throughput (including tokenization): {throughput} tokens/second") - - -if __name__ == "__main__": - unittest.main() diff --git a/run_localGPT.py b/run_localGPT.py index 9239b5b7..0cc876d4 100644 --- a/run_localGPT.py +++ b/run_localGPT.py @@ -35,8 +35,7 @@ MODEL_BASENAME, MAX_NEW_TOKENS, MODELS_PATH, - CHROMA_SETTINGS, - EMBEDDING_INPUT_SIZE, + CHROMA_SETTINGS, ) @@ -60,8 +59,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging): """ logging.info(f"Loading Model: {model_id}, on: {device_type}") logging.info("This action can take a few minutes!") - #process_rank = -1 - + if model_basename is not None: if ".gguf" in model_basename.lower(): llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING) @@ -95,7 +93,6 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging): max_padding_length=5000, ) pipe.compile_graph() - #process_rank = pipe.get_process_rank() else: pipe = pipeline( "text-generation", @@ -147,10 +144,6 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"): from gaudi_utils.embeddings import load_embeddings embeddings = load_embeddings() - #embedding_input_size=EMBEDDING_INPUT_SIZE, - #model_name=EMBEDDING_MODEL_NAME, - #model_kwargs={"device": device_type}, - #) else: embeddings = get_embeddings(device_type) From fcd15fc316dc891668a54e2cb526020b8cf23668 Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Wed, 23 Oct 2024 14:43:59 -0500 Subject: [PATCH 15/17] add hpu details to README --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cb22884f..ea9cbb04 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ - **Chat History**: Remembers your previous conversations (in a session). - **API**: LocalGPT has an API that you can use for building RAG Applications. 
- **Graphical Interface**: LocalGPT comes with two GUIs, one uses the API and the other is standalone (based on streamlit). -- **GPU, CPU & MPS Support**: Supports multiple platforms out of the box, Chat with your data using `CUDA`, `CPU` or `MPS` and more! +- **GPU, CPU, HPU & MPS Support**: Supports multiple platforms out of the box, Chat with your data using `CUDA`, `CPU`, `HPU (Intel® Gaudi®)` or `MPS` and more! ## Dive Deeper with Our Videos 🎥 - [Detailed code-walkthrough](https://youtu.be/MlyoObdIHyo) @@ -98,6 +98,7 @@ It includes CUDA, your system just needs Docker, BuildKit, your NVIDIA GPU drive Build as `docker build -t localgpt .`, requires BuildKit. Docker BuildKit does not support GPU during *docker build* time right now, only during *docker run*. Run as `docker run -it --mount src="$HOME/.cache",target=/root/.cache,type=bind --gpus=all localgpt`. +For running the code on Intel® Gaudi® HPU, use the following Dockerfile - `Dockerfile_hpu`. ## Test dataset @@ -173,6 +174,11 @@ You can also specify the device type just like `ingest.py` python run_localGPT.py --device_type mps # to run on Apple silicon ``` +```shell +MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2" # in constants.py +python run_localGPT.py --device_type hpu # to run on Intel® Gaudi® hpu +``` + This will load the ingested vector store and embedding model. You will be presented with a prompt: ```shell From f396f76c6020293013f482af3bae59c4aca9e521 Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Wed, 23 Oct 2024 14:46:46 -0500 Subject: [PATCH 16/17] cleanup readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ea9cbb04..b4a8c6fe 100644 --- a/README.md +++ b/README.md @@ -175,8 +175,9 @@ python run_localGPT.py --device_type mps # to run on Apple silicon ``` ```shell +# To run on Intel® Gaudi® hpu MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2" # in constants.py -python run_localGPT.py --device_type hpu # to run on Intel® Gaudi® hpu +python run_localGPT.py --device_type hpu ``` This will load the ingested vector store and embedding model. You will be presented with a prompt: From c83249b77c464f980b2c72421dffb9afd8e03dd8 Mon Sep 17 00:00:00 2001 From: Siddhi Velankar Date: Thu, 24 Oct 2024 14:13:06 -0500 Subject: [PATCH 17/17] default model llama3 --- constants.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/constants.py b/constants.py index 74a6c0ab..428ee459 100644 --- a/constants.py +++ b/constants.py @@ -106,10 +106,11 @@ # MODEL_ID = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" # MODEL_BASENAME = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf" -MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2" +# Use mistral to run on hpu +# MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2" # LLAMA 3 # use for Apple Silicon -#MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" MODEL_BASENAME = None # LLAMA 3 # use for NVIDIA GPUs
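
Usage note (illustrative only, not part of the patches above): with `gaudi_utils/test_pipeline.py` and `gaudi_utils/test_deepspeed.py` deleted in this series, there is no in-repo smoke test left for the HPU text-generation path. The sketch below drives the refactored pipeline with the same keyword arguments `run_localGPT.py` passes on its `hpu` branch after PATCH 09; the import path is an assumption based on the `gaudi_utils/` layout, and the model id follows the README's HPU recommendation rather than the new Llama-3 default in `constants.py`.

```python
# Minimal smoke-test sketch for the Gaudi pipeline (run on an HPU machine from the repo root).
# Assumption: gaudi_utils/pipeline.py exposes GaudiTextGenerationPipeline as in this series.
from gaudi_utils.pipeline import GaudiTextGenerationPipeline

pipe = GaudiTextGenerationPipeline(
    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.2",  # README suggests Mistral for hpu
    max_new_tokens=1000,
    temperature=0.2,
    top_p=0.95,
    repetition_penalty=1.15,
    do_sample=True,
    max_padding_length=5000,
)

# The constructor already warms up the HPU and compiles the graphs in this series;
# run_localGPT.py still calls compile_graph() explicitly, which just repeats the warm-up.
pipe.compile_graph()

# __call__ takes a single-element list of prompts and returns [{"generated_text": ...}].
result = pipe(["Once upon a time"])
print(result[0]["generated_text"])
```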
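
PATCH 09 also replaces the direct `GaudiHuggingFaceEmbeddings(...)` construction with a `load_embeddings()` helper from `gaudi_utils/embeddings.py`. Its signature is not shown in this series beyond the zero-argument call in `retrieval_qa_pipline`, so the sketch below only illustrates that call plus the standard LangChain embeddings interface; the sample strings are placeholders.

```python
# Sketch of the HPU embeddings path after PATCH 09.
# Assumption: load_embeddings() takes no required arguments and returns a
# LangChain-compatible embeddings object (a HuggingFace/Instructor subclass).
from gaudi_utils.embeddings import load_embeddings

embeddings = load_embeddings()

# Any LangChain embeddings object exposes embed_documents / embed_query,
# which is what the vector store uses during ingestion and retrieval.
doc_vectors = embeddings.embed_documents(["LocalGPT keeps your documents on-device."])
query_vector = embeddings.embed_query("Which libraries and model providers offer LLMs?")
print(len(doc_vectors[0]), len(query_vector))
```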