Refactor LLM text generation native comps (opea-project#1151)
Part of the code refactor to combine the different text generation backends: remove the duplicated native langchain and llama_index folders, and consolidate the optimum habana implementation as the native integration OPEATextGen_Native.

Add feature for issue opea-project#998

Signed-off-by: Xinyao Wang <[email protected]>
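
For context, the refactor hinges on a component-registry pattern: each backend registers under a name, and the text-generation service selects one at runtime instead of importing a specific implementation. Below is a minimal, generic sketch of that pattern (illustration only; the project's real classes, OpeaComponent and OpeaComponentRegistry, appear in the diff further down):

```python
# Generic component-registry pattern (illustration, not the project's code).
_REGISTRY: dict[str, type] = {}


def register(name: str):
    def decorator(cls: type) -> type:
        _REGISTRY[name] = cls  # backends self-register under a stable name
        return cls

    return decorator


@register("OPEATextGen_Native")
class NativeTextGen:
    def invoke(self, query: str) -> str:
        return f"generated text for: {query}"


# The serving layer picks a backend by name, e.g. from an environment variable.
backend = _REGISTRY["OPEATextGen_Native"]()
print(backend.invoke("What is Deep Learning?"))
```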
XinyaoWa authored and smguggen committed Jan 23, 2025
1 parent 8232554 commit 4e00bf4
Showing 23 changed files with 533 additions and 1,436 deletions.
12 changes: 4 additions & 8 deletions .github/workflows/docker/compose/llms-compose.yaml
@@ -7,6 +7,10 @@ services:
     build:
       dockerfile: comps/llms/src/text-generation/Dockerfile
     image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
+  llm-textgen-gaudi:
+    build:
+      dockerfile: comps/llms/src/text-generation/Dockerfile.intel_hpu
+    image: ${REGISTRY:-opea}/llm-textgen-gaudi:${TAG:-latest}
   llm-ollama:
     build:
       dockerfile: comps/llms/text-generation/ollama/langchain/Dockerfile
@@ -19,14 +23,6 @@ services:
     build:
       dockerfile: comps/llms/src/faq-generation/Dockerfile
     image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
-  llm-native:
-    build:
-      dockerfile: comps/llms/text-generation/native/langchain/Dockerfile
-    image: ${REGISTRY:-opea}/llm-native:${TAG:-latest}
-  llm-native-llamaindex:
-    build:
-      dockerfile: comps/llms/text-generation/native/llama_index/Dockerfile
-    image: ${REGISTRY:-opea}/llm-native-llamaindex:${TAG:-latest}
   llm-eval:
     build:
       dockerfile: comps/llms/utils/lm-eval/Dockerfile

This file was deleted.

1 change: 0 additions & 1 deletion comps/llms/src/text-generation/Dockerfile
@@ -23,4 +23,3 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user
 WORKDIR /home/user/comps/llms/src/text-generation
 
 ENTRYPOINT ["bash", "entrypoint.sh"]
-

@@ -2,11 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # HABANA environment
-FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 AS hpu
+FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1 AS hpu
 
 ENV LANG=en_US.UTF-8
 ARG REPO=https://github.com/huggingface/optimum-habana.git
-ARG REPO_VER=v1.12.1
+ARG REPO_VER=v1.15.0
 
 RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     git-lfs \
@@ -23,17 +23,18 @@ RUN git lfs install
 
 COPY comps /home/user/comps
 
-RUN pip install --no-cache-dir --upgrade-strategy eager optimum[habana] && \
-    pip install --no-cache-dir git+https://github.com/HabanaAI/[email protected]
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir --upgrade-strategy eager optimum[habana] && \
+    pip install --no-cache-dir git+https://github.com/HabanaAI/[email protected]
 
 RUN git clone ${REPO} /home/user/optimum-habana && \
     cd /home/user/optimum-habana && git checkout ${REPO_VER} && \
     cd examples/text-generation && pip install --no-cache-dir -r requirements.txt && \
-    cd /home/user/comps/llms/text-generation/native/llama_index && pip install --no-cache-dir -r requirements.txt && \
-    pip install --no-cache-dir --upgrade --force-reinstall pydantic
+    cd /home/user/comps/llms/src/text-generation/ && pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir --upgrade --force-reinstall pydantic numpy==1.23.5
 
 ENV PYTHONPATH=/root:/home/user
 
-WORKDIR /home/user/comps/llms/text-generation/native/llama_index
+WORKDIR /home/user/comps/llms/src/text-generation/
 
-ENTRYPOINT ["python", "llm.py"]
+ENTRYPOINT ["bash", "entrypoint.sh"]
@@ -21,7 +21,7 @@ export HUGGINGFACEHUB_API_TOKEN="your_huggingface_token"
 
 ```bash
 cd ../../../../../
-docker build -t opea/llm-native:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/native/langchain/Dockerfile .
+docker build -t opea/llm-native:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
 ```
 
 To start a docker container, you have two options:
255 changes: 255 additions & 0 deletions comps/llms/src/text-generation/integrations/native.py
@@ -0,0 +1,255 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import threading
import time

import torch
from langchain_core.prompts import PromptTemplate

from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from comps.cores.proto.api_protocol import ChatCompletionRequest

from .template import ChatTemplate
from .utils import initialize_model

logger = CustomLogger("opea_textgen_native")
logflag = os.getenv("LOGFLAG", False)

MODEL_NAME = os.getenv("LLM_MODEL_ID", "Qwen/Qwen2-7B-Instruct")

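# Warm-up prompts: initialize() runs generate() over these a few times to
# trigger HPU graph compilation before serving real requests.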
input_sentences = [
    "DeepSpeed is a machine learning framework",
    "He is working on",
    "He has a",
    "He got all",
    "Everyone is happy and I can",
    "The new movie that got Oscar this year",
    "In the far far distance from our galaxy,",
    "Peace is the only way",
]

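# Generation settings modeled on the CLI arguments of optimum-habana's
# text-generation example; initialize_model() consumes them via the Args
# wrapper below.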
args_dict = {
    "device": "hpu",
    "model_name_or_path": MODEL_NAME,
    "bf16": True,
    "max_new_tokens": 100,
    "max_input_tokens": 0,
    "batch_size": 1,
    "warmup": 3,
    "n_iterations": 5,
    "local_rank": 0,
    "use_kv_cache": True,
    "use_hpu_graphs": True,
    "dataset_name": None,
    "column_name": None,
    "do_sample": False,
    "num_beams": 1,
    "trim_logits": False,
    "seed": 27,
    "profiling_warmup_steps": 0,
    "profiling_steps": 0,
    "profiling_record_shapes": False,
    "prompt": None,
    "bad_words": None,
    "force_words": None,
    "assistant_model": None,
    "peft_model": None,
    "num_return_sequences": 1,
    "token": None,
    "model_revision": "main",
    "attn_softmax_bf16": False,
    "output_dir": None,
    "bucket_size": -1,
    "bucket_internal": False,
    "dataset_max_samples": -1,
    "limit_hpu_graphs": False,
    "reuse_cache": False,
    "verbose_workers": False,
    "simulate_dyn_prompt": None,
    "reduce_recompile": False,
    "use_flash_attention": False,
    "flash_attention_recompute": False,
    "flash_attention_causal_mask": False,
    "flash_attention_fast_softmax": False,
    "book_source": False,
    "torch_compile": False,
    "ignore_eos": True,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": None,
    "const_serialization_path": None,
    "disk_offload": False,
    "trust_remote_code": False,
    "quant_config": "",
    "world_size": 0,
    "show_graphs_count": False,
    "load_quantized_model_with_inc": False,
    "local_quantized_inc_model_path": None,
    "load_quantized_model_with_autogptq": False,
    "penalty_alpha": None,
}


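# Thin attribute wrapper so the dict above can be passed around like an
# argparse.Namespace.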
class Args:
    def __init__(self, **entries):
        self.__dict__.update(entries)


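# Module-level state populated lazily by initialize(); the lock guarantees the
# model is loaded and warmed up exactly once per process.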
model = None
assistant_model = None
tokenizer = None
generation_config = None
args = Args(**args_dict)
initialization_lock = threading.Lock()
initialized = False


def generate(
    input_query: list,
    device="hpu",
    use_lazy_mode=True,
    use_hpu_graphs=True,
    profiling_steps=0,
    profiling_warmup_steps=0,
    ignore_eos=True,
    profiling_record_shapes=False,
):
    """Generates sequences from the input sentences and returns them."""
    logger.info(f"[llm - generate] starting inference with prompt {input_query}")
    encode_t0 = time.perf_counter()

    # Tokenization
    input_tokens = tokenizer.batch_encode_plus(
        input_query,
        return_tensors="pt",
        padding=True,
        return_token_type_ids=False,  # token_type_ids are not needed for Falcon-3 models
    )
    encode_duration = time.perf_counter() - encode_t0
    logger.info(f"[llm - generate] input tokenized: {input_tokens}")

    # Move inputs to target device(s)
    for t in input_tokens:
        logger.info(f"[llm - generate] t: {t}")
        if torch.is_tensor(input_tokens[t]):
            logger.info("[llm - generate] input[t] is tensor")
            logger.info(f"[llm - generate] device: {model.device}")
            input_tokens[t] = input_tokens[t].to(model.device)

    logger.info("[llm - generate] inputs transferred.")

    # iteration_times is filled in place by the HPU-optimized generate() and is
    # used below to derive the time to first token.
    iteration_times = []
    outputs = model.generate(
        **input_tokens,
        generation_config=generation_config,
        assistant_model=assistant_model,
        lazy_mode=use_lazy_mode,
        hpu_graphs=use_hpu_graphs,
        profiling_steps=profiling_steps,
        profiling_warmup_steps=profiling_warmup_steps,
        ignore_eos=ignore_eos,
        iteration_times=iteration_times,
        profiling_record_shapes=profiling_record_shapes,
    ).cpu()
    logger.info("[llm - generate] result generated")
    first_token_time = iteration_times[0] + encode_duration
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    logger.info(f"[llm - generate] result: {result}")
    logger.info(f"[llm - generate] Time to first token = {first_token_time*1000}ms")
    return result


def initialize():
    global model, assistant_model, tokenizer, generation_config, initialized
    with initialization_lock:
        if not initialized:
            # initialize model and tokenizer
            import habana_frameworks.torch.hpu as torch_hpu
            from optimum.habana.utils import HabanaProfile

            model, assistant_model, tokenizer, generation_config = initialize_model(args, logger)
            logger.info("[llm] model and tokenizer initialized.")

            # compilation and model warm-up
            HabanaProfile.disable()
            logger.info("[llm - native] Graph compilation...")
            for _ in range(args.warmup):
                generate(input_sentences)
            logger.info("[llm - native] model warm-up finished.")
            torch_hpu.synchronize()
            HabanaProfile.enable()
            logger.info("[llm - native] Ready for inference")
            res = generate(["What is Deep Learning?"])
            logger.info(f"[llm - native] test result: {res}")
            initialized = True


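# Registering under this name makes the backend discoverable through the
# shared component registry.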
@OpeaComponentRegistry.register("OPEATextGen_Native")
class OPEATextGen_Native(OpeaComponent):
    """A specialized OPEA TextGen component derived from OpeaComponent, serving LLMs natively through Optimum Habana."""

    def __init__(self, name: str, description: str, config: dict = None):
        super().__init__(name, ServiceType.LLM.name.lower(), description, config)
        initialize()
        health_status = self.check_health()
        if not health_status:
            logger.error("OPEATextGen_Native health check failed.")
        else:
            logger.info("OPEATextGen_Native health check succeeded.")

    def check_health(self) -> bool:
        """Checks the health of the LLM service.

        Returns:
            bool: True if the service is reachable and healthy, False otherwise.
        """
        try:
            return initialized
        except Exception as e:
            logger.error(e)
            logger.error("Health check failed")
            return False

    async def invoke(self, input: LLMParamsDoc):
        """Invokes the LLM service to generate output for the provided input.

        Args:
            input (LLMParamsDoc): The input text(s).
        """
        prompt = input.query
        prompt_template = None
        if input.chat_template:
            prompt_template = PromptTemplate.from_template(input.chat_template)
            input_variables = prompt_template.input_variables
        if prompt_template:
            if sorted(input_variables) == ["context", "question"]:
                prompt = prompt_template.format(question=input.query, context="\n".join(input.documents))
            elif input_variables == ["question"]:
                prompt = prompt_template.format(question=input.query)
            else:
                logger.info(f"{prompt_template} not used; only the input variables ['question', 'context'] are supported")
        else:
            if input.documents:
                prompt = ChatTemplate.generate_rag_prompt(input.query, input.documents)
        res = generate([prompt])

        if logflag:
            logger.info(f"[llm - native] inference result: {res}")
        return GeneratedDoc(text=res[0], prompt=input.query)
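
As a rough usage sketch of the new component (hypothetical: it needs a Gaudi host with Optimum Habana installed, and the direct import is illustrative since the microservice normally instantiates components through the registry):

```python
import asyncio

from comps import LLMParamsDoc
from native import OPEATextGen_Native  # illustrative import of the module above

# Construction loads the model and runs the HPU graph warm-up via initialize().
comp = OPEATextGen_Native(
    name="OPEATextGen_Native",
    description="Text generation on Gaudi via Optimum Habana",
)

# invoke() is async; it optionally renders a chat template, then calls generate().
doc = asyncio.run(comp.invoke(LLMParamsDoc(query="What is Deep Learning?")))
print(doc.text)
```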
2 changes: 1 addition & 1 deletion comps/llms/src/text-generation/integrations/opea.py
@@ -21,7 +21,7 @@
 # Environment variables
 MODEL_NAME = os.getenv("LLM_MODEL_ID")
 MODEL_CONFIGS = os.getenv("MODEL_CONFIGS")
-DEFAULT_ENDPOINT = os.getenv("LLM_ENDPOINT")
+DEFAULT_ENDPOINT = os.getenv("LLM_ENDPOINT", "http://localhost:8080")
 TOKEN_URL = os.getenv("TOKEN_URL")
 CLIENTID = os.getenv("CLIENTID")
 CLIENT_SECRET = os.getenv("CLIENT_SECRET")
4 changes: 2 additions & 2 deletions comps/llms/src/text-generation/integrations/template.py
@@ -6,9 +6,9 @@
 
 class ChatTemplate:
     @staticmethod
-    def generate_rag_prompt(question, documents, model):
+    def generate_rag_prompt(question, documents, model=None):
         context_str = "\n".join(documents)
-        if model == "meta-llama/Meta-Llama-3.1-70B-Instruct" or model == "meta-llama/Meta-Llama-3.1-8B-Instruct":
+        if model in ["meta-llama/Meta-Llama-3.1-70B-Instruct", "meta-llama/Meta-Llama-3.1-8B-Instruct"]:
             template = """
 <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise <|eot_id|><|start_header_id|>user<|end_header_id|>
 Question: {question}
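
A quick sketch of the behavior this change enables (hypothetical values; the rest of the file is truncated above, but invoke() in native.py now calls generate_rag_prompt without a model argument):

```python
docs = ["Deep learning is a subset of machine learning based on neural networks."]

# The Llama-3.1 specific template is selected when the model id matches...
prompt = ChatTemplate.generate_rag_prompt(
    "What is deep learning?", docs, model="meta-llama/Meta-Llama-3.1-8B-Instruct"
)

# ...while omitting model (the new default) presumably falls back to the
# generic RAG template, which is how OPEATextGen_Native.invoke() uses it.
prompt = ChatTemplate.generate_rag_prompt("What is deep learning?", docs)
```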
