Refactor LLM text generation native comps (opea-project#1151)
Part of the code refactor to combine the different text generation backends: remove the duplicated native langchain and llama_index folders, and consolidate the optimum habana implementation as the native integration OPEATextGen_Native.

Add feature for issue opea-project#998

Signed-off-by: Xinyao Wang <[email protected]>
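
For context, the refactor hinges on a component-registry pattern: each backend registers under a name, and the text-generation service selects one at runtime instead of importing a specific implementation. Below is a minimal, generic sketch of that pattern (illustration only; the project's real classes, OpeaComponent and OpeaComponentRegistry, appear in the diff further down):

```python
# Generic component-registry pattern (illustration, not the project's code).
_REGISTRY: dict[str, type] = {}


def register(name: str):
    def decorator(cls: type) -> type:
        _REGISTRY[name] = cls  # backends self-register under a stable name
        return cls

    return decorator


@register("OPEATextGen_Native")
class NativeTextGen:
    def invoke(self, query: str) -> str:
        return f"generated text for: {query}"


# The serving layer picks a backend by name, e.g. from an environment variable.
backend = _REGISTRY["OPEATextGen_Native"]()
print(backend.invoke("What is Deep Learning?"))
```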
XinyaoWa authored and smguggen committed Jan 23, 2025
1 parent 8232554 commit 4e00bf4
Showing 23 changed files with 533 additions and 1,436 deletions.
12 changes: 4 additions & 8 deletions .github/workflows/docker/compose/llms-compose.yaml
@@ -7,6 +7,10 @@ services:
     build:
       dockerfile: comps/llms/src/text-generation/Dockerfile
     image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
+  llm-textgen-gaudi:
+    build:
+      dockerfile: comps/llms/src/text-generation/Dockerfile.intel_hpu
+    image: ${REGISTRY:-opea}/llm-textgen-gaudi:${TAG:-latest}
   llm-ollama:
     build:
       dockerfile: comps/llms/text-generation/ollama/langchain/Dockerfile
@@ -19,14 +23,6 @@ services:
     build:
       dockerfile: comps/llms/src/faq-generation/Dockerfile
     image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
-  llm-native:
-    build:
-      dockerfile: comps/llms/text-generation/native/langchain/Dockerfile
-    image: ${REGISTRY:-opea}/llm-native:${TAG:-latest}
-  llm-native-llamaindex:
-    build:
-      dockerfile: comps/llms/text-generation/native/llama_index/Dockerfile
-    image: ${REGISTRY:-opea}/llm-native-llamaindex:${TAG:-latest}
   llm-eval:
     build:
       dockerfile: comps/llms/utils/lm-eval/Dockerfile

This file was deleted.

1 change: 0 additions & 1 deletion comps/llms/src/text-generation/Dockerfile
@@ -23,4 +23,3 @@ ENV PYTHONPATH=$PYTHONPATH:/home/user
 WORKDIR /home/user/comps/llms/src/text-generation
 
 ENTRYPOINT ["bash", "entrypoint.sh"]
-

@@ -2,11 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # HABANA environment
-FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 AS hpu
+FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1 AS hpu
 
 ENV LANG=en_US.UTF-8
 ARG REPO=https://github.com/huggingface/optimum-habana.git
-ARG REPO_VER=v1.12.1
+ARG REPO_VER=v1.15.0
 
 RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     git-lfs \
@@ -23,17 +23,18 @@ RUN git lfs install
 
 COPY comps /home/user/comps
 
-RUN pip install --no-cache-dir --upgrade-strategy eager optimum[habana] && \
-    pip install --no-cache-dir git+https://github.com/HabanaAI/[email protected]
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir --upgrade-strategy eager optimum[habana] && \
+    pip install --no-cache-dir git+https://github.com/HabanaAI/[email protected]
 
 RUN git clone ${REPO} /home/user/optimum-habana && \
     cd /home/user/optimum-habana && git checkout ${REPO_VER} && \
     cd examples/text-generation && pip install --no-cache-dir -r requirements.txt && \
-    cd /home/user/comps/llms/text-generation/native/llama_index && pip install --no-cache-dir -r requirements.txt && \
-    pip install --no-cache-dir --upgrade --force-reinstall pydantic
+    cd /home/user/comps/llms/src/text-generation/ && pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir --upgrade --force-reinstall pydantic numpy==1.23.5
 
 ENV PYTHONPATH=/root:/home/user
 
-WORKDIR /home/user/comps/llms/text-generation/native/llama_index
+WORKDIR /home/user/comps/llms/src/text-generation/
 
-ENTRYPOINT ["python", "llm.py"]
+ENTRYPOINT ["bash", "entrypoint.sh"]
@@ -21,7 +21,7 @@ export HUGGINGFACEHUB_API_TOKEN="your_huggingface_token"
 
 ```bash
 cd ../../../../../
-docker build -t opea/llm-native:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/native/langchain/Dockerfile .
+docker build -t opea/llm-native:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
 ```
 
 To start a docker container, you have two options:
255 changes: 255 additions & 0 deletions comps/llms/src/text-generation/integrations/native.py
@@ -0,0 +1,255 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import threading
import time

import torch
from langchain_core.prompts import PromptTemplate

from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from comps.cores.proto.api_protocol import ChatCompletionRequest

from .template import ChatTemplate
from .utils import initialize_model

logger = CustomLogger("opea_textgen_native")
logflag = os.getenv("LOGFLAG", False)

MODEL_NAME = os.getenv("LLM_MODEL_ID", "Qwen/Qwen2-7B-Instruct")

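# Warm-up prompts: initialize() runs generate() over these a few times to
# trigger HPU graph compilation before serving real requests.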
input_sentences = [
    "DeepSpeed is a machine learning framework",
    "He is working on",
    "He has a",
    "He got all",
    "Everyone is happy and I can",
    "The new movie that got Oscar this year",
    "In the far far distance from our galaxy,",
    "Peace is the only way",
]

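# Generation settings modeled on the CLI arguments of optimum-habana's
# text-generation example; initialize_model() consumes them via the Args
# wrapper below.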
args_dict = {
    "device": "hpu",
    "model_name_or_path": MODEL_NAME,
    "bf16": True,
    "max_new_tokens": 100,
    "max_input_tokens": 0,
    "batch_size": 1,
    "warmup": 3,
    "n_iterations": 5,
    "local_rank": 0,
    "use_kv_cache": True,
    "use_hpu_graphs": True,
    "dataset_name": None,
    "column_name": None,
    "do_sample": False,
    "num_beams": 1,
    "trim_logits": False,
    "seed": 27,
    "profiling_warmup_steps": 0,
    "profiling_steps": 0,
    "profiling_record_shapes": False,
    "prompt": None,
    "bad_words": None,
    "force_words": None,
    "assistant_model": None,
    "peft_model": None,
    "num_return_sequences": 1,
    "token": None,
    "model_revision": "main",
    "attn_softmax_bf16": False,
    "output_dir": None,
    "bucket_size": -1,
    "bucket_internal": False,
    "dataset_max_samples": -1,
    "limit_hpu_graphs": False,
    "reuse_cache": False,
    "verbose_workers": False,
    "simulate_dyn_prompt": None,
    "reduce_recompile": False,
    "use_flash_attention": False,
    "flash_attention_recompute": False,
    "flash_attention_causal_mask": False,
    "flash_attention_fast_softmax": False,
    "book_source": False,
    "torch_compile": False,
    "ignore_eos": True,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": None,
    "const_serialization_path": None,
    "disk_offload": False,
    "trust_remote_code": False,
    "quant_config": "",
    "world_size": 0,
    "show_graphs_count": False,
    "load_quantized_model_with_inc": False,
    "local_quantized_inc_model_path": None,
    "load_quantized_model_with_autogptq": False,
    "penalty_alpha": None,
}


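# Thin attribute wrapper so the dict above can be passed around like an
# argparse.Namespace.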
class Args:
    def __init__(self, **entries):
        self.__dict__.update(entries)


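# Module-level state populated lazily by initialize(); the lock guarantees the
# model is loaded and warmed up exactly once per process.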
model = None
assistant_model = None
tokenizer = None
generation_config = None
args = Args(**args_dict)
initialization_lock = threading.Lock()
initialized = False


def generate(
    input_query: list,
    device="hpu",
    use_lazy_mode=True,
    use_hpu_graphs=True,
    profiling_steps=0,
    profiling_warmup_steps=0,
    ignore_eos=True,
    profiling_record_shapes=False,
):
    """Generates sequences from the input sentences and returns them."""
    logger.info(f"[llm - generate] starting inference with prompt {input_query}")
    encode_t0 = time.perf_counter()

    # Tokenization
    input_tokens = tokenizer.batch_encode_plus(
        input_query,
        return_tensors="pt",
        padding=True,
        return_token_type_ids=False,  # token_type_ids are not needed for Falcon-3 models
    )
    encode_duration = time.perf_counter() - encode_t0
    logger.info(f"[llm - generate] input tokenized: {input_tokens}")

    # Move inputs to target device(s)
    for t in input_tokens:
        logger.info(f"[llm - generate] t: {t}")
        if torch.is_tensor(input_tokens[t]):
            logger.info("[llm - generate] input[t] is tensor")
            logger.info(f"[llm - generate] device: {model.device}")
            input_tokens[t] = input_tokens[t].to(model.device)

    logger.info("[llm - generate] inputs transferred.")

    # iteration_times is filled in place by the HPU-optimized generate() and is
    # used below to derive the time to first token.
    iteration_times = []
    outputs = model.generate(
        **input_tokens,
        generation_config=generation_config,
        assistant_model=assistant_model,
        lazy_mode=use_lazy_mode,
        hpu_graphs=use_hpu_graphs,
        profiling_steps=profiling_steps,
        profiling_warmup_steps=profiling_warmup_steps,
        ignore_eos=ignore_eos,
        iteration_times=iteration_times,
        profiling_record_shapes=profiling_record_shapes,
    ).cpu()
    logger.info("[llm - generate] result generated")
    first_token_time = iteration_times[0] + encode_duration
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    logger.info(f"[llm - generate] result: {result}")
    logger.info(f"[llm - generate] Time to first token = {first_token_time*1000}ms")
    return result


def initialize():
    global model, assistant_model, tokenizer, generation_config, initialized
    with initialization_lock:
        if not initialized:
            # initialize model and tokenizer
            import habana_frameworks.torch.hpu as torch_hpu
            from optimum.habana.utils import HabanaProfile

            model, assistant_model, tokenizer, generation_config = initialize_model(args, logger)
            logger.info("[llm] model and tokenizer initialized.")

            # compilation and model warm-up
            HabanaProfile.disable()
            logger.info("[llm - native] Graph compilation...")
            for _ in range(args.warmup):
                generate(input_sentences)
            logger.info("[llm - native] model warm-up finished.")
            torch_hpu.synchronize()
            HabanaProfile.enable()
            logger.info("[llm - native] Ready for inference")
            res = generate(["What is Deep Learning?"])
            logger.info(f"[llm - native] test result: {res}")
            initialized = True


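# Registering under this name makes the backend discoverable through the
# shared component registry.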
@OpeaComponentRegistry.register("OPEATextGen_Native")
class OPEATextGen_Native(OpeaComponent):
    """A specialized OPEA TextGen component derived from OpeaComponent, serving LLMs natively through Optimum Habana."""

    def __init__(self, name: str, description: str, config: dict = None):
        super().__init__(name, ServiceType.LLM.name.lower(), description, config)
        initialize()
        health_status = self.check_health()
        if not health_status:
            logger.error("OPEATextGen_Native health check failed.")
        else:
            logger.info("OPEATextGen_Native health check succeeded.")

    def check_health(self) -> bool:
        """Checks the health of the LLM service.

        Returns:
            bool: True if the service is reachable and healthy, False otherwise.
        """
        try:
            return initialized
        except Exception as e:
            logger.error(e)
            logger.error("Health check failed")
            return False

    async def invoke(self, input: LLMParamsDoc):
        """Invokes the LLM service to generate output for the provided input.

        Args:
            input (LLMParamsDoc): The input text(s).
        """
        prompt = input.query
        prompt_template = None
        if input.chat_template:
            prompt_template = PromptTemplate.from_template(input.chat_template)
            input_variables = prompt_template.input_variables
        if prompt_template:
            if sorted(input_variables) == ["context", "question"]:
                prompt = prompt_template.format(question=input.query, context="\n".join(input.documents))
            elif input_variables == ["question"]:
                prompt = prompt_template.format(question=input.query)
            else:
                logger.info(f"{prompt_template} not used; only the input variables ['question', 'context'] are supported")
        else:
            if input.documents:
                prompt = ChatTemplate.generate_rag_prompt(input.query, input.documents)
        res = generate([prompt])

        if logflag:
            logger.info(f"[llm - native] inference result: {res}")
        return GeneratedDoc(text=res[0], prompt=input.query)
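
As a rough usage sketch of the new component (hypothetical: it needs a Gaudi host with Optimum Habana installed, and the direct import is illustrative since the microservice normally instantiates components through the registry):

```python
import asyncio

from comps import LLMParamsDoc
from native import OPEATextGen_Native  # illustrative import of the module above

# Construction loads the model and runs the HPU graph warm-up via initialize().
comp = OPEATextGen_Native(
    name="OPEATextGen_Native",
    description="Text generation on Gaudi via Optimum Habana",
)

# invoke() is async; it optionally renders a chat template, then calls generate().
doc = asyncio.run(comp.invoke(LLMParamsDoc(query="What is Deep Learning?")))
print(doc.text)
```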
2 changes: 1 addition & 1 deletion comps/llms/src/text-generation/integrations/opea.py
@@ -21,7 +21,7 @@
 # Environment variables
 MODEL_NAME = os.getenv("LLM_MODEL_ID")
 MODEL_CONFIGS = os.getenv("MODEL_CONFIGS")
-DEFAULT_ENDPOINT = os.getenv("LLM_ENDPOINT")
+DEFAULT_ENDPOINT = os.getenv("LLM_ENDPOINT", "http://localhost:8080")
 TOKEN_URL = os.getenv("TOKEN_URL")
 CLIENTID = os.getenv("CLIENTID")
 CLIENT_SECRET = os.getenv("CLIENT_SECRET")
4 changes: 2 additions & 2 deletions comps/llms/src/text-generation/integrations/template.py
@@ -6,9 +6,9 @@
 
 class ChatTemplate:
     @staticmethod
-    def generate_rag_prompt(question, documents, model):
+    def generate_rag_prompt(question, documents, model=None):
         context_str = "\n".join(documents)
-        if model == "meta-llama/Meta-Llama-3.1-70B-Instruct" or model == "meta-llama/Meta-Llama-3.1-8B-Instruct":
+        if model in ["meta-llama/Meta-Llama-3.1-70B-Instruct", "meta-llama/Meta-Llama-3.1-8B-Instruct"]:
             template = """
 <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise <|eot_id|><|start_header_id|>user<|end_header_id|>
 Question: {question}
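
A quick sketch of the behavior this change enables (hypothetical values; the rest of the file is truncated above, but invoke() in native.py now calls generate_rag_prompt without a model argument):

```python
docs = ["Deep learning is a subset of machine learning based on neural networks."]

# The Llama-3.1 specific template is selected when the model id matches...
prompt = ChatTemplate.generate_rag_prompt(
    "What is deep learning?", docs, model="meta-llama/Meta-Llama-3.1-8B-Instruct"
)

# ...while omitting model (the new default) presumably falls back to the
# generic RAG template, which is how OPEATextGen_Native.invoke() uses it.
prompt = ChatTemplate.generate_rag_prompt("What is deep learning?", docs)
```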
