[LLM] Update code comments #2516

Merged 2 commits on Sep 3, 2024
94 changes: 45 additions & 49 deletions llm/client/fastdeploy_client/chatbot.py
@@ -60,28 +60,25 @@ def stream_generate(self,
        Streaming interface

        Args:
-            message (Union[str, List[str], ChatMessage]): 消息内容或ChatMessage对象
-            max_dec_len (int, optional): 最大解码长度. Defaults to 1024.
-            min_dec_len (int, optional): 最小解码长度. Defaults to 1.
-            topp (float, optional): 控制随机性参数,数值越大则随机性越大,范围是0~1. Defaults to 0.7.
-            temperature (float, optional): 温度值. Defaults to 0.95.
-            frequency_score (float, optional): 频率分数. Defaults to 0.0.
-            penalty_score (float, optional): 惩罚分数. Defaults to 1.0.
-            presence_score (float, optional): 存在分数. Defaults to 0.0.
-            system (str, optional): 系统设定. Defaults to None.
-            **kwargs: 其他参数
-                req_id (str, optional): 请求ID,用于区分不同的请求. Defaults to None.
-                eos_token_ids (List[int], optional): 指定结束的token id. Defaults to None.
-                benchmark (bool, optional): 设置benchmark模式,如果是则返回完整的response. Defaults to False.
-                timeout (int, optional): 请求超时时间,不设置则使用120s. Defaults to None.
+            message (Union[str, List[str], ChatMessage]): message or ChatMessage object
+            max_dec_len (int, optional): max decoding length. Defaults to 1024.
+            min_dec_len (int, optional): min decoding length. Defaults to 1.
+            topp (float, optional): randomness of the generated tokens. Defaults to 0.7.
+            temperature (float, optional): temperature. Defaults to 0.95.
+            frequency_score (float, optional): frequency score. Defaults to 0.0.
+            penalty_score (float, optional): penalty score. Defaults to 1.0.
+            presence_score (float, optional): presence score. Defaults to 0.0.
+            system (str, optional): system settings. Defaults to None.
+            **kwargs: others

        For more details, please refer to https://github.com/PaddlePaddle/FastDeploy/blob/develop/llm/docs/FastDeploy_usage_tutorial.md#%E8%AF%B7%E6%B1%82%E5%8F%82%E6%95%B0%E4%BB%8B%E7%BB%8D

        Returns:
-            返回一个生成器,每次yield返回一个字典。
-            正常情况下,生成器返回字典的示例{"req_id": "xxx", "token": "好的", "is_end": 0},其中token为生成的字符,is_end表明是否为最后一个字符(0表示否,1表示是)
-            错误情况下,生成器返回错误信息的字典,示例 {"req_id": "xxx", "error_msg": "error message"}
+            a generator that yields a dict on each step.
+            On success: {'token': xxx, 'is_end': xxx, 'send_idx': xxx, ..., 'error_msg': '', 'error_code': 0}
+            On error: {'error_msg': xxx, 'error_code': xxx}, where error_msg is not None and error_code != 0
        """
        try:
-            # 准备输入
            model_name = "model"
            inputs = [grpcclient.InferInput("IN", [1], triton_utils.np_to_triton_dtype(np.object_))]
            outputs = [grpcclient.InferRequestedOutput("OUT")]
@@ -96,14 +93,11 @@ def stream_generate(self,
            timeout = kwargs.get("timeout", self.timeout)

            with grpcclient.InferenceServerClient(url=self.url, verbose=False) as triton_client:
-                # 建立连接
                triton_client.start_stream(callback=partial(triton_callback, output_data))
-                # 发送请求
                triton_client.async_stream_infer(model_name=model_name,
                                                 inputs=inputs,
                                                 request_id=req_id,
                                                 outputs=outputs)
-                # 处理结果
                answer_str = ""
                enable_benchmark = is_enable_benchmark(**kwargs)
                while True:
@@ -129,7 +123,6 @@ def stream_generate(self,
                    yield response
                    if response.get("is_end") == 1 or response.get("error_msg") is not None:
                        break
-                # 手动关闭
                triton_client.stop_stream(cancel_requests=True)
                triton_client.close()
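
To make the streaming contract above concrete, here is a minimal usage sketch. It is not part of the diff; the endpoint address, port, and prompt are placeholders, and the response fields follow the Returns note in the docstring.

# Hedged usage sketch of stream_generate; host, port, and prompt are placeholders.
from fastdeploy_client.chatbot import ChatBot

chatbot = ChatBot(hostname="127.0.0.1", port=8812)   # assumed local serving endpoint
for chunk in chatbot.stream_generate("Hello!", max_dec_len=256):
    if chunk.get("error_msg"):                       # error dict: error_code != 0
        raise RuntimeError(chunk["error_msg"])
    print(chunk["token"], end="", flush=True)        # incremental token text
    if chunk.get("is_end") == 1:                     # 1 marks the final chunk
        break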

@@ -150,27 +143,26 @@ def generate(self,
                 system=None,
                 **kwargs):
        """
-        整句返回,直接使用流式返回的接口。
+        Return the entire sentence using the streaming interface.

        Args:
-            message (Union[str, List[str], ChatMessage]): 消息内容或ChatMessage对象
-            max_dec_len (int, optional): 最大解码长度. Defaults to 1024.
-            min_dec_len (int, optional): 最小解码长度. Defaults to 1.
-            topp (float, optional): 控制随机性参数,数值越大则随机性越大,范围是0~1. Defaults to 0.7.
-            temperature (float, optional): 温度值. Defaults to 0.95.
-            frequency_score (float, optional): 频率分数. Defaults to 0.0.
-            penalty_score (float, optional): 惩罚分数. Defaults to 1.0.
-            presence_score (float, optional): 存在分数. Defaults to 0.0.
-            system (str, optional): 系统设定. Defaults to None.
-            **kwargs: 其他参数
-                req_id (str, optional): 请求ID,用于区分不同的请求. Defaults to None.
-                eos_token_ids (List[int], optional): 指定结束的token id. Defaults to None.
-                timeout (int, optional): 请求超时时间,不设置则使用120s. Defaults to None.
+            message (Union[str, List[str], ChatMessage]): message or ChatMessage object
+            max_dec_len (int, optional): max decoding length. Defaults to 1024.
+            min_dec_len (int, optional): min decoding length. Defaults to 1.
+            topp (float, optional): randomness of the generated tokens. Defaults to 0.7.
+            temperature (float, optional): temperature. Defaults to 0.95.
+            frequency_score (float, optional): frequency score. Defaults to 0.0.
+            penalty_score (float, optional): penalty score. Defaults to 1.0.
+            presence_score (float, optional): presence score. Defaults to 0.0.
+            system (str, optional): system settings. Defaults to None.
+            **kwargs: others

        For more details, please refer to https://github.com/PaddlePaddle/FastDeploy/blob/develop/llm/docs/FastDeploy_usage_tutorial.md#%E8%AF%B7%E6%B1%82%E5%8F%82%E6%95%B0%E4%BB%8B%E7%BB%8D

        Returns:
-            返回一个字典。
-            正常情况下,返回字典的示例{"req_id": "xxx", "results": "好的,我知道了。"}
-            错误情况下,返回错误信息的字典,示例 {"req_id": "xxx", "error_msg": "error message"}
+            the entire sentence, or an error message.
+            On success: {'tokens_all': xxx, ..., 'error_msg': '', 'error_code': 0}
+            On error: {'error_msg': xxx, 'error_code': xxx}, where error_msg is not None and error_code != 0
        """
        stream_response = self.stream_generate(message, max_dec_len,
                                               min_dec_len, topp, temperature,
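
For the whole-sentence interface above, a correspondingly hedged sketch, reusing the chatbot object from the streaming example:

result = chatbot.generate("Hello!")          # wraps stream_generate internally
if result.get("error_code", 0) != 0:         # error dict per the Returns note
    raise RuntimeError(result["error_msg"])
print(result["tokens_all"])                  # full response text; field name from the docstring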
@@ -205,7 +197,7 @@ def _prepare_input_data(self,
                            system=None,
                            **kwargs):
        """
-        准备输入数据。
+        Prepare the input data
        """
        inputs = {
            "max_dec_len": max_dec_len,
Expand Down Expand Up @@ -248,7 +240,7 @@ def _prepare_input_data(self,

    def _format_response(self, response, req_id):
        """
-        对服务返回字段进行格式化
+        Format the fields returned by the service
        """
        response = json.loads(response.as_numpy("OUT")[0])
        if isinstance(response, (list, tuple)):
@@ -273,13 +265,17 @@ def _format_response(self, response, req_id):


class OutputData:
-    """接收Triton服务返回的数据"""
+    """
+    Receive data returned by the Triton service
+    """
    def __init__(self):
        self._completed_requests = queue.Queue()


def triton_callback(output_data, result, error):
-    """Triton客户端的回调函数"""
+    """
+    Callback function for the Triton client
+    """
    if error:
        output_data._completed_requests.put(error)
    else:
@@ -288,17 +284,17 @@ def triton_callback(output_data, result, error):

class ChatBot(object):
    """
-    对外的接口,用于创建ChatBotForPushMode的示例
+    External interface that creates a ChatBotForPushMode client object
    """
    def __new__(cls, hostname, port, timeout=120):
        """
-        初始化函数,用于创建一个GRPCInferenceService客户端对象
+        Initialize a GRPCInferenceService client
        Args:
-            hostname (str): 服务器的地址
-            port (int): 服务器的端口号
-            timeout (int): 请求超时时间,单位为秒,默认120秒
+            hostname (str): server hostname
+            port (int): GRPC port
+            timeout (int): request timeout in seconds. Defaults to 120.
        Returns:
-            ChatBotClass: 返回一个BaseChatBot对象
+            ChatBotClass: a BaseChatBot object
        """
        if not isinstance(hostname, str) or not hostname:
            raise ValueError("Invalid hostname")
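
A short sketch of this factory's contract, inferred only from the validation shown above (address values are placeholders):

bot = ChatBot(hostname="127.0.0.1", port=8812, timeout=60)  # yields a BaseChatBot-style object
try:
    ChatBot(hostname="", port=8812)     # an empty hostname fails the check above
except ValueError as err:
    print("rejected:", err)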
12 changes: 6 additions & 6 deletions llm/client/fastdeploy_client/command.py
@@ -21,7 +21,10 @@

def _get_service_configuration():
    """
-    从环境变量获取服务配置信息
+    Get the service URL from the environment

+    Returns:
+        tuple: (hostname, port)
    """
    url = os.getenv("FASTDEPLOY_MODEL_URL")

@@ -38,7 +41,7 @@ def _get_service_configuration():

def stream_generate(prompt):
    """
-    命令工具:流式返回
+    Streaming interface
    """
    hostname, port = _get_service_configuration()
    chatbot = ChatBot(hostname=hostname, port=port)
@@ -49,7 +52,7 @@ def stream_generate(prompt):

def generate(prompt):
    """
-    命令工具:整句返回
+    entire sentence interface
    """
    hostname, port = _get_service_configuration()
    chatbot = ChatBot(hostname=hostname, port=port)
@@ -58,9 +61,6 @@ def generate(prompt):


def main():
-    """
-    命令工具主入口
-    """
    if len(sys.argv) < 2 or sys.argv[1] not in ["generate", "stream_generate"]:
        logging.error("Usage 1: fdclient generate \"Hello, How are you?\"")
        return
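
The command-line helpers resolve the service address from the FASTDEPLOY_MODEL_URL environment variable; a minimal scripted equivalent might look like this (the address is a placeholder, and the "hostname:port" format is an assumption based on the (hostname, port) return value):

import os
os.environ["FASTDEPLOY_MODEL_URL"] = "127.0.0.1:8812"   # assumed "hostname:port" format

from fastdeploy_client.command import generate
generate("Hello, How are you?")   # scripted equivalent of: fdclient generate "Hello, How are you?"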
9 changes: 4 additions & 5 deletions llm/client/fastdeploy_client/message.py
@@ -14,8 +14,7 @@

class ChatMessage(object):
    """
-    多轮对话数据结构,当使用这个与ChatBot对话时
-    会将对话记录存储在此结构体内,支持多轮
+    Multi-turn chat message used with ChatBot
    """
    def __init__(self, prompt=None):
        if prompt is not None:
@@ -25,7 +24,7 @@ def __init__(self, prompt=None):

    def add_user_message(self, content):
        """
-        添加一个用户消息
+        add user message
        """
        if len(self.message) > 0 and self.message[-1]["role"] != "assistant":
            raise Exception("Cannot add user message, because the role of the
@@ -34,7 +33,7 @@ def add_user_message(self, content):

    def add_assistant_message(self, content):
        """
-        添加一个assistant消息
+        add assistant message
        """
        if len(self.message) > 0 and self.message[-1]["role"] != "user":
            raise Exception("Cannot add user message, because the role of the
@@ -43,7 +42,7 @@ def add_assistant_message(self, content):

    def next_prompt(self, content):
        """
-        添加一个新的对话,保留用于兼容。
+        add user message and return a new prompt
        """
        self.add_user_message(content)

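
A hedged sketch of the multi-turn structure documented above; the constructor is assumed to seed the first user turn, and the add_* methods enforce user/assistant alternation:

from fastdeploy_client.message import ChatMessage

msg = ChatMessage("What is FastDeploy?")                         # first user turn
msg.add_assistant_message("A PaddlePaddle deployment toolkit.")  # reply recorded in the structure
msg.add_user_message("Does it serve LLMs?")                      # roles must alternate, else Exception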
4 changes: 3 additions & 1 deletion llm/client/fastdeploy_client/utils.py
@@ -13,5 +13,7 @@
# limitations under the License.

def is_enable_benchmark(**kwargs):
-    """是否是benchmark模式"""
+    """
+    Check whether benchmark mode is enabled
+    """
    return "benchmark" in kwargs and kwargs["benchmark"] == 1
9 changes: 5 additions & 4 deletions llm/dockerfiles/Dockerfile_serving_cuda118_cudnn8
@@ -1,15 +1,16 @@
FROM registry.baidubce.com/paddlepaddle/fastdeploy:llm-base-gcc12.3-cuda11.8-cudnn8-nccl2.15.5

WORKDIR /opt/output/
-COPY ./server/ /opt/output/Serving
+COPY ./server/ /opt/output/Serving/
COPY ./client/ /opt/output/client/

-ENV LD_LIBRARY_PATH "/usr/local/cuda-11.8/compat/:$LD_LIBRARY_PATH"
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
RUN python3 -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu118/ \
    && python3 -m pip install paddlenlp==3.0.0b0 \
-    && python3 -m pip install --no-cache-dir sentencepiece pycryptodome tritonclient[all]==2.41.1 \
-    && apt-get clean && rm -rf /var/lib/apt/lists/*
+    && python3 -m pip install --no-cache-dir sentencepiece pycryptodome tritonclient[all]==2.41.1

+ENV LD_LIBRARY_PATH "/usr/local/cuda-11.8/compat/:$LD_LIBRARY_PATH"
RUN git clone https://gitee.com/paddlepaddle/PaddleNLP.git && cd PaddleNLP/csrc \
    && python3 setup_cuda.py build && python3 setup_cuda.py install --user \
    && cp -r /opt/output/PaddleNLP/paddlenlp /usr/local/lib/python3.10/dist-packages/ \
9 changes: 5 additions & 4 deletions llm/dockerfiles/Dockerfile_serving_cuda123_cudnn9
@@ -1,15 +1,16 @@
FROM registry.baidubce.com/paddlepaddle/fastdeploy:llm-base-gcc12.3-cuda12.3-cudnn9-nccl2.15.5

WORKDIR /opt/output/
-COPY ./server/ /opt/output/Serving
+COPY ./server/ /opt/output/Serving/
COPY ./client/ /opt/output/client/

-ENV LD_LIBRARY_PATH "/usr/local/cuda-12.3/compat/:$LD_LIBRARY_PATH"
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
RUN python3 -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu123/ \
    && python3 -m pip install paddlenlp==3.0.0b0 \
-    && python3 -m pip install --no-cache-dir sentencepiece pycryptodome tritonclient[all]==2.41.1 \
-    && apt-get clean && rm -rf /var/lib/apt/lists/*
+    && python3 -m pip install --no-cache-dir sentencepiece pycryptodome tritonclient[all]==2.41.1

+ENV LD_LIBRARY_PATH "/usr/local/cuda-12.3/compat/:$LD_LIBRARY_PATH"
RUN git clone https://gitee.com/paddlepaddle/PaddleNLP.git && cd PaddleNLP/csrc \
    && python3 setup_cuda.py build && python3 setup_cuda.py install --user \
    && cp -r /opt/output/PaddleNLP/paddlenlp /usr/local/lib/python3.10/dist-packages/ \