diff --git a/server_vllm.py b/server_vllm.py
index d97d547..e0cdd41 100644
--- a/server_vllm.py
+++ b/server_vllm.py
@@ -138,13 +138,11 @@ async def create_chat_completion(raw_request: Request):
 
     logger.info(f"args: {args}")
 
-    if args.served_model_name is not None:
-        logger.info(
-            "args.served_model_name is not used in this service and will be ignored. Served model will consist of args.model only."
-        )
-
     served_model = [args.model]
 
+    if args.served_model_name is not None:
+        served_model += args.served_model_name
+
     engine_args = AsyncEngineArgs.from_cli_args(args)
     # A separate tokenizer to map token IDs to strings.
     tokenizer = get_tokenizer(