Commit

update doc

qinxuye committed Nov 29, 2024
1 parent 70a07ec commit 270f49e
Showing 5 changed files with 53 additions and 11 deletions.
30 changes: 23 additions & 7 deletions doc/source/models/builtin/llm/qwq-32b-preview.rst
@@ -30,7 +30,23 @@ chosen quantization method from the options listed above::
xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format pytorch --quantization ${quantization}


-Model Spec 2 (ggufv2, 32 Billion)
+Model Spec 2 (awq, 32 Billion)
++++++++++++++++++++++++++++++++++++++++

+- **Model Format:** awq
+- **Model Size (in billions):** 32
+- **Quantizations:** Int4
+- **Engines**: Transformers
+- **Model ID:** KirillR/QwQ-32B-Preview-AWQ
+- **Model Hubs**: `Hugging Face <https://huggingface.co/KirillR/QwQ-32B-Preview-AWQ>`__

+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::

+   xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format awq --quantization ${quantization}
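For example, with the Transformers engine listed in this spec (assuming the lowercase engine name the CLI expects), the placeholders resolve to::

   xinference launch --model-engine transformers --model-name QwQ-32B-Preview --size-in-billions 32 --model-format awq --quantization Int4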


+Model Spec 3 (ggufv2, 32 Billion)
+++++++++++++++++++++++++++++++++++++++++

- **Model Format:** ggufv2
@@ -46,47 +62,47 @@ chosen quantization method from the options listed above::
xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format ggufv2 --quantization ${quantization}


-Model Spec 3 (mlx, 32 Billion)
+Model Spec 4 (mlx, 32 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** mlx
- **Model Size (in billions):** 32
- **Quantizations:** 4-bit
- **Engines**: MLX
- **Model ID:** mlx-community/Qwen_QwQ-32B-Preview_MLX-4bit
-- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Qwen_QwQ-32B-Preview_MLX-4bit>`__
+- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Qwen_QwQ-32B-Preview_MLX-4bit>`__, `ModelScope <https://modelscope.cn/models/okwinds/QwQ-32B-Preview-MLX-4bit>`__

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format mlx --quantization ${quantization}
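For instance, launching the 4-bit build with the MLX engine (again assuming the lowercase engine name the CLI expects) looks like::

   xinference launch --model-engine mlx --model-name QwQ-32B-Preview --size-in-billions 32 --model-format mlx --quantization 4-bit

The 8-bit and unquantized bf16 specs below follow the same pattern, with ``--quantization 8-bit`` or ``--quantization none``.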


-Model Spec 4 (mlx, 32 Billion)
+Model Spec 5 (mlx, 32 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** mlx
- **Model Size (in billions):** 32
- **Quantizations:** 8-bit
- **Engines**: MLX
- **Model ID:** mlx-community/Qwen_QwQ-32B-Preview_MLX-8bit
-- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Qwen_QwQ-32B-Preview_MLX-8bit>`__
+- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Qwen_QwQ-32B-Preview_MLX-8bit>`__, `ModelScope <https://modelscope.cn/models/okwinds/QwQ-32B-Preview-MLX-8bit>`__

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-engine ${engine} --model-name QwQ-32B-Preview --size-in-billions 32 --model-format mlx --quantization ${quantization}


-Model Spec 5 (mlx, 32 Billion)
+Model Spec 6 (mlx, 32 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** mlx
- **Model Size (in billions):** 32
- **Quantizations:** none
- **Engines**: MLX
- **Model ID:** mlx-community/QwQ-32B-Preview-bf16
- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/QwQ-32B-Preview-bf16>`__

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
4 changes: 2 additions & 2 deletions setup.cfg
@@ -39,7 +39,7 @@ install_requires =
typing_extensions
modelscope>=1.10.0
sse_starlette>=1.6.5 # ensure_bytes API break change: https://github.com/sysid/sse-starlette/issues/65
-openai>1 # For typing
+openai>=1.40.0 # For typing
python-jose[cryptography]
passlib[bcrypt]
aioprometheus[starlette]>=23.12.0
@@ -71,7 +71,7 @@ dev =
jieba>=0.42.0
flake8>=3.8.0
black
-openai>1
+openai>=1.40.0
langchain
langchain-community
orjson
4 changes: 2 additions & 2 deletions xinference/deploy/docker/requirements.txt
@@ -14,7 +14,7 @@ huggingface-hub>=0.19.4
typing_extensions
modelscope>=1.10.0
sse_starlette>=1.6.5 # ensure_bytes API break change: https://github.com/sysid/sse-starlette/issues/65
-openai>1 # For typing
+openai>=1.40.0 # For typing
python-jose[cryptography]
passlib[bcrypt]
aioprometheus[starlette]>=23.12.0
@@ -25,7 +25,7 @@ opencv-contrib-python-headless
setproctitle

# all
-transformers>=4.43.2
+transformers>=4.45.0
accelerate>=0.28.0
sentencepiece
transformers_stream_generator
8 changes: 8 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -8540,6 +8540,14 @@
        ],
        "model_id": "Qwen/QwQ-32B-Preview"
      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "KirillR/QwQ-32B-Preview-AWQ"
+      },
      {
        "model_format": "ggufv2",
        "model_size_in_billions": 32,
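These built-in entries extend the ``model_specs`` list in ``llm_family.json``. A user-defined model follows the same spec shape inside a full family definition; assuming the standard custom-model workflow, a hypothetical ``my-llm.json`` could be registered with::

   xinference register --model-type LLM --file my-llm.json --persist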
18 changes: 18 additions & 0 deletions xinference/model/llm/llm_family_modelscope.json
@@ -6292,6 +6292,24 @@
"model_id": "Qwen/QwQ-32B-Preview",
"model_hub": "modelscope"
},
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "okwinds/QwQ-32B-Preview-MLX-4bit",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "okwinds/QwQ-32B-Preview-MLX-8bit",
+        "model_hub": "modelscope"
+      },
      {
        "model_format": "ggufv2",
        "model_size_in_billions": 32,
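With these ModelScope entries in place, the MLX builds can be pulled from ModelScope rather than Hugging Face; assuming the standard ``XINFERENCE_MODEL_SRC`` environment switch, a launch might look like::

   XINFERENCE_MODEL_SRC=modelscope xinference launch --model-engine mlx --model-name QwQ-32B-Preview --size-in-billions 32 --model-format mlx --quantization 8-bit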
