diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 69df4b8f1..07acd9b82 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -480,6 +480,8 @@ |[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation)|polylm|default|-|-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)| |[AI-ModelScope/aya-expanse-8b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-8b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-8b](https://huggingface.co/CohereForAI/aya-expanse-8b)| |[AI-ModelScope/aya-expanse-32b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-32b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-32b](https://huggingface.co/CohereForAI/aya-expanse-32b)| +|[moonshotai/Moonlight-16B-A3B](https://modelscope.cn/models/moonshotai/Moonlight-16B-A3B)|moonlight|moonlight|transformers<4.49|-|[moonshotai/Moonlight-16B-A3B](https://huggingface.co/moonshotai/Moonlight-16B-A3B)| +|[moonshotai/Moonlight-16B-A3B-Instruct](https://modelscope.cn/models/moonshotai/Moonlight-16B-A3B-Instruct)|moonlight|moonlight|transformers<4.49|-|[moonshotai/Moonlight-16B-A3B-Instruct](https://huggingface.co/moonshotai/Moonlight-16B-A3B-Instruct)| |[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|transformers>=4.48|bert|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)| |[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|transformers>=4.48|bert|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)| |[iic/gte-modernbert-base](https://modelscope.cn/models/iic/gte-modernbert-base)|modern_bert_gte|dummy|transformers>=4.48|bert, embedding|[Alibaba-NLP/gte-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-modernbert-base)| diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index b86d9eea0..a7ea4828c 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -480,6 +480,8 @@ The table below introduces the models integrated with ms-swift: |[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation)|polylm|default|-|-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)| |[AI-ModelScope/aya-expanse-8b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-8b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-8b](https://huggingface.co/CohereForAI/aya-expanse-8b)| |[AI-ModelScope/aya-expanse-32b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-32b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-32b](https://huggingface.co/CohereForAI/aya-expanse-32b)| +|[moonshotai/Moonlight-16B-A3B](https://modelscope.cn/models/moonshotai/Moonlight-16B-A3B)|moonlight|moonlight|transformers<4.49|-|[moonshotai/Moonlight-16B-A3B](https://huggingface.co/moonshotai/Moonlight-16B-A3B)| +|[moonshotai/Moonlight-16B-A3B-Instruct](https://modelscope.cn/models/moonshotai/Moonlight-16B-A3B-Instruct)|moonlight|moonlight|transformers<4.49|-|[moonshotai/Moonlight-16B-A3B-Instruct](https://huggingface.co/moonshotai/Moonlight-16B-A3B-Instruct)| |[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|transformers>=4.48|bert|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)| |[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|transformers>=4.48|bert|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)| |[iic/gte-modernbert-base](https://modelscope.cn/models/iic/gte-modernbert-base)|modern_bert_gte|dummy|transformers>=4.48|bert, embedding|[Alibaba-NLP/gte-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-modernbert-base)| diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py index c2d8d80b4..db8be8390 100644 --- a/swift/llm/model/constant.py +++ b/swift/llm/model/constant.py @@ -103,6 +103,7 @@ class LLMModelType: mamba = 'mamba' polylm = 'polylm' aya = 'aya' + moonlight = 'moonlight' class BertModelType: diff --git a/swift/llm/model/model/deepseek.py b/swift/llm/model/model/deepseek.py index d6e142347..024f1f4bb 100644 --- a/swift/llm/model/model/deepseek.py +++ b/swift/llm/model/model/deepseek.py @@ -280,3 +280,19 @@ def get_model_tokenizer_deepseek_vl2(model_dir: str, *args, **kwargs): architectures=['Qwen2ForCausalLM', 'LlamaForCausalLM'], model_arch=ModelArch.llama, )) + +register_model( + ModelMeta( + LLMModelType.moonlight, + [ + ModelGroup([ + Model('moonshotai/Moonlight-16B-A3B', 'moonshotai/Moonlight-16B-A3B'), + Model('moonshotai/Moonlight-16B-A3B-Instruct', 'moonshotai/Moonlight-16B-A3B-Instruct'), + ]), + ], + TemplateType.moonlight, + get_model_tokenizer_with_flash_attn, + architectures=['DeepseekV3ForCausalLM'], + model_arch=ModelArch.deepseek_v2, + requires=['transformers<4.49'], + )) diff --git a/swift/llm/template/constant.py b/swift/llm/template/constant.py index 07186c180..d0b1b316e 100644 --- a/swift/llm/template/constant.py +++ b/swift/llm/template/constant.py @@ -72,6 +72,7 @@ class LLMTemplateType: xverse = 'xverse' bluelm = 'bluelm' orion = 'orion' + moonlight = 'moonlight' aya = 'aya' c4ai = 'c4ai' diff --git a/swift/llm/template/template/llm.py b/swift/llm/template/template/llm.py index 42c97f486..db6f5b877 100644 --- a/swift/llm/template/template/llm.py +++ b/swift/llm/template/template/llm.py @@ -245,3 +245,14 @@ class TeleChatTemplateMeta(TemplateMeta): suffix=['<|END_OF_TURN_TOKEN|>'], default_system=AYA_SYSTEM, system_prefix=['<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{SYSTEM}}<|END_OF_TURN_TOKEN|'])) + +register_template( + TemplateMeta( + LLMTemplateType.moonlight, + prefix=[], + system_prefix=['<|im_system|>system<|im_middle|>{{SYSTEM}}<|im_end|>'], + prompt=['<|im_user|>user<|im_middle|>{{QUERY}}<|im_end|><|im_assistant|>assistant<|im_middle|>'], + chat_sep=['<|im_end|>'], + suffix=['<|im_end|>'], + default_system='You are a helpful assistant', + )) diff --git a/tests/test_align/test_template/test_llm.py b/tests/test_align/test_template/test_llm.py index 3e451101b..c6e30bdb6 100644 --- a/tests/test_align/test_template/test_llm.py +++ b/tests/test_align/test_template/test_llm.py @@ -322,6 +322,14 @@ def test_mistral_small(): assert response == response2 +def test_moonlight(): + pt_engine = PtEngine('moonshotai/Moonlight-16B-A3B-Instruct') + res = _infer_model(pt_engine) + pt_engine.default_template.template_backend = 'jinja' + res2 = _infer_model(pt_engine) + assert res == res2, f'res: {res}, res2: {res2}' + + if __name__ == '__main__': from swift.llm import PtEngine, RequestConfig, get_template, get_model_tokenizer from swift.utils import get_logger, seed_everything @@ -351,4 +359,5 @@ def test_mistral_small(): # test_internlm3() # test_deepseek_r1_distill() # test_qwen2_5_prm() - test_mistral_small() + # test_mistral_small() + test_moonlight()