From 9173f22d0cfacf28be83e0ba9bc8c129bb8ba3ce Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 20 Feb 2025 00:10:31 +0800 Subject: [PATCH 1/6] update dataset --- ...345\222\214\346\225\260\346\215\256\351\233\206.md" | 10 ++++++++++ .../Instruction/Supported-models-and-datasets.md | 10 ++++++++++ swift/llm/dataset/data/dataset_info.json | 10 +++++++--- swift/llm/dataset/dataset/llm.py | 3 ++- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 5ef2ce6cc..ee790c58b 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -713,6 +713,7 @@ |[AI-ModelScope/LaTeX_OCR](https://modelscope.cn/datasets/AI-ModelScope/LaTeX_OCR)|default
human_handwrite
human_handwrite_print
synthetic_handwrite
small|162149|117.6±44.9, min=41, max=312|chat, ocr, multi-modal, vision|[linxy/LaTeX_OCR](https://huggingface.co/datasets/linxy/LaTeX_OCR)| |[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k)|default|11998|9941.8±3417.1, min=4695, max=25826|long-sequence, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)| |[AI-ModelScope/M3IT](https://modelscope.cn/datasets/AI-ModelScope/M3IT)|coco
vqa-v2
shapes
shapes-rephrased
coco-goi-rephrased
snli-ve
snli-ve-rephrased
okvqa
a-okvqa
viquae
textcap
docvqa
science-qa
imagenet
imagenet-open-ended
imagenet-rephrased
coco-goi
clevr
clevr-rephrased
nlvr
coco-itm
coco-itm-rephrased
vsr
vsr-rephrased
mocheg
mocheg-rephrased
coco-text
fm-iqa
activitynet-qa
msrvtt
ss
coco-cn
refcoco
refcoco-rephrased
multi30k
image-paragraph-captioning
visual-dialog
visual-dialog-rephrased
iqa
vcr
visual-mrc
ivqa
msrvtt-qa
msvd-qa
gqa
text-vqa
ocr-vqa
st-vqa
flickr8k-cn|huge dataset|-|chat, multi-modal, vision|-| +|[AI-ModelScope/MATH-lighteval](https://modelscope.cn/datasets/AI-ModelScope/MATH-lighteval)|default|7500|104.4±92.8, min=36, max=1683|grpo, math|[DigitalLearningGmbH/MATH-lighteval](https://huggingface.co/datasets/DigitalLearningGmbH/MATH-lighteval)| |[AI-ModelScope/Magpie-Qwen2-Pro-200K-Chinese](https://modelscope.cn/datasets/AI-ModelScope/Magpie-Qwen2-Pro-200K-Chinese)|default|200000|448.4±223.5, min=87, max=4098|chat, sft, 🔥, zh|[Magpie-Align/Magpie-Qwen2-Pro-200K-Chinese](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-200K-Chinese)| |[AI-ModelScope/Magpie-Qwen2-Pro-200K-English](https://modelscope.cn/datasets/AI-ModelScope/Magpie-Qwen2-Pro-200K-English)|default|200000|609.9±277.1, min=257, max=4098|chat, sft, 🔥, en|[Magpie-Align/Magpie-Qwen2-Pro-200K-English](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-200K-English)| |[AI-ModelScope/Magpie-Qwen2-Pro-300K-Filtered](https://modelscope.cn/datasets/AI-ModelScope/Magpie-Qwen2-Pro-300K-Filtered)|default|300000|556.6±288.6, min=175, max=4098|chat, sft, 🔥|[Magpie-Align/Magpie-Qwen2-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-300K-Filtered)| @@ -769,10 +770,13 @@ |[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd)|default
cls|45012|66.9±87.0, min=41, max=1699|text-generation, classification, 🔥|-| |-|default|huge dataset|-|pretrain, quality|[HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)| |-|auto_math_text
khanacademy
openstax
stanford
stories
web_samples_v1
web_samples_v2
wikihow|huge dataset|-|multi-domain, en, qa|[HuggingFaceTB/cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)| +|[HumanLLMs/Human-Like-DPO-Dataset](https://modelscope.cn/datasets/HumanLLMs/Human-Like-DPO-Dataset)|default|10884|47.5±7.9, min=32, max=85|rlhf, dpo|[HumanLLMs/Human-Like-DPO-Dataset](https://huggingface.co/datasets/HumanLLMs/Human-Like-DPO-Dataset)| +|[LLM-Research/xlam-function-calling-60k](https://modelscope.cn/datasets/LLM-Research/xlam-function-calling-60k)|default
grpo|120000|453.7±219.5, min=164, max=2779|agent|-| |[OmniData/Zhihu-KOL](https://modelscope.cn/datasets/OmniData/Zhihu-KOL)|default|huge dataset|-|zhihu, qa|[wangrui6/Zhihu-KOL](https://huggingface.co/datasets/wangrui6/Zhihu-KOL)| |[OmniData/Zhihu-KOL-More-Than-100-Upvotes](https://modelscope.cn/datasets/OmniData/Zhihu-KOL-More-Than-100-Upvotes)|default|271261|1003.4±1826.1, min=28, max=52541|zhihu, qa|[bzb2023/Zhihu-KOL-More-Than-100-Upvotes](https://huggingface.co/datasets/bzb2023/Zhihu-KOL-More-Than-100-Upvotes)| |[PowerInfer/LONGCOT-Refine-500K](https://modelscope.cn/datasets/PowerInfer/LONGCOT-Refine-500K)|default|521921|296.5±158.4, min=39, max=4634|chat, sft, 🔥, cot|[PowerInfer/LONGCOT-Refine-500K](https://huggingface.co/datasets/PowerInfer/LONGCOT-Refine-500K)| |[PowerInfer/QWQ-LONGCOT-500K](https://modelscope.cn/datasets/PowerInfer/QWQ-LONGCOT-500K)|default|498082|310.7±303.1, min=35, max=22941|chat, sft, 🔥, cot|[PowerInfer/QWQ-LONGCOT-500K](https://huggingface.co/datasets/PowerInfer/QWQ-LONGCOT-500K)| +|[ServiceNow-AI/R1-Distill-SFT](https://modelscope.cn/datasets/ServiceNow-AI/R1-Distill-SFT)|v0
v1|1850809|164.2±438.0, min=30, max=32469|chat, sft, cot, r1|[ServiceNow-AI/R1-Distill-SFT](https://huggingface.co/datasets/ServiceNow-AI/R1-Distill-SFT)| |[TIGER-Lab/MATH-plus](https://modelscope.cn/datasets/TIGER-Lab/MATH-plus)|train|893929|301.4±196.7, min=50, max=1162|qa, math, en, quality|[TIGER-Lab/MATH-plus](https://huggingface.co/datasets/TIGER-Lab/MATH-plus)| |[Tongyi-DataEngine/SA1B-Dense-Caption](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Dense-Caption)|default|huge dataset|-|zh, multi-modal, vqa|-| |[Tongyi-DataEngine/SA1B-Paired-Captions-Images](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Paired-Captions-Images)|default|7736284|106.4±18.5, min=48, max=193|zh, multi-modal, vqa|-| @@ -780,6 +784,7 @@ |[YorickHe/CoT_zh](https://modelscope.cn/datasets/YorickHe/CoT_zh)|default|74771|129.1±53.2, min=51, max=401|chat, general|-| |[ZhipuAI/LongWriter-6k](https://modelscope.cn/datasets/ZhipuAI/LongWriter-6k)|default|6000|5009.0±2932.8, min=117, max=30354|long, chat, sft, 🔥|[THUDM/LongWriter-6k](https://huggingface.co/datasets/THUDM/LongWriter-6k)| |-|default|huge dataset|-|pretrain, quality|[allenai/c4](https://huggingface.co/datasets/allenai/c4)| +|[bespokelabs/Bespoke-Stratos-17k](https://modelscope.cn/datasets/bespokelabs/Bespoke-Stratos-17k)|default|16710|480.7±236.1, min=266, max=3556|chat, sft, cot, r1|[bespokelabs/Bespoke-Stratos-17k](https://huggingface.co/datasets/bespokelabs/Bespoke-Stratos-17k)| |-|default|huge dataset|-|pretrain, quality|[cerebras/SlimPajama-627B](https://huggingface.co/datasets/cerebras/SlimPajama-627B)| |[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k)|default|27224|337.3±154.2, min=90, max=2826|chat, coding, 🔥|-| |[codefuse-ai/Evol-instruction-66k](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k)|default|66862|440.1±208.4, min=46, max=2661|chat, coding, 🔥|-| @@ -794,15 +799,20 @@ |[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro)|default|21910|1978.1±747.9, min=339, max=8064|chat, agent, multi-round, 🔥|-| |[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent)|default|30000|645.8±218.0, min=199, max=2070|chat, agent, multi-round, 🔥|-| |[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench)|default|316820|353.4±424.5, min=29, max=2924|chat, general, multi-round, 🔥|-| +|[liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://modelscope.cn/datasets/liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT)|default|110000|72.1±60.9, min=29, max=2315|chat, sft, cot, r1|[Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT)| |-|default|huge dataset|-|multi-modal, en, vqa, quality|[lmms-lab/GQA](https://huggingface.co/datasets/lmms-lab/GQA)| |-|0_30_s_academic_v0_1
0_30_s_youtube_v0_1
1_2_m_academic_v0_1
1_2_m_youtube_v0_1
2_3_m_academic_v0_1
2_3_m_youtube_v0_1
30_60_s_academic_v0_1
30_60_s_youtube_v0_1|1335486|273.7±78.8, min=107, max=638|chat, multi-modal, video|[lmms-lab/LLaVA-Video-178K](https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K)| |[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen)|default|97484|130.9±21.9, min=73, max=232|text-generation, 🔥|[shibing624/AdvertiseGen](https://huggingface.co/datasets/shibing624/AdvertiseGen)| |[mapjack/openwebtext_dataset](https://modelscope.cn/datasets/mapjack/openwebtext_dataset)|default|huge dataset|-|pretrain, zh, quality|-| |[modelscope/DuReader_robust-QG](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG)|default|17899|242.0±143.1, min=75, max=1416|text-generation, 🔥|-| +|[modelscope/MathR](https://modelscope.cn/datasets/modelscope/MathR)|default
clean|6089|188.7±75.3, min=64, max=3341|qa, math|-| +|[modelscope/MathR-32B-Distill](https://modelscope.cn/datasets/modelscope/MathR-32B-Distill)|data|25921|209.4±63.1, min=121, max=3407|qa, math|-| |[modelscope/chinese-poetry-collection](https://modelscope.cn/datasets/modelscope/chinese-poetry-collection)|default|1710|58.1±8.1, min=31, max=71|text-generation, poetry|-| |[modelscope/clue](https://modelscope.cn/datasets/modelscope/clue)|cmnli|391783|81.6±16.0, min=54, max=157|text-generation, classification|[clue](https://huggingface.co/datasets/clue)| |[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption)|train
validation|454617|389.6±68.4, min=70, max=587|chat, multi-modal, vision, 🔥|-| |[modelscope/gsm8k](https://modelscope.cn/datasets/modelscope/gsm8k)|main|7473|88.6±21.6, min=41, max=241|qa, math|-| +|[open-thoughts/OpenThoughts-114k](https://modelscope.cn/datasets/open-thoughts/OpenThoughts-114k)|default|113957|413.2±186.9, min=265, max=13868|chat, sft, cot, r1|[open-thoughts/OpenThoughts-114k](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)| +|[sentence-transformers/stsb](https://modelscope.cn/datasets/sentence-transformers/stsb)|default|5748|21.0±0.0, min=21, max=21|similarity, 🔥|-| |[shenweizhou/alpha-umi-toolbench-processed-v2](https://modelscope.cn/datasets/shenweizhou/alpha-umi-toolbench-processed-v2)|backbone
caller
planner
summarizer|huge dataset|-|chat, agent, 🔥|-| |[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3)|finance
finance_cls
medicine
medicine_cls|11021|296.0±153.3, min=65, max=2267|text-generation, classification, 🔥|[Hello-SimpleAI/HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3)| |[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese)|baike
baike_cls
open_qa
open_qa_cls
nlpcc_dbqa
nlpcc_dbqa_cls
finance
finance_cls
medicine
medicine_cls
law
law_cls
psychology
psychology_cls|39781|179.9±70.2, min=90, max=1070|text-generation, classification, 🔥|[Hello-SimpleAI/HC3-Chinese](https://huggingface.co/datasets/Hello-SimpleAI/HC3-Chinese)| diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index 63da4467b..ae058ba5b 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -714,6 +714,7 @@ The table below introduces information about the datasets integrated with ms-swi |[AI-ModelScope/LaTeX_OCR](https://modelscope.cn/datasets/AI-ModelScope/LaTeX_OCR)|default
human_handwrite
human_handwrite_print
synthetic_handwrite
small|162149|117.6±44.9, min=41, max=312|chat, ocr, multi-modal, vision|[linxy/LaTeX_OCR](https://huggingface.co/datasets/linxy/LaTeX_OCR)| |[AI-ModelScope/LongAlpaca-12k](https://modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k)|default|11998|9941.8±3417.1, min=4695, max=25826|long-sequence, QA|[Yukang/LongAlpaca-12k](https://huggingface.co/datasets/Yukang/LongAlpaca-12k)| |[AI-ModelScope/M3IT](https://modelscope.cn/datasets/AI-ModelScope/M3IT)|coco
vqa-v2
shapes
shapes-rephrased
coco-goi-rephrased
snli-ve
snli-ve-rephrased
okvqa
a-okvqa
viquae
textcap
docvqa
science-qa
imagenet
imagenet-open-ended
imagenet-rephrased
coco-goi
clevr
clevr-rephrased
nlvr
coco-itm
coco-itm-rephrased
vsr
vsr-rephrased
mocheg
mocheg-rephrased
coco-text
fm-iqa
activitynet-qa
msrvtt
ss
coco-cn
refcoco
refcoco-rephrased
multi30k
image-paragraph-captioning
visual-dialog
visual-dialog-rephrased
iqa
vcr
visual-mrc
ivqa
msrvtt-qa
msvd-qa
gqa
text-vqa
ocr-vqa
st-vqa
flickr8k-cn|huge dataset|-|chat, multi-modal, vision|-| +|[AI-ModelScope/MATH-lighteval](https://modelscope.cn/datasets/AI-ModelScope/MATH-lighteval)|default|7500|104.4±92.8, min=36, max=1683|grpo, math|[DigitalLearningGmbH/MATH-lighteval](https://huggingface.co/datasets/DigitalLearningGmbH/MATH-lighteval)| |[AI-ModelScope/Magpie-Qwen2-Pro-200K-Chinese](https://modelscope.cn/datasets/AI-ModelScope/Magpie-Qwen2-Pro-200K-Chinese)|default|200000|448.4±223.5, min=87, max=4098|chat, sft, 🔥, zh|[Magpie-Align/Magpie-Qwen2-Pro-200K-Chinese](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-200K-Chinese)| |[AI-ModelScope/Magpie-Qwen2-Pro-200K-English](https://modelscope.cn/datasets/AI-ModelScope/Magpie-Qwen2-Pro-200K-English)|default|200000|609.9±277.1, min=257, max=4098|chat, sft, 🔥, en|[Magpie-Align/Magpie-Qwen2-Pro-200K-English](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-200K-English)| |[AI-ModelScope/Magpie-Qwen2-Pro-300K-Filtered](https://modelscope.cn/datasets/AI-ModelScope/Magpie-Qwen2-Pro-300K-Filtered)|default|300000|556.6±288.6, min=175, max=4098|chat, sft, 🔥|[Magpie-Align/Magpie-Qwen2-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Qwen2-Pro-300K-Filtered)| @@ -770,10 +771,13 @@ The table below introduces information about the datasets integrated with ms-swi |[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd)|default
cls|45012|66.9±87.0, min=41, max=1699|text-generation, classification, 🔥|-| |-|default|huge dataset|-|pretrain, quality|[HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)| |-|auto_math_text
khanacademy
openstax
stanford
stories
web_samples_v1
web_samples_v2
wikihow|huge dataset|-|multi-domain, en, qa|[HuggingFaceTB/cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)| +|[HumanLLMs/Human-Like-DPO-Dataset](https://modelscope.cn/datasets/HumanLLMs/Human-Like-DPO-Dataset)|default|10884|47.5±7.9, min=32, max=85|rlhf, dpo|[HumanLLMs/Human-Like-DPO-Dataset](https://huggingface.co/datasets/HumanLLMs/Human-Like-DPO-Dataset)| +|[LLM-Research/xlam-function-calling-60k](https://modelscope.cn/datasets/LLM-Research/xlam-function-calling-60k)|default
grpo|120000|453.7±219.5, min=164, max=2779|agent|-| |[OmniData/Zhihu-KOL](https://modelscope.cn/datasets/OmniData/Zhihu-KOL)|default|huge dataset|-|zhihu, qa|[wangrui6/Zhihu-KOL](https://huggingface.co/datasets/wangrui6/Zhihu-KOL)| |[OmniData/Zhihu-KOL-More-Than-100-Upvotes](https://modelscope.cn/datasets/OmniData/Zhihu-KOL-More-Than-100-Upvotes)|default|271261|1003.4±1826.1, min=28, max=52541|zhihu, qa|[bzb2023/Zhihu-KOL-More-Than-100-Upvotes](https://huggingface.co/datasets/bzb2023/Zhihu-KOL-More-Than-100-Upvotes)| |[PowerInfer/LONGCOT-Refine-500K](https://modelscope.cn/datasets/PowerInfer/LONGCOT-Refine-500K)|default|521921|296.5±158.4, min=39, max=4634|chat, sft, 🔥, cot|[PowerInfer/LONGCOT-Refine-500K](https://huggingface.co/datasets/PowerInfer/LONGCOT-Refine-500K)| |[PowerInfer/QWQ-LONGCOT-500K](https://modelscope.cn/datasets/PowerInfer/QWQ-LONGCOT-500K)|default|498082|310.7±303.1, min=35, max=22941|chat, sft, 🔥, cot|[PowerInfer/QWQ-LONGCOT-500K](https://huggingface.co/datasets/PowerInfer/QWQ-LONGCOT-500K)| +|[ServiceNow-AI/R1-Distill-SFT](https://modelscope.cn/datasets/ServiceNow-AI/R1-Distill-SFT)|v0
v1|1850809|164.2±438.0, min=30, max=32469|chat, sft, cot, r1|[ServiceNow-AI/R1-Distill-SFT](https://huggingface.co/datasets/ServiceNow-AI/R1-Distill-SFT)| |[TIGER-Lab/MATH-plus](https://modelscope.cn/datasets/TIGER-Lab/MATH-plus)|train|893929|301.4±196.7, min=50, max=1162|qa, math, en, quality|[TIGER-Lab/MATH-plus](https://huggingface.co/datasets/TIGER-Lab/MATH-plus)| |[Tongyi-DataEngine/SA1B-Dense-Caption](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Dense-Caption)|default|huge dataset|-|zh, multi-modal, vqa|-| |[Tongyi-DataEngine/SA1B-Paired-Captions-Images](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Paired-Captions-Images)|default|7736284|106.4±18.5, min=48, max=193|zh, multi-modal, vqa|-| @@ -781,6 +785,7 @@ The table below introduces information about the datasets integrated with ms-swi |[YorickHe/CoT_zh](https://modelscope.cn/datasets/YorickHe/CoT_zh)|default|74771|129.1±53.2, min=51, max=401|chat, general|-| |[ZhipuAI/LongWriter-6k](https://modelscope.cn/datasets/ZhipuAI/LongWriter-6k)|default|6000|5009.0±2932.8, min=117, max=30354|long, chat, sft, 🔥|[THUDM/LongWriter-6k](https://huggingface.co/datasets/THUDM/LongWriter-6k)| |-|default|huge dataset|-|pretrain, quality|[allenai/c4](https://huggingface.co/datasets/allenai/c4)| +|[bespokelabs/Bespoke-Stratos-17k](https://modelscope.cn/datasets/bespokelabs/Bespoke-Stratos-17k)|default|16710|480.7±236.1, min=266, max=3556|chat, sft, cot, r1|[bespokelabs/Bespoke-Stratos-17k](https://huggingface.co/datasets/bespokelabs/Bespoke-Stratos-17k)| |-|default|huge dataset|-|pretrain, quality|[cerebras/SlimPajama-627B](https://huggingface.co/datasets/cerebras/SlimPajama-627B)| |[codefuse-ai/CodeExercise-Python-27k](https://modelscope.cn/datasets/codefuse-ai/CodeExercise-Python-27k)|default|27224|337.3±154.2, min=90, max=2826|chat, coding, 🔥|-| |[codefuse-ai/Evol-instruction-66k](https://modelscope.cn/datasets/codefuse-ai/Evol-instruction-66k)|default|66862|440.1±208.4, min=46, max=2661|chat, coding, 🔥|-| @@ -795,15 +800,20 @@ The table below introduces information about the datasets integrated with ms-swi |[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro)|default|21910|1978.1±747.9, min=339, max=8064|chat, agent, multi-round, 🔥|-| |[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent)|default|30000|645.8±218.0, min=199, max=2070|chat, agent, multi-round, 🔥|-| |[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench)|default|316820|353.4±424.5, min=29, max=2924|chat, general, multi-round, 🔥|-| +|[liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://modelscope.cn/datasets/liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT)|default|110000|72.1±60.9, min=29, max=2315|chat, sft, cot, r1|[Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT)| |-|default|huge dataset|-|multi-modal, en, vqa, quality|[lmms-lab/GQA](https://huggingface.co/datasets/lmms-lab/GQA)| |-|0_30_s_academic_v0_1
0_30_s_youtube_v0_1
1_2_m_academic_v0_1
1_2_m_youtube_v0_1
2_3_m_academic_v0_1
2_3_m_youtube_v0_1
30_60_s_academic_v0_1
30_60_s_youtube_v0_1|1335486|273.7±78.8, min=107, max=638|chat, multi-modal, video|[lmms-lab/LLaVA-Video-178K](https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K)| |[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen)|default|97484|130.9±21.9, min=73, max=232|text-generation, 🔥|[shibing624/AdvertiseGen](https://huggingface.co/datasets/shibing624/AdvertiseGen)| |[mapjack/openwebtext_dataset](https://modelscope.cn/datasets/mapjack/openwebtext_dataset)|default|huge dataset|-|pretrain, zh, quality|-| |[modelscope/DuReader_robust-QG](https://modelscope.cn/datasets/modelscope/DuReader_robust-QG)|default|17899|242.0±143.1, min=75, max=1416|text-generation, 🔥|-| +|[modelscope/MathR](https://modelscope.cn/datasets/modelscope/MathR)|default
clean|6089|188.7±75.3, min=64, max=3341|qa, math|-| +|[modelscope/MathR-32B-Distill](https://modelscope.cn/datasets/modelscope/MathR-32B-Distill)|data|25921|209.4±63.1, min=121, max=3407|qa, math|-| |[modelscope/chinese-poetry-collection](https://modelscope.cn/datasets/modelscope/chinese-poetry-collection)|default|1710|58.1±8.1, min=31, max=71|text-generation, poetry|-| |[modelscope/clue](https://modelscope.cn/datasets/modelscope/clue)|cmnli|391783|81.6±16.0, min=54, max=157|text-generation, classification|[clue](https://huggingface.co/datasets/clue)| |[modelscope/coco_2014_caption](https://modelscope.cn/datasets/modelscope/coco_2014_caption)|train
validation|454617|389.6±68.4, min=70, max=587|chat, multi-modal, vision, 🔥|-| |[modelscope/gsm8k](https://modelscope.cn/datasets/modelscope/gsm8k)|main|7473|88.6±21.6, min=41, max=241|qa, math|-| +|[open-thoughts/OpenThoughts-114k](https://modelscope.cn/datasets/open-thoughts/OpenThoughts-114k)|default|113957|413.2±186.9, min=265, max=13868|chat, sft, cot, r1|[open-thoughts/OpenThoughts-114k](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)| +|[sentence-transformers/stsb](https://modelscope.cn/datasets/sentence-transformers/stsb)|default|5748|21.0±0.0, min=21, max=21|similarity, 🔥|-| |[shenweizhou/alpha-umi-toolbench-processed-v2](https://modelscope.cn/datasets/shenweizhou/alpha-umi-toolbench-processed-v2)|backbone
caller
planner
summarizer|huge dataset|-|chat, agent, 🔥|-| |[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3)|finance
finance_cls
medicine
medicine_cls|11021|296.0±153.3, min=65, max=2267|text-generation, classification, 🔥|[Hello-SimpleAI/HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3)| |[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese)|baike
baike_cls
open_qa
open_qa_cls
nlpcc_dbqa
nlpcc_dbqa_cls
finance
finance_cls
medicine
medicine_cls
law
law_cls
psychology
psychology_cls|39781|179.9±70.2, min=90, max=1070|text-generation, classification, 🔥|[Hello-SimpleAI/HC3-Chinese](https://huggingface.co/datasets/Hello-SimpleAI/HC3-Chinese)| diff --git a/swift/llm/dataset/data/dataset_info.json b/swift/llm/dataset/data/dataset_info.json index 2cf38c956..35e4c48f6 100644 --- a/swift/llm/dataset/data/dataset_info.json +++ b/swift/llm/dataset/data/dataset_info.json @@ -640,12 +640,16 @@ "tags": ["rlhf", "dpo"] }, { - "ms_dataset_id": "DigitalLearningGmbH/MATH-lighteval", - "hf_dataset_id": "AI-ModelScope/MATH-lighteval", + "ms_dataset_id": "AI-ModelScope/MATH-lighteval", + "hf_dataset_id": "DigitalLearningGmbH/MATH-lighteval", "columns": { "problem": "query" }, "tags": ["grpo", "math"] - + }, + { + "ms_dataset_id": "liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT", + "hf_dataset_id": "Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT", + "tags": ["chat", "sft", "cot", "r1"] } ] diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py index 5a4bdc523..e102d78ec 100644 --- a/swift/llm/dataset/dataset/llm.py +++ b/swift/llm/dataset/dataset/llm.py @@ -427,10 +427,11 @@ def preprocess(self, row: Dict[str, Any], all_tools=None) -> Optional[Dict[str, register_dataset( DatasetMeta(ms_dataset_id='modelscope/MathR', subsets=['default', 'clean'], split=['train'], tags=['qa', 'math'])) + register_dataset( DatasetMeta( ms_dataset_id='modelscope/MathR-32B-Distill', - subsets=['default', 'clean'], + subsets=['data'], split=['train'], tags=['qa', 'math'])) From cd36488d5385a94bf1e3a36f59459a14ed37041d Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 20 Feb 2025 00:39:48 +0800 Subject: [PATCH 2/6] lint pass --- swift/llm/dataset/dataset/llm.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py index e102d78ec..01ad277ce 100644 --- a/swift/llm/dataset/dataset/llm.py +++ b/swift/llm/dataset/dataset/llm.py @@ -429,11 +429,7 @@ def preprocess(self, row: Dict[str, Any], all_tools=None) -> Optional[Dict[str, DatasetMeta(ms_dataset_id='modelscope/MathR', subsets=['default', 'clean'], split=['train'], tags=['qa', 'math'])) register_dataset( - DatasetMeta( - ms_dataset_id='modelscope/MathR-32B-Distill', - subsets=['data'], - split=['train'], - tags=['qa', 'math'])) + DatasetMeta(ms_dataset_id='modelscope/MathR-32B-Distill', subsets=['data'], split=['train'], tags=['qa', 'math'])) class HC3Preprocessor(ResponsePreprocessor): From 1dddccfd9dfee9a286363ecc886730c2323cfade Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 20 Feb 2025 10:19:09 +0800 Subject: [PATCH 3/6] refactor grpo dataset --- swift/llm/dataset/dataset/llm.py | 33 +++--------------------- swift/llm/dataset/preprocessor/core.py | 35 ++++++++++++++------------ 2 files changed, 22 insertions(+), 46 deletions(-) diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py index 01ad277ce..78437192f 100644 --- a/swift/llm/dataset/dataset/llm.py +++ b/swift/llm/dataset/dataset/llm.py @@ -397,19 +397,6 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: register_dataset(DatasetMeta(ms_dataset_id='swift/ToolBench', tags=['chat', 'agent', 'multi-round'])) - -class CompetitionMathPreprocessor(ResponsePreprocessor): - - def preprocess(self, row: Dict[str, Any], all_tools=None) -> Optional[Dict[str, Any]]: - query = row['problem'] - response = row['solution'] - row = { - 'query': query, - 'response': response, - } - return super().preprocess(row) - - register_dataset( 
DatasetMeta( ms_dataset_id='tastelikefeet/competition_math', @@ -418,7 +405,6 @@ def preprocess(self, row: Dict[str, Any], all_tools=None) -> Optional[Dict[str, name='default', subset='default', split=['train', 'test'], - preprocess_func=CompetitionMathPreprocessor(), ), ], tags=['qa', 'math'])) @@ -584,28 +570,15 @@ def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]: name = answer['name'] args = json.dumps(answer['arguments']) response = f'Action: {name}\nAction Input: {args}' - key = 'response' if self.response else 'solution' - row = {'query': query, key: response, 'tools': row['tools']} + row = {'query': query, 'response': response, 'solution': response, 'tools': row['tools']} return super().preprocess(row) register_dataset( DatasetMeta( ms_dataset_id='LLM-Research/xlam-function-calling-60k', - subsets=[ - SubsetDataset( - name='default', - subset='dataset', - split=['train'], - preprocess_func=XlamFunctionCallingPreprocessor(response=True), - ), - SubsetDataset( - name='grpo', - subset='dataset', - split=['train'], - preprocess_func=XlamFunctionCallingPreprocessor(response=False), - ), - ], + subsets=['dataset'], + preprocess_func=XlamFunctionCallingPreprocessor(), tags=['agent'])) diff --git a/swift/llm/dataset/preprocessor/core.py b/swift/llm/dataset/preprocessor/core.py index 91374484d..3efea9dbc 100644 --- a/swift/llm/dataset/preprocessor/core.py +++ b/swift/llm/dataset/preprocessor/core.py @@ -129,11 +129,13 @@ def rows_to_batched(rows: List[Dict[str, Any]]): return batched @staticmethod - def _fix_streaming_keys(row): + def _remove_prefix_keys(row, prefix: str): for k in list(row.keys()): - if k.startswith('__@'): - new_k = k[len('__@'):] - row[new_k] = row.pop(k) + if k.startswith(prefix): + new_k = k[len(prefix):] + new_v = row.pop(k) + if new_k not in row: + row[new_k] = new_v @staticmethod def _check_objects(row): @@ -160,7 +162,7 @@ def batched_preprocess(self, batched_row: Dict[str, Any], *, strict: bool, from ...template import MaxLengthError batched_row = dict(batched_row) assert len(batched_row) > 0 - self._fix_streaming_keys(batched_row) + self._remove_prefix_keys(batched_row, '__@') # compat streaming rows = self.batched_to_rows(batched_row) new_rows = [] @@ -191,7 +193,7 @@ def batched_preprocess(self, batched_row: Dict[str, Any], *, strict: bool, row = [] new_rows += row res = self.rows_to_batched(new_rows) - + self._remove_prefix_keys(res, '__#') # compat GRPO if len(res) == 0: res['messages'] = [] @@ -285,21 +287,22 @@ def __call__( if self.dataset_sample is not None: dataset = sample_dataset(dataset, self.dataset_sample, self.random_state) + map_kwargs = {'batched': True, 'batch_size': batch_size} + if isinstance(dataset, HfDataset): + map_kwargs['num_proc'] = num_proc + # compat GRPO: The solution field will be retained. 
+ dataset = RowPreprocessor.get_features_dataset(dataset) + if 'solution' in dataset.features: + dataset = dataset.map(lambda x: {'__#solution': x['solution']}, **map_kwargs) dataset = self._rename_columns(dataset) dataset = self.prepare_dataset(dataset) dataset = self._cast_pil_image(dataset) - map_kwargs = {} - ignore_max_length_error = False - if isinstance(dataset, HfDataset): - map_kwargs['num_proc'] = num_proc - if num_proc > 1: - ignore_max_length_error = True + + ignore_max_length_error = True if isinstance(dataset, HfDataset) and num_proc > 1 else False with self._patch_arrow_writer(): try: dataset_mapped = dataset.map( self.batched_preprocess, - batched=True, - batch_size=batch_size, fn_kwargs={ 'strict': strict, 'ignore_max_length_error': ignore_max_length_error @@ -321,8 +324,8 @@ class ResponsePreprocessor(RowPreprocessor): def __init__(self, *, columns: Optional[Dict[str, str]] = None, **kwargs) -> None: super().__init__(columns=columns, **kwargs) system_keys = ['system', 'system_prompt'] - query_keys = ['query', 'prompt', 'input', 'instruction', 'question'] - response_keys = ['response', 'answer', 'output', 'targets', 'target', 'answer_key', 'answers' + query_keys = ['query', 'prompt', 'input', 'instruction', 'question', 'problem'] + response_keys = ['response', 'answer', 'output', 'targets', 'target', 'answer_key', 'answers', 'solution' ] + ['text', 'completion', 'content'] for key in system_keys: self.columns[key] = 'system' From 53eac75e101d9732b2a05862ba2c3ac400630d72 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 20 Feb 2025 11:15:45 +0800 Subject: [PATCH 4/6] fix sample --- examples/train/grpo/full_vllm.sh | 6 +++--- swift/llm/infer/infer_engine/infer_engine.py | 3 --- swift/llm/sampling/utils.py | 6 ++++-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/train/grpo/full_vllm.sh b/examples/train/grpo/full_vllm.sh index 6a96641ce..7100bc24b 100644 --- a/examples/train/grpo/full_vllm.sh +++ b/examples/train/grpo/full_vllm.sh @@ -7,7 +7,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ NPROC_PER_NODE=7 \ swift rlhf \ --rlhf_type grpo \ - --model Qwen/Qwen2.5-7B-Instruct \ + --model Qwen/Qwen2.5-7B \ --reward_funcs accuracy format \ --use_vllm true \ --vllm_device auto \ @@ -15,12 +15,12 @@ swift rlhf \ --vllm_max_model_len 8192 \ --train_type full \ --torch_dtype bfloat16 \ - --dataset 'AI-MO/NuminaMath-TIR#5000' \ + --dataset 'AI-MO/NuminaMath-TIR#2000' \ --max_completion_length 2048 \ --num_train_epochs 1 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ - --learning_rate 1e-6 \ + --learning_rate 5e-7 \ --gradient_accumulation_steps 2 \ --eval_steps 200 \ --save_steps 200 \ diff --git a/swift/llm/infer/infer_engine/infer_engine.py b/swift/llm/infer/infer_engine/infer_engine.py index 2934e2dda..fc2fa6ab2 100644 --- a/swift/llm/infer/infer_engine/infer_engine.py +++ b/swift/llm/infer/infer_engine/infer_engine.py @@ -122,9 +122,6 @@ def _update_metrics(result, metrics: Optional[List[Metric]] = None): metric.update(response) return result_origin - def __call__(self, *args, **kwargs): - return self.infer(*args, **kwargs) - def infer(self, infer_requests: List[InferRequest], request_config: Optional[RequestConfig] = None, diff --git a/swift/llm/sampling/utils.py b/swift/llm/sampling/utils.py index 3d48c6a53..b8ed30fb1 100644 --- a/swift/llm/sampling/utils.py +++ b/swift/llm/sampling/utils.py @@ -38,11 +38,13 @@ def get_reward(model: Any, Index 0: The min-max normalized scores matched the infer_requests Index 1: The mask filtered by 
the threshold """ - parameters = inspect.signature(model.infer).parameters + from swift.llm import InferEngine + infer_func = model.infer if isinstance(model, InferEngine) else model.__call__ + parameters = inspect.signature(infer_func).parameters gt_param = {} if 'ground_truths' in parameters: gt_param = {'ground_truths': ground_truths} - rewards = model(infer_requests, request_config=request_config, **gt_param) + rewards = infer_func(infer_requests, request_config=request_config, **gt_param) from swift.llm.infer.protocol import ChatCompletionResponse if isinstance(rewards[0], ChatCompletionResponse): rewards = [float(r.choices[0].message.content) for r in rewards] From 153e15f14d4a144f3045c2b1e79188d6a63a6ca7 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 20 Feb 2025 11:16:55 +0800 Subject: [PATCH 5/6] update --- examples/train/grpo/full_vllm.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/train/grpo/full_vllm.sh b/examples/train/grpo/full_vllm.sh index 7100bc24b..6a96641ce 100644 --- a/examples/train/grpo/full_vllm.sh +++ b/examples/train/grpo/full_vllm.sh @@ -7,7 +7,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ NPROC_PER_NODE=7 \ swift rlhf \ --rlhf_type grpo \ - --model Qwen/Qwen2.5-7B \ + --model Qwen/Qwen2.5-7B-Instruct \ --reward_funcs accuracy format \ --use_vllm true \ --vllm_device auto \ @@ -15,12 +15,12 @@ swift rlhf \ --vllm_max_model_len 8192 \ --train_type full \ --torch_dtype bfloat16 \ - --dataset 'AI-MO/NuminaMath-TIR#2000' \ + --dataset 'AI-MO/NuminaMath-TIR#5000' \ --max_completion_length 2048 \ --num_train_epochs 1 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ - --learning_rate 5e-7 \ + --learning_rate 1e-6 \ --gradient_accumulation_steps 2 \ --eval_steps 200 \ --save_steps 200 \ From fa9ef3d3d816512031577d694f1eac33121f7e76 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 20 Feb 2025 11:37:53 +0800 Subject: [PATCH 6/6] update --- swift/llm/dataset/dataset/llm.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py index 78437192f..60a7a3bbc 100644 --- a/swift/llm/dataset/dataset/llm.py +++ b/swift/llm/dataset/dataset/llm.py @@ -557,10 +557,6 @@ def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]: class XlamFunctionCallingPreprocessor(ResponsePreprocessor): - def __init__(self, response=True): - self.response = response - super().__init__() - def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]: query = row['query'] answers = row['response']
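
Note on the `__#` prefix bookkeeping introduced in PATCH 3/6 (swift/llm/dataset/preprocessor/core.py): it is a protect-and-restore pattern. The `solution` column is copied under the reserved `__#` prefix before the batched row preprocessing runs, and `_remove_prefix_keys(res, '__#')` moves it back afterwards without overwriting a key the preprocessor already produced, so GRPO reward functions still receive the original solution even when the preprocessor only emits conversation fields. The following is a minimal standalone sketch of that pattern; the helper names are illustrative only and are not part of the ms-swift API.

from typing import Any, Dict

RESERVED_PREFIX = '__#'  # keys copied under this prefix survive a lossy preprocessing step


def protect_column(row: Dict[str, Any], key: str) -> Dict[str, Any]:
    # Copy the column under a reserved name so the preprocessor cannot drop it.
    if key in row:
        row[RESERVED_PREFIX + key] = row[key]
    return row


def restore_prefixed(row: Dict[str, Any], prefix: str = RESERVED_PREFIX) -> Dict[str, Any]:
    # Mirrors _remove_prefix_keys: move '__#xxx' back to 'xxx',
    # never overwriting a value the preprocessor has already set.
    for k in list(row.keys()):
        if k.startswith(prefix):
            value = row.pop(k)
            new_k = k[len(prefix):]
            if new_k not in row:
                row[new_k] = value
    return row


def lossy_preprocess(row: Dict[str, Any]) -> Dict[str, Any]:
    # Stand-in for a preprocessor that only keeps the conversation fields.
    keep = ('query', 'response')
    return {k: v for k, v in row.items() if k in keep or k.startswith(RESERVED_PREFIX)}


row = {'query': '1 + 1 = ?', 'response': '2', 'solution': '2'}
row = restore_prefixed(lossy_preprocess(protect_column(row, 'solution')))
assert row == {'query': '1 + 1 = ?', 'response': '2', 'solution': '2'}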