From bccf84de6cda0352a11297a2c8a77f7cacc08c44 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Fri, 21 Feb 2025 17:46:29 +0800 Subject: [PATCH 01/24] update distill docs (#3216) --- README.md | 1 + README_CN.md | 1 + examples/sampler/distill/distill.sh | 11 +++++++++++ examples/sampler/{ => mcts}/mcts.py | 0 examples/sampler/{ => mcts}/mcts.sh | 0 examples/sampler/{ => mcts}/system_prompt.txt | 0 6 files changed, 13 insertions(+) create mode 100644 examples/sampler/distill/distill.sh rename examples/sampler/{ => mcts}/mcts.py (100%) rename examples/sampler/{ => mcts}/mcts.sh (100%) rename examples/sampler/{ => mcts}/system_prompt.txt (100%) diff --git a/README.md b/README.md index 7542762b55..9fa14059aa 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,7 @@ You can contact us and communicate with us by adding our group: ## 🎉 News +- 🎁 2025.02.21: Support distill from LLM API,Please check[this example](examples/sampler/distill/distill.sh) - 🎁 2025.02.17: Support SwanLab, just add [a few of arguments](docs/source_en/Instruction/Command-line-parameters.md#swanlab) you can use swanlab to analysis your training results - 🎁 2025.02.16: Support LMDeploy in GRPO, use `--use_lmdeploy true`. Please check [this script](examples/train/grpo/full_lmdeploy.sh) - 🔥 2025.02.12: Support for GRPO(Group Relative Policy Optimization) algorithm for llm and mllm, document can be found in [here](docs/source_en/Instruction/GRPO.md) diff --git a/README_CN.md b/README_CN.md index 751bb5a818..fd3eade659 100644 --- a/README_CN.md +++ b/README_CN.md @@ -74,6 +74,7 @@ - **模型量化**:支持AWQ、GPTQ和BNB的量化导出,导出的模型支持使用vLLM/LmDeploy推理加速,并支持继续训练。 ## 🎉 新闻 +- 🎁 2025.02.21: 支持大模型API蒸馏采样,请查看[示例](examples/sampler/distill/distill.sh) - 🎁 2025.02.17: 支持SwanLab, 仅需添加[几个新的参数](docs/source/Instruction/命令行参数.md#swanlab)就可以在swanlab上验证你的训练效果 - 🎁 2025.02.16: 在GRPO算法中支持LMDeploy, 请查看`--use_lmdeploy true`. 
具体参考[这个脚本](examples/train/grpo/full_lmdeploy.sh) - 🔥 2025.02.12: 支持GRPO(Group Relative Policy Optimization) 训练算法,训练脚本可以在[这里](docs/source/Instruction/GRPO.md)找到 diff --git a/examples/sampler/distill/distill.sh b/examples/sampler/distill/distill.sh new file mode 100644 index 0000000000..895017b903 --- /dev/null +++ b/examples/sampler/distill/distill.sh @@ -0,0 +1,11 @@ +OPENAI_API_KEY="xxx" \ +swift sample \ + --sampler_type distill \ + --sampler_engine client \ + --model deepseek-r1 \ + --stream true \ + --dataset tastelikefeet/competition_math#5 \ + --num_return_sequences 1 \ + --temperature 0.6 \ + --top_p 0.95 \ + --engine_kwargs '{"base_url":"https://dashscope.aliyuncs.com/compatible-mode/v1"}' diff --git a/examples/sampler/mcts.py b/examples/sampler/mcts/mcts.py similarity index 100% rename from examples/sampler/mcts.py rename to examples/sampler/mcts/mcts.py diff --git a/examples/sampler/mcts.sh b/examples/sampler/mcts/mcts.sh similarity index 100% rename from examples/sampler/mcts.sh rename to examples/sampler/mcts/mcts.sh diff --git a/examples/sampler/system_prompt.txt b/examples/sampler/mcts/system_prompt.txt similarity index 100% rename from examples/sampler/system_prompt.txt rename to examples/sampler/mcts/system_prompt.txt From b73b549df1a4cf5f7e9fcea97d008a9e7410b146 Mon Sep 17 00:00:00 2001 From: jinghanhu Date: Fri, 21 Feb 2025 18:26:06 +0800 Subject: [PATCH 02/24] compatible with trl0.16 (#3209) * beta=0 * profiling_decorator * gradient_checkpointing * temp * update * update requirement * fix import * restore prepare_inputs * update * update requirement * rm multi step args * update args * update * update docs --------- Co-authored-by: hjh Co-authored-by: Jintao Huang --- README.md | 2 +- README_CN.md | 2 +- .../SWIFT\345\256\211\350\243\205.md" | 2 +- docs/source/Instruction/GRPO.md | 4 +- ...44\350\241\214\345\217\202\346\225\260.md" | 2 + .../GetStarted/SWIFT-installation.md | 2 +- .../Instruction/Command-line-parameters.md | 2 + docs/source_en/Instruction/GRPO.md | 4 +- examples/train/grpo/full_vllm.sh | 2 +- examples/train/grpo/grpo.sh | 2 +- examples/train/grpo/lora_vllm.sh | 2 +- examples/train/grpo/multi_node/multi_node1.sh | 2 +- examples/train/grpo/plugin/run_external_rm.sh | 2 +- requirements/framework.txt | 2 +- swift/llm/argument/rlhf_args.py | 8 +- swift/trainers/arguments.py | 1 - swift/trainers/rlhf_trainer/grpo_trainer.py | 75 ++++++++++++++----- 17 files changed, 82 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 9fa14059aa..470961c7cd 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,7 @@ Running Environment: | transformers | >=4.33 | 4.48.3 | | | modelscope | >=1.19 | | | | peft | >=0.11.0,<0.15.0 | | | -| trl | >=0.13,<0.16 | 0.15 | RLHF | +| trl | >=0.13,<0.17 | 0.15 | RLHF | | deepspeed | >=0.14 | | Training | | vllm | >=0.5.1 | 0.7.2 | Inference/Deployment/Evaluation | | lmdeploy | lmdeploy>=0.5,<0.6.5 | 0.6.4 | Inference/Deployment/Evaluation | diff --git a/README_CN.md b/README_CN.md index fd3eade659..c98fc846d7 100644 --- a/README_CN.md +++ b/README_CN.md @@ -112,7 +112,7 @@ pip install -e . 
| transformers | >=4.33 | 4.48.3 || | modelscope | >=1.19 | || | peft | >=0.11.0,<0.15.0 | || -| trl | >=0.13,<0.16 | 0.15 |RLHF| +| trl | >=0.13,<0.17 | 0.15 |RLHF| | deepspeed | >=0.14 | |训练| | vllm | >=0.5.1 | 0.7.2 |推理/部署/评测| | lmdeploy | lmdeploy>=0.5,<0.6.5 | 0.6.4 |推理/部署/评测| diff --git "a/docs/source/GetStarted/SWIFT\345\256\211\350\243\205.md" "b/docs/source/GetStarted/SWIFT\345\256\211\350\243\205.md" index 2c2ad44386..22149b3d95 100644 --- "a/docs/source/GetStarted/SWIFT\345\256\211\350\243\205.md" +++ "b/docs/source/GetStarted/SWIFT\345\256\211\350\243\205.md" @@ -60,7 +60,7 @@ pip install ms-swift==2.* | transformers | >=4.33 | 4.48.3 || | modelscope | >=1.19 | || | peft | >=0.11.0,<0.15.0 | || -| trl | >=0.13,<0.16 | 0.15 |RLHF| +| trl | >=0.13,<0.17 | 0.15 |RLHF| | deepspeed | >=0.14 | |训练| | vllm | >=0.5.1 | 0.7.2 |推理/部署/评测| | lmdeploy | lmdeploy>=0.5,<0.6.5 | 0.6.4 |推理/部署/评测| diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md index 15f306f1da..df60beddd9 100644 --- a/docs/source/Instruction/GRPO.md +++ b/docs/source/Instruction/GRPO.md @@ -7,7 +7,7 @@ 环境安装 ```bash pip install math_verify # reward function -pip install "trl>=0.15" +pip install git+https://github.com/huggingface/trl.git" ``` **注意**:训练过程中 loss 接近0 是正常情况, 参考[issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851) @@ -95,6 +95,8 @@ A conversation between User and Assistant. The user asks a question, and the Ass - vllm_gpu_memory_utilization: vLLM透传参数 - vllm_max_model_len: vLLM透传参数 - reward_model: 同model, 使用奖励模型作为奖励函数,与reward_funcs至少需要指定一个 +- num_iterations: 每个批次代更新次数,默认为1. +- epsilon: clip 系数 奖励函数超参,见[内置奖励函数](#内置奖励函数) diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 23bbb2ad4f..f04913b221 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -384,6 +384,8 @@ reward模型参数将在PPO、GRPO中使用。 - top_k: 默认为50 - top_p: 默认为0.9 - repetition_penalty: 重复惩罚项。默认为1. +- num_iterations: 每个批次代更新次数,默认为1. +- epsilon: clip 系数 cosine 奖励参数 - cosine_min_len_value_wrong:cosine 奖励函数参数,生成错误答案时,最小长度对应的奖励值。默认值为0.0 diff --git a/docs/source_en/GetStarted/SWIFT-installation.md b/docs/source_en/GetStarted/SWIFT-installation.md index 398f9145e5..95c7cc94ee 100644 --- a/docs/source_en/GetStarted/SWIFT-installation.md +++ b/docs/source_en/GetStarted/SWIFT-installation.md @@ -61,7 +61,7 @@ You can view the image [here](https://modelscope.cn/docs/intro/environment-setup | transformers | >=4.33 | 4.48.3 | | | modelscope | >=1.19 | | | | peft | >=0.11.0,<0.15.0 | | | -| trl | >=0.13,<0.16 | 0.15 | RLHF | +| trl | >=0.13,<0.17 | 0.15 | RLHF | | deepspeed | >=0.14 | | Training | | vllm | >=0.5.1 | 0.7.2 | Inference/Deployment/Evaluation | | lmdeploy | lmdeploy>=0.5,<0.6.5 | 0.6.4 | Inference/Deployment/Evaluation | diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 0eaa308417..096dbafcac 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -395,6 +395,8 @@ The meanings of the following parameters can be referenced [here](https://huggin - top_k: Default is 50. - top_p: Default is 0.9. - repetition_penalty: Repetition penalty term. Default is 1. 
+- num_iterations: number of iterations per batch. Default is 1. +- epsilon: epsilon value for clipping. Default is 0.2 cosine reward function arguments - `cosine_min_len_value_wrong` (default: 0.0): Reward value corresponding to the minimum length when the answer is incorrect. Default is 0.0 diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md index 7d854ee62d..f4a0e1bf0f 100644 --- a/docs/source_en/Instruction/GRPO.md +++ b/docs/source_en/Instruction/GRPO.md @@ -8,7 +8,7 @@ environments ```bash pip install math_verify # reward function -pip install "trl>=0.15" +pip install git+https://github.com/huggingface/trl.git" ``` **Note**: It is normal for the loss to approach zero during training. Refer to this [issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851) for more details. @@ -97,6 +97,8 @@ Hyperparameters - vllm_gpu_memory_utilization: vLLM pass-through parameter. - vllm_max_model_len: vLLM pass-through parameter. - reward_model: Same as the model, using a reward model as a reward function. At least one of reward_funcs and reward_model needs to be specified. +- num_iterations: number of iterations per batch. Default is 1. +- epsilon: epsilon value for clipping. Default is 0.2 The hyperparameters for the reward function can be found in the [Built-in Reward Functions section](#built-in-reward-functions). diff --git a/examples/train/grpo/full_vllm.sh b/examples/train/grpo/full_vllm.sh index 6a96641ce3..5f3ec364aa 100644 --- a/examples/train/grpo/full_vllm.sh +++ b/examples/train/grpo/full_vllm.sh @@ -1,6 +1,6 @@ # One GPU is left for vLLM inference acceleration. # pip install math_verify # reward function -# pip install "trl>=0.15" +# pip install git+https://github.com/huggingface/trl.git # GPU memory: 8 * 80GiB CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ diff --git a/examples/train/grpo/grpo.sh b/examples/train/grpo/grpo.sh index 952d3b59da..4f805c5c45 100644 --- a/examples/train/grpo/grpo.sh +++ b/examples/train/grpo/grpo.sh @@ -1,5 +1,5 @@ # pip install math_verify # reward function -# pip install "trl>=0.15" +# pip install git+https://github.com/huggingface/trl.git # GPU memory: 80GiB # You can set `--reward_model` to use a reward model to provide rewards. 
CUDA_VISIBLE_DEVICES=0 \ diff --git a/examples/train/grpo/lora_vllm.sh b/examples/train/grpo/lora_vllm.sh index a5be714283..73d35f215b 100644 --- a/examples/train/grpo/lora_vllm.sh +++ b/examples/train/grpo/lora_vllm.sh @@ -1,5 +1,5 @@ # pip install math_verify # reward function -# pip install "trl>=0.15" +# pip install git+https://github.com/huggingface/trl.git # GPU memory: 2 * 80GiB MASTER_PORT=29501 \ diff --git a/examples/train/grpo/multi_node/multi_node1.sh b/examples/train/grpo/multi_node/multi_node1.sh index f4526157bc..e092fad91e 100755 --- a/examples/train/grpo/multi_node/multi_node1.sh +++ b/examples/train/grpo/multi_node/multi_node1.sh @@ -1,5 +1,5 @@ # pip install math_verify # reward function -# pip install "trl>=0.15" +# pip install git+https://github.com/huggingface/trl.git export CUDA_VISIBLE_DEVICES=0,1,2,3 export NNODES=2 export NODE_RANK=0 diff --git a/examples/train/grpo/plugin/run_external_rm.sh b/examples/train/grpo/plugin/run_external_rm.sh index c0d232b666..84b7800400 100644 --- a/examples/train/grpo/plugin/run_external_rm.sh +++ b/examples/train/grpo/plugin/run_external_rm.sh @@ -1,5 +1,5 @@ # pip install math_verify # reward function -# pip install "trl>=0.15" +# pip install git+https://github.com/huggingface/trl.git # GPU memory: 80GiB CUDA_VISIBLE_DEVICES=0 \ diff --git a/requirements/framework.txt b/requirements/framework.txt index a720819f48..3072671f37 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -30,6 +30,6 @@ tiktoken tqdm transformers>=4.33,<4.50 transformers_stream_generator -trl>=0.13,<0.16 +trl>=0.13,<0.17 uvicorn zstandard diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py index 2d3907def2..8f175cc77e 100644 --- a/swift/llm/argument/rlhf_args.py +++ b/swift/llm/argument/rlhf_args.py @@ -53,6 +53,10 @@ class GRPOArguments(GRPOArgumentsMixin): vllm_gpu_memory_utilization: float = 0.9 vllm_max_model_len: Optional[int] = None + # multi step + num_iterations: int = 1 + epsilon: float = 0.2 + @dataclass class RLHFArguments(GRPOArguments, PPOArguments, RewardModelArguments, TrainArguments): @@ -113,7 +117,9 @@ def __post_init__(self): self.loss_scale = 'default' else: self.loss_scale = 'last_round' - if self.rlhf_type in ['dpo', 'kto', 'ppo', 'grpo'] and self.train_type == 'full': + if self.rlhf_type == 'grpo' and self.beta == 0.0: + self.ref_model = None + elif self.rlhf_type in ['dpo', 'kto', 'ppo', 'grpo'] and self.train_type == 'full': self.ref_model = self.ref_model or self.model self.ref_model_type = self.ref_model_type or self.model_type self.ref_model_revision = self.ref_model_revision or self.model_revision diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index b5c4e7e09d..9410b8f7fe 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -70,7 +70,6 @@ def place_model_on_device(self): @dataclass class GRPOArgumentsMixin: - # vllm_device, vllm_gpu_memory_utilization, and vllm_max_model_len are defined in HfGRPOConfig. 
num_infer_workers: int = 1 vllm_max_num_seqs: int = 256 diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index c86b9d3048..666ddc3c8c 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -11,7 +11,6 @@ import torch.nn as nn from accelerate.utils import gather, gather_object, is_peft_model, set_seed from transformers import PreTrainedModel -from transformers.utils.versions import require_version from trl import GRPOTrainer as HFGRPOTrainer from trl.models import unwrap_model_for_generation @@ -22,8 +21,12 @@ from ..mixin import SwiftMixin from .rlhf_mixin import RLHFTrainerMixin +try: + from trl.extras.profiling import profiling_decorator +except ImportError: + raise ImportError('Please install trl from source using: pip install git+https://github.com/huggingface/trl.git') + del HFGRPOTrainer.__init__ -del HFGRPOTrainer._prepare_inputs logger = get_logger() if is_wandb_available(): @@ -39,7 +42,6 @@ def __init__(self, reward_funcs: Optional[List[Union[str, Callable]]] = None, *_args, **kwargs): - require_version('trl>=0.15') args = kwargs['args'] self.processing_class = kwargs.get('template').tokenizer @@ -79,7 +81,7 @@ def __init__(self, self.num_generations = args.num_generations model.warnings_issued['estimate_tokens'] = True kwargs['data_collator'] = lambda features: features - self._metrics = defaultdict(list) + self._metrics = {'train': defaultdict(list), 'eval': defaultdict(list)} use_vllm = args.use_vllm use_lmdeploy = args.use_lmdeploy @@ -204,6 +206,15 @@ def __init__(self, self.log_completions = args.log_completions self.jsonl_writer = JsonlWriter(os.path.join(self.args.output_dir, 'completions.jsonl')) + # Multi-step + self.num_iterations = args.num_iterations # = 𝜇 in the GRPO paper + self.epsilon = args.epsilon + # Tracks the number of iterations (forward + backward passes), including those within a gradient accumulation cycle. # noqa + self._step = 0 + # Buffer the batch to reuse generated outputs across multiple updates. For more details, see + # `_get_train_sampler` and `_prepare_inputs`. 
+ self._buffered_inputs = [None] * args.gradient_accumulation_steps + @property def infer_rank(self): rank, local_rank, world_size, local_world_size = get_dist_setting() @@ -253,6 +264,7 @@ def _template_context(template): template.set_mode(mode) template.max_length = max_length + @profiling_decorator def _move_model_to_vllm_lmdeploy(self): from accelerate.utils.other import is_compiled_module with unwrap_model_for_generation( @@ -299,7 +311,9 @@ def reorder_outputs(outputs, distributed_idx): return [index_to_output[idx] for idx in sorted(index_to_output.keys())] - def _prepare_inputs(self, inputs) -> Dict[str, Union[torch.Tensor, Any]]: + def _generate_and_score_completions( + self, inputs: dict[str, Union[torch.Tensor, Any]]) -> dict[str, Union[torch.Tensor, Any]]: + device = self.accelerator.device rank, local_rank, world_size, local_world_size = get_dist_setting() # Generate completions using either vLLM or regular generation @@ -363,7 +377,14 @@ def _prepare_inputs(self, inputs) -> Dict[str, Union[torch.Tensor, Any]]: outputs['completion_mask'] = labels[:, -logits_to_keep:] != -100 with torch.inference_mode(): - if self.ref_model is not None: + if self.num_iterations > 1: + outputs['old_per_token_logps'] = self._get_per_token_logps(self.model, outputs) + else: + outputs['old_per_token_logps'] = None + + if self.beta == 0.0: + ref_per_token_logps = None + elif self.ref_model is not None: ref_per_token_logps = self._get_per_token_logps(self.ref_model, outputs) else: with self.accelerator.unwrap_model(self.model).disable_adapter(): @@ -401,6 +422,9 @@ def _prepare_inputs(self, inputs) -> Dict[str, Union[torch.Tensor, Any]]: advantages = advantages[process_slice] # Log the metrics + mode = 'eval' if self.control.should_evaluate else 'train' + completion_length = self.accelerator.gather_for_metrics(outputs['completion_mask'].sum(1)).float().mean().item() + self._metrics[mode]['completion_length'].append(completion_length) reward_per_func = rewards_per_func.mean(0) for i, reward_func in enumerate(self.reward_funcs): if isinstance(reward_func, nn.Module): # Module instead of PretrainedModel for compat with compiled models @@ -410,10 +434,10 @@ def _prepare_inputs(self, inputs) -> Dict[str, Union[torch.Tensor, Any]]: reward_func_name = reward_func.__name__ # function else: reward_func_name = reward_func.__class__.__name__ # method - self._metrics[f'rewards/{reward_func_name}'].append(reward_per_func[i].item()) + self._metrics[mode][f'rewards/{reward_func_name}'].append(reward_per_func[i].item()) - self._metrics['reward'].append(rewards.mean().item()) - self._metrics['reward_std'].append(std_grouped_rewards.mean().item()) + self._metrics[mode]['reward'].append(rewards.mean().item()) + self._metrics[mode]['reward_std'].append(std_grouped_rewards.mean().item()) outputs.update({ 'ref_per_token_logps': ref_per_token_logps, 'advantages': advantages, @@ -434,6 +458,7 @@ def _prepare_inputs(self, inputs) -> Dict[str, Union[torch.Tensor, Any]]: return outputs + @profiling_decorator def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): if return_outputs: raise ValueError('The GRPOTrainer does not support returning outputs') @@ -442,28 +467,38 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N per_token_logps = self._get_per_token_logps(model, inputs) # Compute the KL divergence between the model and the reference model - ref_per_token_logps = inputs['ref_per_token_logps'] - per_token_kl = torch.exp(ref_per_token_logps - per_token_logps) - 
(ref_per_token_logps - per_token_logps) - 1 + if self.beta != 0.0: + ref_per_token_logps = inputs['ref_per_token_logps'] + per_token_kl = ( + torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1) - # x - x.detach() allows for preserving gradients from x advantages = inputs['advantages'] - per_token_loss = torch.exp(per_token_logps - per_token_logps.detach()) * advantages.unsqueeze(1) - per_token_loss = -(per_token_loss - self.beta * per_token_kl) + old_per_token_logps = inputs['old_per_token_logps'] if self.num_iterations > 1 else per_token_logps.detach() + coef_1 = torch.exp(per_token_logps - old_per_token_logps) + coef_2 = torch.clamp(coef_1, 1 - self.epsilon, 1 + self.epsilon) + per_token_loss1 = coef_1 * advantages.unsqueeze(1) + per_token_loss2 = coef_2 * advantages.unsqueeze(1) + per_token_loss = -torch.min(per_token_loss1, per_token_loss2) + if self.beta != 0.0: + per_token_loss = per_token_loss + self.beta * per_token_kl loss = (per_token_loss * completion_mask).sum() / completion_mask.sum() # Log the metrics - completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item() - self._metrics['completion_length'].append(completion_length) + mode = 'eval' if self.control.should_evaluate else 'train' - mean_kl = ((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() - self._metrics['kl'].append(self.accelerator.gather_for_metrics(mean_kl).mean().item()) + if self.beta != 0.0: + mean_kl = ((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() + self._metrics[mode]['kl'].append(self.accelerator.gather_for_metrics(mean_kl).mean().item()) + is_clipped = (per_token_loss1 < per_token_loss2).float() + clip_ratio = (is_clipped * completion_mask).sum() / completion_mask.sum() + self._metrics[mode]['clip_ratio'].append(self.accelerator.gather_for_metrics(clip_ratio).mean().item()) return loss # Get the per-token log probabilities for the completions for the model and the reference model + @profiling_decorator def _get_per_token_logps(self, model, inputs): - # pip install trl>=0.15 from trl.trainer.utils import selective_log_softmax logits_to_keep = inputs['logits_to_keep'] input_ids = inputs['input_ids'] @@ -486,6 +521,6 @@ def _get_per_token_logps(self, model, inputs): def evaluation_loop(self, *args, **kwargs): metric_key_prefix = kwargs['metric_key_prefix'] output = super().evaluation_loop(*args, **kwargs) - metrics = {f'{metric_key_prefix}_{key}': sum(val) / len(val) for key, val in self._metrics.items()} + metrics = {f'{metric_key_prefix}_{key}': sum(val) / len(val) for key, val in self._metrics['eval'].items()} output.metrics.update(metrics) return output From f6009d6ab2daa02807a2dfdd79eab8097c5a4e5c Mon Sep 17 00:00:00 2001 From: Jintao Date: Fri, 21 Feb 2025 23:07:20 +0800 Subject: [PATCH 03/24] support r1 awq (#3206) --- README.md | 10 +++++----- README_CN.md | 8 ++++---- .../GetStarted/SWIFT\345\256\211\350\243\205.md" | 8 ++++---- ...345\222\214\346\225\260\346\215\256\351\233\206.md" | 7 +++++-- docs/source_en/GetStarted/SWIFT-installation.md | 8 ++++---- .../Instruction/Supported-models-and-datasets.md | 7 +++++-- examples/infer/vllm/mllm_ddp.sh | 6 +++--- swift/llm/dataset/data/dataset_info.json | 10 ++++++++-- swift/llm/model/model/deepseek.py | 6 ++++++ 9 files changed, 44 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 470961c7cd..530e6127df 100644 --- a/README.md +++ b/README.md @@ -114,12 +114,12 @@ Running Environment: | 
python | >=3.9 | 3.10 | | | cuda | | cuda12 | No need to install if using CPU, NPU, MPS | | torch | >=2.0 | | | -| transformers | >=4.33 | 4.48.3 | | +| transformers | >=4.33 | 4.49 | | | modelscope | >=1.19 | | | -| peft | >=0.11.0,<0.15.0 | | | -| trl | >=0.13,<0.17 | 0.15 | RLHF | -| deepspeed | >=0.14 | | Training | -| vllm | >=0.5.1 | 0.7.2 | Inference/Deployment/Evaluation | +| peft | >=0.11,<0.15 | || +| trl | >=0.13,<0.17 | 0.15 |RLHF| +| deepspeed | >=0.14 | 0.14.5 | Training | +| vllm | >=0.5.1 | 0.7.3 | Inference/Deployment/Evaluation | | lmdeploy | lmdeploy>=0.5,<0.6.5 | 0.6.4 | Inference/Deployment/Evaluation | | evalscope | | >=0.11 | Evaluation | diff --git a/README_CN.md b/README_CN.md index c98fc846d7..3f607ca8e2 100644 --- a/README_CN.md +++ b/README_CN.md @@ -109,12 +109,12 @@ pip install -e . | python | >=3.9 | 3.10 || | cuda | | cuda12 |使用cpu、npu、mps则无需安装| | torch | >=2.0 | || -| transformers | >=4.33 | 4.48.3 || +| transformers | >=4.33 | 4.49 || | modelscope | >=1.19 | || -| peft | >=0.11.0,<0.15.0 | || +| peft | >=0.11,<0.15 | || | trl | >=0.13,<0.17 | 0.15 |RLHF| -| deepspeed | >=0.14 | |训练| -| vllm | >=0.5.1 | 0.7.2 |推理/部署/评测| +| deepspeed | >=0.14 | 0.14.5 |训练| +| vllm | >=0.5.1 | 0.7.3 |推理/部署/评测| | lmdeploy | lmdeploy>=0.5,<0.6.5 | 0.6.4 |推理/部署/评测| | evalscope | | >=0.11 |评测| diff --git "a/docs/source/GetStarted/SWIFT\345\256\211\350\243\205.md" "b/docs/source/GetStarted/SWIFT\345\256\211\350\243\205.md" index 22149b3d95..f5bb6b1f8f 100644 --- "a/docs/source/GetStarted/SWIFT\345\256\211\350\243\205.md" +++ "b/docs/source/GetStarted/SWIFT\345\256\211\350\243\205.md" @@ -57,12 +57,12 @@ pip install ms-swift==2.* | python | >=3.9 | 3.10 || | cuda | | cuda12 |使用cpu、npu、mps则无需安装| | torch | >=2.0 | || -| transformers | >=4.33 | 4.48.3 || +| transformers | >=4.33 | 4.49 || | modelscope | >=1.19 | || -| peft | >=0.11.0,<0.15.0 | || +| peft | >=0.11,<0.15 | || | trl | >=0.13,<0.17 | 0.15 |RLHF| -| deepspeed | >=0.14 | |训练| -| vllm | >=0.5.1 | 0.7.2 |推理/部署/评测| +| deepspeed | >=0.14 | 0.14.5 |训练| +| vllm | >=0.5.1 | 0.7.3 |推理/部署/评测| | lmdeploy | lmdeploy>=0.5,<0.6.5 | 0.6.4 |推理/部署/评测| | evalscope | | >=0.11 |评测| diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 2244e5037e..69df4b8f1a 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -352,8 +352,10 @@ |[deepseek-ai/DeepSeek-V2.5-1210](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2.5-1210)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-V2.5-1210](https://huggingface.co/deepseek-ai/DeepSeek-V2.5-1210)| |[deepseek-ai/DeepSeek-V3-Base](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3-Base)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-V3-Base](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base)| |[deepseek-ai/DeepSeek-V3](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3)| 
+|[cognitivecomputations/DeepSeek-V3-awq](https://modelscope.cn/models/cognitivecomputations/DeepSeek-V3-awq)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|-|[cognitivecomputations/DeepSeek-V3-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-V3-AWQ)| |[deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1)|deepseek_r1|deepseek_r1|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)| |[deepseek-ai/DeepSeek-R1-Zero](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Zero)|deepseek_r1|deepseek_r1|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-R1-Zero](https://huggingface.co/deepseek-ai/DeepSeek-R1-Zero)| +|[cognitivecomputations/DeepSeek-R1-awq](https://modelscope.cn/models/cognitivecomputations/DeepSeek-R1-awq)|deepseek_r1|deepseek_r1|transformers>=4.39.3|-|[cognitivecomputations/DeepSeek-R1-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-R1-AWQ)| |[deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)|deepseek_r1_distill|deepseek_r1|transformers>=4.37|-|[deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)| |[deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B)|deepseek_r1_distill|deepseek_r1|transformers>=4.37|-|[deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B)| |[deepseek-ai/DeepSeek-R1-Distill-Qwen-14B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B)|deepseek_r1_distill|deepseek_r1|transformers>=4.37|-|[deepseek-ai/DeepSeek-R1-Distill-Qwen-14B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B)| @@ -704,7 +706,7 @@ | ---------- | ----------- | -------------| ------------------| ---- | ------------- | |[AI-MO/NuminaMath-1.5](https://modelscope.cn/datasets/AI-MO/NuminaMath-1.5)|default|896215|116.1±80.8, min=31, max=5064|grpo, math|[AI-MO/NuminaMath-1.5](https://huggingface.co/datasets/AI-MO/NuminaMath-1.5)| |[AI-MO/NuminaMath-CoT](https://modelscope.cn/datasets/AI-MO/NuminaMath-CoT)|default|859494|113.1±60.2, min=35, max=2120|grpo, math|[AI-MO/NuminaMath-CoT](https://huggingface.co/datasets/AI-MO/NuminaMath-CoT)| -|[AI-MO/NuminaMath-TIR](https://modelscope.cn/datasets/AI-MO/NuminaMath-TIR)|default|72441|100.9±52.2, min=36, max=1683|grpo, math|[AI-MO/NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR)| +|[AI-MO/NuminaMath-TIR](https://modelscope.cn/datasets/AI-MO/NuminaMath-TIR)|default|72441|100.9±52.2, min=36, max=1683|grpo, math, 🔥|[AI-MO/NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR)| |[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA)|chinese_traditional
coig_pc<br>exam<br>finance<br>douban<br>human_value<br>logi_qa<br>ruozhiba<br>segmentfault<br>wiki<br>wikihow<br>xhs
zhihu|44694|331.2±693.8, min=34, max=19288|general, 🔥|-| |[AI-ModelScope/CodeAlpaca-20k](https://modelscope.cn/datasets/AI-ModelScope/CodeAlpaca-20k)|default|20022|99.3±57.6, min=30, max=857|code, en|[HuggingFaceH4/CodeAlpaca_20K](https://huggingface.co/datasets/HuggingFaceH4/CodeAlpaca_20K)| |[AI-ModelScope/DISC-Law-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Law-SFT)|default|166758|1799.0±474.9, min=769, max=3151|chat, law, 🔥|[ShengbinYue/DISC-Law-SFT](https://huggingface.co/datasets/ShengbinYue/DISC-Law-SFT)| @@ -771,6 +773,7 @@ |[AI-ModelScope/wikipedia-cn-20230720-filtered](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered)|default|huge dataset|-|pretrain, quality|[pleisto/wikipedia-cn-20230720-filtered](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)| |[AI-ModelScope/zhihu_rlhf_3k](https://modelscope.cn/datasets/AI-ModelScope/zhihu_rlhf_3k)|default|3460|594.5±365.9, min=31, max=1716|rlhf, dpo, zh|[liyucheng/zhihu_rlhf_3k](https://huggingface.co/datasets/liyucheng/zhihu_rlhf_3k)| |[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd)|default
cls|45012|66.9±87.0, min=41, max=1699|text-generation, classification, 🔥|-|
+|[FreedomIntelligence/medical-o1-reasoning-SFT](https://modelscope.cn/datasets/FreedomIntelligence/medical-o1-reasoning-SFT)|en<br>zh|50143|98.0±53.6, min=36, max=1508|medical, o1, 🔥|[FreedomIntelligence/medical-o1-reasoning-SFT](https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT)|
 |-|default|huge dataset|-|pretrain, quality|[HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)|
 |-|auto_math_text<br>khanacademy<br>openstax<br>stanford<br>stories<br>web_samples_v1<br>web_samples_v2
wikihow|huge dataset|-|multi-domain, en, qa|[HuggingFaceTB/cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)| |[HumanLLMs/Human-Like-DPO-Dataset](https://modelscope.cn/datasets/HumanLLMs/Human-Like-DPO-Dataset)|default|10884|47.5±7.9, min=32, max=85|rlhf, dpo|[HumanLLMs/Human-Like-DPO-Dataset](https://huggingface.co/datasets/HumanLLMs/Human-Like-DPO-Dataset)| @@ -802,7 +805,7 @@ |[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro)|default|21910|1978.1±747.9, min=339, max=8064|chat, agent, multi-round, 🔥|-| |[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent)|default|30000|645.8±218.0, min=199, max=2070|chat, agent, multi-round, 🔥|-| |[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench)|default|316820|353.4±424.5, min=29, max=2924|chat, general, multi-round, 🔥|-| -|[liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://modelscope.cn/datasets/liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT)|default|110000|72.1±60.9, min=29, max=2315|chat, sft, cot, r1|[Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT)| +|[liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://modelscope.cn/datasets/liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT)|default|110000|72.1±60.9, min=29, max=2315|chat, sft, cot, r1, 🔥|[Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT)| |-|default|huge dataset|-|multi-modal, en, vqa, quality|[lmms-lab/GQA](https://huggingface.co/datasets/lmms-lab/GQA)| |-|0_30_s_academic_v0_1
0_30_s_youtube_v0_1<br>1_2_m_academic_v0_1<br>1_2_m_youtube_v0_1<br>2_3_m_academic_v0_1<br>2_3_m_youtube_v0_1<br>30_60_s_academic_v0_1
30_60_s_youtube_v0_1|1335486|273.7±78.8, min=107, max=638|chat, multi-modal, video|[lmms-lab/LLaVA-Video-178K](https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K)| |[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen)|default|97484|130.9±21.9, min=73, max=232|text-generation, 🔥|[shibing624/AdvertiseGen](https://huggingface.co/datasets/shibing624/AdvertiseGen)| diff --git a/docs/source_en/GetStarted/SWIFT-installation.md b/docs/source_en/GetStarted/SWIFT-installation.md index 95c7cc94ee..e225f3d689 100644 --- a/docs/source_en/GetStarted/SWIFT-installation.md +++ b/docs/source_en/GetStarted/SWIFT-installation.md @@ -58,12 +58,12 @@ You can view the image [here](https://modelscope.cn/docs/intro/environment-setup | python | >=3.9 | 3.10 | | | cuda | | cuda12 | No need to install if using CPU, NPU, MPS | | torch | >=2.0 | | | -| transformers | >=4.33 | 4.48.3 | | +| transformers | >=4.33 | 4.49 | | | modelscope | >=1.19 | | | -| peft | >=0.11.0,<0.15.0 | | | +| peft | >=0.11,<0.15 | | | | trl | >=0.13,<0.17 | 0.15 | RLHF | -| deepspeed | >=0.14 | | Training | -| vllm | >=0.5.1 | 0.7.2 | Inference/Deployment/Evaluation | +| deepspeed | >=0.14 | 0.14.5 | Training | +| vllm | >=0.5.1 | 0.7.3 | Inference/Deployment/Evaluation | | lmdeploy | lmdeploy>=0.5,<0.6.5 | 0.6.4 | Inference/Deployment/Evaluation | | evalscope | | >=0.11 | Evaluation | diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index 8d2470716e..b86d9eea0c 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -352,8 +352,10 @@ The table below introduces the models integrated with ms-swift: |[deepseek-ai/DeepSeek-V2.5-1210](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2.5-1210)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-V2.5-1210](https://huggingface.co/deepseek-ai/DeepSeek-V2.5-1210)| |[deepseek-ai/DeepSeek-V3-Base](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3-Base)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-V3-Base](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base)| |[deepseek-ai/DeepSeek-V3](https://modelscope.cn/models/deepseek-ai/DeepSeek-V3)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3)| +|[cognitivecomputations/DeepSeek-V3-awq](https://modelscope.cn/models/cognitivecomputations/DeepSeek-V3-awq)|deepseek_v2_5|deepseek_v2_5|transformers>=4.39.3|-|[cognitivecomputations/DeepSeek-V3-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-V3-AWQ)| |[deepseek-ai/DeepSeek-R1](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1)|deepseek_r1|deepseek_r1|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)| |[deepseek-ai/DeepSeek-R1-Zero](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Zero)|deepseek_r1|deepseek_r1|transformers>=4.39.3|-|[deepseek-ai/DeepSeek-R1-Zero](https://huggingface.co/deepseek-ai/DeepSeek-R1-Zero)| +|[cognitivecomputations/DeepSeek-R1-awq](https://modelscope.cn/models/cognitivecomputations/DeepSeek-R1-awq)|deepseek_r1|deepseek_r1|transformers>=4.39.3|-|[cognitivecomputations/DeepSeek-R1-AWQ](https://huggingface.co/cognitivecomputations/DeepSeek-R1-AWQ)| 
|[deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)|deepseek_r1_distill|deepseek_r1|transformers>=4.37|-|[deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)| |[deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B)|deepseek_r1_distill|deepseek_r1|transformers>=4.37|-|[deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B)| |[deepseek-ai/DeepSeek-R1-Distill-Qwen-14B](https://modelscope.cn/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B)|deepseek_r1_distill|deepseek_r1|transformers>=4.37|-|[deepseek-ai/DeepSeek-R1-Distill-Qwen-14B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B)| @@ -705,7 +707,7 @@ The table below introduces information about the datasets integrated with ms-swi | ---------- | ----------- | -------------| ------------------| ---- | ------------- | |[AI-MO/NuminaMath-1.5](https://modelscope.cn/datasets/AI-MO/NuminaMath-1.5)|default|896215|116.1±80.8, min=31, max=5064|grpo, math|[AI-MO/NuminaMath-1.5](https://huggingface.co/datasets/AI-MO/NuminaMath-1.5)| |[AI-MO/NuminaMath-CoT](https://modelscope.cn/datasets/AI-MO/NuminaMath-CoT)|default|859494|113.1±60.2, min=35, max=2120|grpo, math|[AI-MO/NuminaMath-CoT](https://huggingface.co/datasets/AI-MO/NuminaMath-CoT)| -|[AI-MO/NuminaMath-TIR](https://modelscope.cn/datasets/AI-MO/NuminaMath-TIR)|default|72441|100.9±52.2, min=36, max=1683|grpo, math|[AI-MO/NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR)| +|[AI-MO/NuminaMath-TIR](https://modelscope.cn/datasets/AI-MO/NuminaMath-TIR)|default|72441|100.9±52.2, min=36, max=1683|grpo, math, 🔥|[AI-MO/NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR)| |[AI-ModelScope/COIG-CQIA](https://modelscope.cn/datasets/AI-ModelScope/COIG-CQIA)|chinese_traditional
coig_pc<br>exam<br>finance<br>douban<br>human_value<br>logi_qa<br>ruozhiba<br>segmentfault<br>wiki<br>wikihow<br>xhs
zhihu|44694|331.2±693.8, min=34, max=19288|general, 🔥|-| |[AI-ModelScope/CodeAlpaca-20k](https://modelscope.cn/datasets/AI-ModelScope/CodeAlpaca-20k)|default|20022|99.3±57.6, min=30, max=857|code, en|[HuggingFaceH4/CodeAlpaca_20K](https://huggingface.co/datasets/HuggingFaceH4/CodeAlpaca_20K)| |[AI-ModelScope/DISC-Law-SFT](https://modelscope.cn/datasets/AI-ModelScope/DISC-Law-SFT)|default|166758|1799.0±474.9, min=769, max=3151|chat, law, 🔥|[ShengbinYue/DISC-Law-SFT](https://huggingface.co/datasets/ShengbinYue/DISC-Law-SFT)| @@ -772,6 +774,7 @@ The table below introduces information about the datasets integrated with ms-swi |[AI-ModelScope/wikipedia-cn-20230720-filtered](https://modelscope.cn/datasets/AI-ModelScope/wikipedia-cn-20230720-filtered)|default|huge dataset|-|pretrain, quality|[pleisto/wikipedia-cn-20230720-filtered](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)| |[AI-ModelScope/zhihu_rlhf_3k](https://modelscope.cn/datasets/AI-ModelScope/zhihu_rlhf_3k)|default|3460|594.5±365.9, min=31, max=1716|rlhf, dpo, zh|[liyucheng/zhihu_rlhf_3k](https://huggingface.co/datasets/liyucheng/zhihu_rlhf_3k)| |[DAMO_NLP/jd](https://modelscope.cn/datasets/DAMO_NLP/jd)|default
cls|45012|66.9±87.0, min=41, max=1699|text-generation, classification, 🔥|-|
+|[FreedomIntelligence/medical-o1-reasoning-SFT](https://modelscope.cn/datasets/FreedomIntelligence/medical-o1-reasoning-SFT)|en<br>zh|50143|98.0±53.6, min=36, max=1508|medical, o1, 🔥|[FreedomIntelligence/medical-o1-reasoning-SFT](https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT)|
 |-|default|huge dataset|-|pretrain, quality|[HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb)|
 |-|auto_math_text<br>khanacademy<br>openstax<br>stanford<br>stories<br>web_samples_v1<br>web_samples_v2
wikihow|huge dataset|-|multi-domain, en, qa|[HuggingFaceTB/cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)| |[HumanLLMs/Human-Like-DPO-Dataset](https://modelscope.cn/datasets/HumanLLMs/Human-Like-DPO-Dataset)|default|10884|47.5±7.9, min=32, max=85|rlhf, dpo|[HumanLLMs/Human-Like-DPO-Dataset](https://huggingface.co/datasets/HumanLLMs/Human-Like-DPO-Dataset)| @@ -803,7 +806,7 @@ The table below introduces information about the datasets integrated with ms-swi |[iic/MSAgent-Pro](https://modelscope.cn/datasets/iic/MSAgent-Pro)|default|21910|1978.1±747.9, min=339, max=8064|chat, agent, multi-round, 🔥|-| |[iic/ms_agent](https://modelscope.cn/datasets/iic/ms_agent)|default|30000|645.8±218.0, min=199, max=2070|chat, agent, multi-round, 🔥|-| |[iic/ms_bench](https://modelscope.cn/datasets/iic/ms_bench)|default|316820|353.4±424.5, min=29, max=2924|chat, general, multi-round, 🔥|-| -|[liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://modelscope.cn/datasets/liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT)|default|110000|72.1±60.9, min=29, max=2315|chat, sft, cot, r1|[Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT)| +|[liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://modelscope.cn/datasets/liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT)|default|110000|72.1±60.9, min=29, max=2315|chat, sft, cot, r1, 🔥|[Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT)| |-|default|huge dataset|-|multi-modal, en, vqa, quality|[lmms-lab/GQA](https://huggingface.co/datasets/lmms-lab/GQA)| |-|0_30_s_academic_v0_1
0_30_s_youtube_v0_1<br>1_2_m_academic_v0_1<br>1_2_m_youtube_v0_1<br>2_3_m_academic_v0_1<br>2_3_m_youtube_v0_1<br>30_60_s_academic_v0_1
30_60_s_youtube_v0_1|1335486|273.7±78.8, min=107, max=638|chat, multi-modal, video|[lmms-lab/LLaVA-Video-178K](https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K)| |[lvjianjin/AdvertiseGen](https://modelscope.cn/datasets/lvjianjin/AdvertiseGen)|default|97484|130.9±21.9, min=73, max=232|text-generation, 🔥|[shibing624/AdvertiseGen](https://huggingface.co/datasets/shibing624/AdvertiseGen)| diff --git a/examples/infer/vllm/mllm_ddp.sh b/examples/infer/vllm/mllm_ddp.sh index 64b865253b..6b673bf666 100644 --- a/examples/infer/vllm/mllm_ddp.sh +++ b/examples/infer/vllm/mllm_ddp.sh @@ -3,10 +3,10 @@ NPROC_PER_NODE=2 \ CUDA_VISIBLE_DEVICES=0,1 \ MAX_PIXELS=1003520 \ swift infer \ - --model Qwen/Qwen2-VL-2B-Instruct \ + --model Qwen/Qwen2-Audio-7B-Instruct \ --infer_backend vllm \ - --val_dataset AI-ModelScope/LaTeX_OCR#1000 \ + --val_dataset speech_asr/speech_asr_aishell1_trainsets:validation#1000 \ --gpu_memory_utilization 0.9 \ --max_model_len 8192 \ --max_new_tokens 2048 \ - --limit_mm_per_prompt '{"image": 5, "video": 2}' + --limit_mm_per_prompt '{"audio": 5}' diff --git a/swift/llm/dataset/data/dataset_info.json b/swift/llm/dataset/data/dataset_info.json index 6a708d42b2..d0c1442d1f 100644 --- a/swift/llm/dataset/data/dataset_info.json +++ b/swift/llm/dataset/data/dataset_info.json @@ -650,7 +650,7 @@ { "ms_dataset_id": "liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT", "hf_dataset_id": "Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT", - "tags": ["chat", "sft", "cot", "r1"] + "tags": ["chat", "sft", "cot", "r1", "🔥"] }, { "ms_dataset_id": "AI-MO/NuminaMath-CoT", @@ -660,11 +660,17 @@ { "ms_dataset_id": "AI-MO/NuminaMath-TIR", "hf_dataset_id": "AI-MO/NuminaMath-TIR", - "tags": ["grpo", "math"] + "tags": ["grpo", "math", "🔥"] }, { "ms_dataset_id": "AI-MO/NuminaMath-1.5", "hf_dataset_id": "AI-MO/NuminaMath-1.5", "tags": ["grpo", "math"] + }, + { + "ms_dataset_id": "FreedomIntelligence/medical-o1-reasoning-SFT", + "hf_dataset_id": "FreedomIntelligence/medical-o1-reasoning-SFT", + "subsets": ["en", "zh"], + "tags": ["medical", "o1", "🔥"] } ] diff --git a/swift/llm/model/model/deepseek.py b/swift/llm/model/model/deepseek.py index 05cedd93a0..cd459e4d5f 100644 --- a/swift/llm/model/model/deepseek.py +++ b/swift/llm/model/model/deepseek.py @@ -108,6 +108,9 @@ def get_model_tokenizer_deepseek_moe(model_dir: str, Model('deepseek-ai/DeepSeek-V3-Base', 'deepseek-ai/DeepSeek-V3-Base'), Model('deepseek-ai/DeepSeek-V3', 'deepseek-ai/DeepSeek-V3'), ]), + ModelGroup([ + Model('cognitivecomputations/DeepSeek-V3-awq', 'cognitivecomputations/DeepSeek-V3-AWQ'), + ]) ], TemplateType.deepseek_v2_5, get_model_tokenizer_deepseek_moe, @@ -246,6 +249,9 @@ def get_model_tokenizer_deepseek_vl2(model_dir: str, *args, **kwargs): Model('deepseek-ai/DeepSeek-R1', 'deepseek-ai/DeepSeek-R1'), Model('deepseek-ai/DeepSeek-R1-Zero', 'deepseek-ai/DeepSeek-R1-Zero'), ]), + ModelGroup([ + Model('cognitivecomputations/DeepSeek-R1-awq', 'cognitivecomputations/DeepSeek-R1-AWQ'), + ]) ], TemplateType.deepseek_r1, get_model_tokenizer_deepseek_moe, From 4ae1895840c27f8b8c32ce0bb02ceea45f539fde Mon Sep 17 00:00:00 2001 From: jinghanhu Date: Fri, 21 Feb 2025 23:13:39 +0800 Subject: [PATCH 04/24] fix qwenvl grpo(#3220) Co-authored-by: hjh --- swift/trainers/rlhf_trainer/grpo_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index 666ddc3c8c..5fa5bd5aa2 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ 
b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -509,8 +509,8 @@ def _get_per_token_logps(self, model, inputs): return super()._get_per_token_logps(model, input_ids, inputs['attention_mask'], logits_to_keep) inputs = { k: v - for k, v in inputs.items() - if k not in ['logits_to_keep', 'completion_mask', 'ref_per_token_logps', 'advantages'] + for k, v in inputs.items() if k not in + ['logits_to_keep', 'completion_mask', 'ref_per_token_logps', 'advantages', 'old_per_token_logps'] } logits = model(**inputs).logits # exclude the last logit: it corresponds to the next token pred From 0a327e5248b18e88496cf7284b7049382b3b37d8 Mon Sep 17 00:00:00 2001 From: DaozeZhang <58100690+DaozeZhang@users.noreply.github.com> Date: Sat, 22 Feb 2025 00:57:43 +0800 Subject: [PATCH 05/24] Support the generation of JanusPro models (#3218) * add transformers in gitignore * fix a typo bug in text-caps * add .run into gitignore * add vlmeval to gitignore * add my_model/ to gitignore * support generation using Janus Pro * update comment * change some var name, add test_gene.py * change format --- swift/llm/template/base.py | 16 +- swift/llm/template/template/deepseek.py | 195 ++++++++++++++++---- tests/test_align/test_template/test_gene.py | 32 ++++ 3 files changed, 207 insertions(+), 36 deletions(-) create mode 100644 tests/test_align/test_template/test_gene.py diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py index 22f24b9de3..bddfd3c5df 100644 --- a/swift/llm/template/base.py +++ b/swift/llm/template/base.py @@ -33,7 +33,7 @@ class MaxLengthError(ValueError): class Template(ProcessorMixin): - special_tokens = ['', '