refactor patcher (#3124)
Jintao-Huang authored Feb 16, 2025
1 parent 3c712c9 commit 7a84d44
Showing 17 changed files with 133 additions and 102 deletions.
3 changes: 3 additions & 0 deletions docs/source/Instruction/支持的模型和数据集.md
@@ -529,6 +529,9 @@
|[Qwen/Qwen2.5-VL-3B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-3B-Instruct)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)|
|[Qwen/Qwen2.5-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)|
|[Qwen/Qwen2.5-VL-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-72B-Instruct)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct)|
|[Qwen/Qwen2.5-VL-3B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-VL-3B-Instruct-AWQ)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/Qwen2.5-VL-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct-AWQ)|
|[Qwen/Qwen2.5-VL-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct-AWQ)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/Qwen2.5-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct-AWQ)|
|[Qwen/Qwen2.5-VL-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-VL-72B-Instruct-AWQ)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/Qwen2.5-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct-AWQ)|
|[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
|[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
|[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)|
3 changes: 3 additions & 0 deletions docs/source_en/Instruction/Supported-models-and-datasets.md
@@ -529,6 +529,9 @@ The table below introduces the models integrated with ms-swift:
|[Qwen/Qwen2.5-VL-3B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-3B-Instruct)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)|
|[Qwen/Qwen2.5-VL-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)|
|[Qwen/Qwen2.5-VL-72B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-VL-72B-Instruct)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct)|
|[Qwen/Qwen2.5-VL-3B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-VL-3B-Instruct-AWQ)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/Qwen2.5-VL-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct-AWQ)|
|[Qwen/Qwen2.5-VL-7B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct-AWQ)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/Qwen2.5-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct-AWQ)|
|[Qwen/Qwen2.5-VL-72B-Instruct-AWQ](https://modelscope.cn/models/Qwen/Qwen2.5-VL-72B-Instruct-AWQ)|qwen2_5_vl|qwen2_5_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/Qwen2.5-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct-AWQ)|
|[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
|[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
|[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)|
13 changes: 10 additions & 3 deletions examples/train/multi-gpu/ddp/train.sh
@@ -10,14 +10,21 @@ swift sft \
--dataset 'swift/self-cognition#1000' \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--learning_rate 1e-4 \
--target_modules all-linear \
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5 \
--gradient_checkpointing_kwargs '{"use_reentrant": false}' \
--max_length 2048 \
--output_dir output \
--system 'You are a helpful assistant.' \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--model_author swift \
--model_name swift-robot
--model_name swift-robot \
--gradient_checkpointing_kwargs '{"use_reentrant": false}'
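The `$(expr 16 / $nproc_per_node)` expression in these scripts keeps the effective batch size fixed at 16 no matter how many GPUs launch: the per-device batch times the number of processes times the accumulation steps is constant. A small sanity check of that arithmetic (this helper is illustrative only, not part of the scripts):

```python
# Effective batch = per_device_train_batch_size * nproc_per_node * grad_accum_steps.
def grad_accum_steps(target_batch: int, per_device_batch: int, nproc_per_node: int) -> int:
    # Mirrors the shell expression $(expr 16 / $nproc_per_node) when per_device_batch == 1.
    return target_batch // (per_device_batch * nproc_per_node)

for nproc in (1, 2, 4, 8):
    steps = grad_accum_steps(16, 1, nproc)
    assert 1 * nproc * steps == 16  # effective batch size stays at 16
```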
14 changes: 10 additions & 4 deletions examples/train/multi-gpu/ddp_device_map/train.sh
@@ -10,15 +10,21 @@ swift sft \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--weight_decay 0.1 \
--learning_rate 1e-4 \
--target_modules all-linear \
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5 \
--gradient_checkpointing_kwargs '{"use_reentrant": false}' \
--max_length 2048 \
--output_dir output \
--system 'You are a helpful assistant.' \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--model_author swift \
--model_name swift-robot
--model_name swift-robot \
--gradient_checkpointing_kwargs '{"use_reentrant": false}'
9 changes: 8 additions & 1 deletion examples/train/multi-gpu/deepspeed/train_zero2.sh
@@ -10,14 +10,21 @@ swift sft \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--learning_rate 1e-4 \
--target_modules all-linear \
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--system 'You are a helpful assistant.' \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--model_author swift \
--model_name swift-robot \
--deepspeed zero2
13 changes: 10 additions & 3 deletions examples/train/multi-gpu/deepspeed/train_zero3.sh
@@ -7,17 +7,24 @@ swift sft \
--model Qwen/Qwen2.5-7B-Instruct \
--train_type lora \
--dataset 'swift/self-cognition#1000' \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--learning_rate 1e-4 \
--target_modules all-linear \
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--system 'You are a helpful assistant.' \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--model_author swift \
--model_name swift-robot \
--deepspeed zero3 \
--max_length 1024
--deepspeed zero3
17 changes: 12 additions & 5 deletions examples/train/multi-gpu/fsdp_qlora/train.sh
@@ -7,21 +7,28 @@ accelerate launch --config_file "./examples/train/fsdp_qlora/fsdp_offload.json"
--model Qwen/Qwen2.5-7B-Instruct \
--train_type lora \
--dataset 'swift/self-cognition#1000' \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--max_length 2048 \
--per_device_eval_batch_size 1 \
--quant_bits 4 \
--bnb_4bit_compute_dtype bfloat16 \
--bnb_4bit_quant_storage bfloat16 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--gradient_checkpointing true \
--weight_decay 0.1 \
--learning_rate 1e-4 \
--target_modules all-linear \
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
--eval_steps 50 \
--save_steps 50 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 10 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--system 'You are a helpful assistant.' \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--model_author swift \
--model_name swift-robot
21 changes: 5 additions & 16 deletions swift/llm/__init__.py
@@ -25,7 +25,7 @@
from .dataset import (AlpacaPreprocessor, ResponsePreprocessor, MessagesPreprocessor, AutoPreprocessor,
DATASET_MAPPING, MediaResource, register_dataset, register_dataset_info, EncodePreprocessor,
LazyLLMDataset, ConstantLengthDataset, load_dataset, DATASET_TYPE, sample_dataset,
RowPreprocessor, DatasetMeta)
RowPreprocessor, DatasetMeta, HfDataset, SubsetDataset)
from .utils import (deep_getattr, to_device, History, Messages, history_to_messages, messages_to_history, Processor,
save_checkpoint, ProcessorMixin, get_temporary_cache_files_directory, get_cache_dir)
from .base import SwiftPipeline
@@ -59,21 +59,10 @@
'load_by_unsloth', 'git_clone_github', 'get_matched_model_meta'
],
'dataset': [
'AlpacaPreprocessor',
'MessagesPreprocessor',
'DATASET_MAPPING',
'MediaResource',
'register_dataset',
'register_dataset_info',
'EncodePreprocessor',
'LazyLLMDataset',
'ConstantLengthDataset',
'load_dataset',
'DATASET_TYPE',
'sample_dataset',
'RowPreprocessor',
'ResponsePreprocessor',
'DatasetMeta',
'AlpacaPreprocessor', 'MessagesPreprocessor', 'DATASET_MAPPING', 'MediaResource', 'register_dataset',
'register_dataset_info', 'EncodePreprocessor', 'LazyLLMDataset', 'ConstantLengthDataset', 'load_dataset',
'DATASET_TYPE', 'sample_dataset', 'RowPreprocessor', 'ResponsePreprocessor', 'DatasetMeta', 'HfDataset',
'SubsetDataset'
],
'utils': [
'deep_getattr', 'to_device', 'History', 'Messages', 'history_to_messages', 'messages_to_history',
3 changes: 2 additions & 1 deletion swift/llm/dataset/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import datasets.fingerprint
from datasets import Dataset as HfDataset
from datasets import disable_caching

from swift.utils.torch_utils import _find_local_mac
@@ -9,7 +10,7 @@
from .media import MediaResource
from .preprocessor import (AlpacaPreprocessor, AutoPreprocessor, MessagesPreprocessor, ResponsePreprocessor,
RowPreprocessor)
from .register import DATASET_MAPPING, DatasetMeta, register_dataset, register_dataset_info
from .register import DATASET_MAPPING, DatasetMeta, SubsetDataset, register_dataset, register_dataset_info
from .utils import (ConstantLengthDataset, EncodePreprocessor, GetLengthPreprocessor, LazyLLMDataset,
PackingPreprocessor, sample_dataset)

7 changes: 6 additions & 1 deletion swift/llm/model/model/qwen.py
@@ -592,7 +592,12 @@ def get_model_tokenizer_qwen2_5_vl(*args, **kwargs):
Model('Qwen/Qwen2.5-VL-3B-Instruct', 'Qwen/Qwen2.5-VL-3B-Instruct'),
Model('Qwen/Qwen2.5-VL-7B-Instruct', 'Qwen/Qwen2.5-VL-7B-Instruct'),
Model('Qwen/Qwen2.5-VL-72B-Instruct', 'Qwen/Qwen2.5-VL-72B-Instruct'),
])
]),
ModelGroup([
Model('Qwen/Qwen2.5-VL-3B-Instruct-AWQ', 'Qwen/Qwen2.5-VL-3B-Instruct-AWQ'),
Model('Qwen/Qwen2.5-VL-7B-Instruct-AWQ', 'Qwen/Qwen2.5-VL-7B-Instruct-AWQ'),
Model('Qwen/Qwen2.5-VL-72B-Instruct-AWQ', 'Qwen/Qwen2.5-VL-72B-Instruct-AWQ'),
]),
],
TemplateType.qwen2_5_vl,
get_model_tokenizer_qwen2_5_vl,
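The AWQ checkpoints are registered as their own `ModelGroup` inside the existing `qwen2_5_vl` entry, so they inherit the group's template and loader. The registry shape is roughly a nested record of model-id pairs (a simplified sketch with hypothetical dataclasses, not swift's real classes):

```python
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Model:
    ms_model_id: str                    # ModelScope id
    hf_model_id: Optional[str] = None   # Hugging Face mirror id

@dataclass
class ModelGroup:
    models: List[Model] = field(default_factory=list)

# The new group added by this commit, in sketch form:
awq_group = ModelGroup([
    Model('Qwen/Qwen2.5-VL-3B-Instruct-AWQ', 'Qwen/Qwen2.5-VL-3B-Instruct-AWQ'),
    Model('Qwen/Qwen2.5-VL-7B-Instruct-AWQ', 'Qwen/Qwen2.5-VL-7B-Instruct-AWQ'),
    Model('Qwen/Qwen2.5-VL-72B-Instruct-AWQ', 'Qwen/Qwen2.5-VL-72B-Instruct-AWQ'),
])
```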
52 changes: 49 additions & 3 deletions swift/llm/model/patcher.py
@@ -2,18 +2,22 @@
from contextlib import contextmanager
from functools import wraps
from types import MethodType
from typing import List
from typing import Dict, List, Optional, Union

import accelerate
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from accelerate.utils import find_device
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import PreTrainedModel
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import PreTrainedModel, trainer
from transformers.modeling_outputs import SequenceClassifierOutputWithPast

from swift.llm import to_device
from swift.utils import get_logger
from swift.utils import get_dist_setting, get_logger, is_mp_ddp, use_torchacc
from swift.utils.torch_utils import _get_max_memory, _sync_max_memory, get_device_count
from .model_arch import get_model_arch
from .utils import HfConfigFactory

@@ -234,3 +238,45 @@ def _new_from_pretrained(cls, *args, **kwargs):
yield
finally:
PreTrainedModel.from_pretrained = classmethod(from_pretrained)


_mp_ddp_patched = False


def patch_mp_ddp():
"""Patch ddp with device_map.
After patching, the ddp can run with the device_map.
This should be called before any training starts.
"""
global _mp_ddp_patched
if is_mp_ddp() and not _mp_ddp_patched:
_mp_ddp_patched = True
from accelerate.utils.modeling import get_balanced_memory, infer_auto_device_map

@wraps(infer_auto_device_map)
def _infer_auto_device_map_patch(model: nn.Module,
max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None,
**kwargs) -> Dict[str, Union[int, str, torch.device]]:
"""The auxiliary function for supports MP + DDP. Monkey Patching.
add feat in accelerate to support MP + DDP"""
verbose = kwargs.pop('verbose', False)
n_gpu = get_device_count()
_, local_rank, _, local_world_size = get_dist_setting()
device_ids = list(range(local_rank, n_gpu, local_world_size))
max_memory = _get_max_memory(device_ids)
max_memory = _sync_max_memory(max_memory)
max_memory = get_balanced_memory(model, max_memory, low_zero=False, **kwargs)
max_memory = {k: v for k, v in max_memory.items() if v > 0}
return infer_auto_device_map(model, max_memory, verbose=verbose, **kwargs)

_old_ddp_init = DDP.__init__
accelerate.accelerator.torch.nn.parallel.DistributedDataParallel.__init__ = (
lambda self, model, device_ids, output_device, *args, **kwargs: _old_ddp_init(self, model, *args, **kwargs))
transformers.modeling_utils.get_balanced_memory = lambda *args, **kwargs: None
transformers.modeling_utils.infer_auto_device_map = _infer_auto_device_map_patch

if is_mp_ddp() or use_torchacc():
_old_accelerator_init = trainer.Accelerator.__init__
trainer.Accelerator.__init__ = (lambda self, device_placement=False, *args, **kwargs: _old_accelerator_init(
self, device_placement=device_placement, *args, **kwargs))
trainer.Accelerator.verify_device_map = lambda *args, **kwargs: False
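The key line in the patched `infer_auto_device_map` is `device_ids = list(range(local_rank, n_gpu, local_world_size))`: each DDP rank claims an interleaved slice of the visible GPUs, and the model-parallel replica for that rank is then balanced across only those devices. The striding is plain Python and can be checked in isolation (the helper name below is ours, not swift's):

```python
from typing import List

def rank_device_ids(local_rank: int, n_gpu: int, local_world_size: int) -> List[int]:
    # Each rank takes every local_world_size-th GPU starting at its own rank,
    # so the ranks partition the GPUs without overlap and each replica can be
    # spread (model-parallel) over its slice.
    return list(range(local_rank, n_gpu, local_world_size))

# 8 GPUs shared by 2 ranks: rank 0 -> [0, 2, 4, 6], rank 1 -> [1, 3, 5, 7]
```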
4 changes: 2 additions & 2 deletions swift/llm/model/register.py
@@ -21,7 +21,7 @@

from swift.utils import get_dist_setting, get_logger, is_mp, is_unsloth_available, patch_getattr, use_torchacc
from .constant import ModelType
from .patcher import patch_automodel_for_awq, patch_automodel_for_sequence_classification
from .patcher import patch_automodel_for_awq, patch_automodel_for_sequence_classification, patch_mp_ddp
from .utils import AttnImpl, HfConfigFactory, ModelInfo, safe_snapshot_download

GetModelTokenizerFunction = Callable[..., Tuple[Optional[PreTrainedModel], PreTrainedTokenizerBase]]
@@ -468,7 +468,7 @@ def get_model_tokenizer(
If set to None : It will be automatically selected between sdpa and eager.
download_model: Whether to download the model weights. If `None`, it will be selected based on load_model.
"""

patch_mp_ddp()
if model_kwargs is None:
model_kwargs = {}
if download_model is None:
1 change: 0 additions & 1 deletion swift/llm/train/__init__.py
@@ -1,5 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from . import patcher
from .pt import SwiftPt, pt_main
from .rlhf import SwiftRLHF, rlhf_main
from .sft import SwiftSft, sft_main
55 changes: 0 additions & 55 deletions swift/llm/train/patcher.py

This file was deleted.

2 changes: 1 addition & 1 deletion swift/llm/train/rlhf.py
@@ -54,7 +54,7 @@ def _prepare_model_tokenizer(self):
self.train_msg['value_model_parameter_info'] = model_parameter_info
logger.info(f'value_model_parameter_info: {model_parameter_info}')
setattr(self, f'{origin_key}_model', model)
if origin_key == 'reward':
if origin_key == 'reward' and args.rlhf_type == 'grpo':
reward_template = self.args.get_template(processor)
if reward_template.use_model:
reward_template.model = model