diff --git a/training/benchmarks/llama1_7B/paddle/model/__init__.py b/training/benchmarks/llama1_7B/paddle/model/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/training/benchmarks/llama1_7B/paddle/model/models/modeling_pp.py b/training/benchmarks/llama1_7B/paddle/model/models/modeling_pp.py deleted file mode 100644 index 9a1969903..000000000 --- a/training/benchmarks/llama1_7B/paddle/model/models/modeling_pp.py +++ /dev/null @@ -1,263 +0,0 @@ -import paddle -import paddle.distributed.fleet as fleet -import paddle.nn as nn -from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer - -from paddlenlp.transformers import PretrainedModel -from paddlenlp.transformers.llama.modeling import ( - LlamaConfig, - LlamaDecoderLayer, - LlamaLMHead, - LlamaModel, - LlamaPretrainedModel, - LlamaPretrainingCriterion, - LlamaRMSNorm, -) - - -def get_hcg(): - return fleet.get_hybrid_communicate_group() - - -def parse_args(args): - if isinstance(args, tuple): - if len(args) == 3: - hidden_states, attention_mask, position_ids = args - elif len(args) == 2: - hidden_states, attention_mask = args - position_ids = None - else: - hidden_states = args - attention_mask, position_ids = None, None - - if position_ids is not None: - position_ids.stop_gradient = True - - if attention_mask is not None: - attention_mask.stop_gradient = True - - return hidden_states, attention_mask, position_ids - - -def return_args(hidden_states, attention_mask=None, position_ids=None): - ret = (hidden_states,) - - if attention_mask is not None: - ret += (attention_mask.clone(),) - if position_ids is not None: - ret += (position_ids.clone(),) - if len(ret) == 1: - ret = ret[0] - - return ret - - -class LlamaEmbeddingPipe(nn.Layer): - """Extends LlamaEmbeddings to forward attention_mask through the pipeline.""" - - def __init__(self, config): - super(LlamaEmbeddingPipe, self).__init__() - self.sequence_parallel = config.sequence_parallel - self.hidden_size = config.hidden_size - if config.tensor_parallel_degree > 1: - self.embed_tokens = fleet.meta_parallel.VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), - ) - else: - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) - - def forward(self, args): - """_summary_ - - Args: - input (_type_): _description_ - - Returns: - _type_: _description_ - """ - input_ids, attention_mask, position_ids = parse_args(args) - input_embeds = self.embed_tokens(input_ids) - if self.sequence_parallel: - from paddlenlp.transformers import ScatterOp - - # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] - bs, seq_len, hidden_size = input_embeds.shape - input_embeds = paddle.reshape_(input_embeds, [bs * seq_len, hidden_size]) - # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) - input_embeds = ScatterOp.apply(input_embeds) - - batch_size, seq_length = input_ids.shape - if attention_mask is not None: - attention_mask = LlamaModel._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), 0, input_embeds.dtype - ) - attention_mask.stop_gradient = True - - return return_args(input_embeds, attention_mask, position_ids) - - -class LlamaDecoderLayerPipe(LlamaDecoderLayer): - def forward(self, args): - hidden_states, attention_mask, position_ids = parse_args(args) - hidden_states = super().forward(hidden_states, attention_mask=attention_mask) - return return_args(hidden_states, attention_mask, position_ids) 
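# ---------------------------------------------------------------------------
# Side note on the *Pipe wrappers above: a pipeline stage receives a single
# object, so optional tensors are packed into a tuple and re-parsed at every
# stage. Below is a minimal, Paddle-free sketch of that convention;
# _ToyDecoderStage is a hypothetical stand-in, not a PaddleNLP API.

def _sketch_parse_args(args):
    # Unpack (hidden_states, attention_mask, position_ids); missing items -> None.
    if isinstance(args, tuple):
        if len(args) == 3:
            return args
        if len(args) == 2:
            return args[0], args[1], None
    return args, None, None

def _sketch_return_args(hidden_states, attention_mask=None, position_ids=None):
    # Repack only the pieces that exist so the next stage sees the same layout.
    ret = (hidden_states,)
    if attention_mask is not None:
        ret += (attention_mask,)
    if position_ids is not None:
        ret += (position_ids,)
    return ret[0] if len(ret) == 1 else ret

class _ToyDecoderStage:
    def __call__(self, args):
        hidden, mask, pos = _sketch_parse_args(args)
        return _sketch_return_args(hidden + 1, mask, pos)  # stand-in computation

_args = (0, "mask")                                  # hidden_states plus a mask
for _stage in (_ToyDecoderStage(), _ToyDecoderStage()):
    _args = _stage(_args)
assert _sketch_parse_args(_args) == (2, "mask", None)
# ---------------------------------------------------------------------------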
- - -class LlamaRMSNormPipe(LlamaRMSNorm): - def forward(self, args): - hidden_states, attention_mask, position_ids = parse_args(args) - return super().forward(hidden_states) - - -class PipelinePretrainedModel(PretrainedModel): - _sequential_layers = [] - _pipeline_name_mapping = None - - def __init__(self, config, *args, **kwargs): - super().__init__(config, *args, **kwargs) - - def add_sequential_layer(self, layer_desc, name_prefix=""): - self._sequential_layers.append({"layer": layer_desc, "name_prefix": name_prefix}) - - def get_sequential_layers(self): - return [x["layer"] for x in self._sequential_layers] - - def get_sequential_name_prefixs(self): - return {str(index): x["name_prefix"] for index, x in enumerate(self._sequential_layers)} - - def _set_pipeline_name_mapping(self, mappings=None): - if mappings is not None: - self._pipeline_name_mapping = mappings - else: - mapping = {} - state_dict_keys = list(super().state_dict().keys()) - first_key = state_dict_keys[0].split(".") - # if use virtual pp_degree, the prefix is like 0.0.xxx - # else it will be like 0.xxx - use_virtual_pp_degree = first_key[0].isdigit() and first_key[1].isdigit() - - prefixs = self.get_sequential_name_prefixs() - for k in state_dict_keys: - name_splited = k.split(".") - if use_virtual_pp_degree: - idx = str(int(name_splited[0]) + int(name_splited[1])) - single_name = [prefixs[idx]] - single_name.extend(name_splited[2:]) - else: - idx = name_splited[0] - single_name = [prefixs[idx]] - single_name.extend(name_splited[1:]) - mapping[".".join(single_name)] = k - - self._pipeline_name_mapping = mapping - - return self._pipeline_name_mapping - - def state_dict(self, *args, **kwargs): - state_dict = super().state_dict(*args, **kwargs) - - if self._pipeline_name_mapping is None: - self._set_pipeline_name_mapping() - assert len(self._pipeline_name_mapping) > 0, "The pipeline stage must have parameters!" - pp_to_single_mapping = {v: k for k, v in self._pipeline_name_mapping.items()} - - for k in list(state_dict.keys()): - v = state_dict.pop(k) - state_dict[pp_to_single_mapping[k]] = v - - return state_dict - - def set_state_dict(self, state_dict, *args, **kwargs): - if self._pipeline_name_mapping is None: - self._set_pipeline_name_mapping() - assert len(self._pipeline_name_mapping) > 0, "The pipeline stage must have parameters!" - - for k in list(state_dict.keys()): - v = state_dict.pop(k) - if k not in self._pipeline_name_mapping: - continue - state_dict[self._pipeline_name_mapping[k]] = v - - ret = super().set_state_dict(state_dict, *args, **kwargs) - return ret - - -class LlamaForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): - """LlamaForPretraining adapted for pipeline parallelism. - - The largest change is flattening the LlamaModel class so we can express it as a - sequence of layers including embedding, transformer layers, and output. - """ - - config_class = LlamaConfig - - _get_tensor_parallel_mappings = LlamaPretrainedModel._get_tensor_parallel_mappings - _init_weights = LlamaPretrainedModel._init_weights - - # NO base_model_prefix !!!! 
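# ---------------------------------------------------------------------------
# Side note on PipelinePretrainedModel above: PipelineLayer names parameters by
# stage index ("<idx>.<param>"), while single-model checkpoints use semantic
# names ("llama.layers.<i>.<param>"); _set_pipeline_name_mapping bridges the
# two. A self-contained sketch with made-up keys and prefixes (not real
# checkpoint contents):

_prefixes = {
    "0": "llama",           # LlamaEmbeddingPipe
    "1": "llama.layers.0",  # first LlamaDecoderLayerPipe
    "2": "llama.layers.1",
    "3": "llama.norm",      # LlamaRMSNormPipe
    "4": "lm_head",         # LlamaLMHead
}
_pp_keys = [
    "0.embed_tokens.weight",
    "1.self_attn.q_proj.weight",
    "3.weight",
    "4.weight",
]

_mapping = {}  # single-model name -> pipeline name, as built above
for _k in _pp_keys:
    _idx, *_rest = _k.split(".")
    _mapping[".".join([_prefixes[_idx]] + _rest)] = _k

assert _mapping["llama.layers.0.self_attn.q_proj.weight"] == "1.self_attn.q_proj.weight"
assert _mapping["lm_head.weight"] == "4.weight"
# state_dict() then inverts this mapping, so saved checkpoints always carry the
# single-model names regardless of how the pipeline is partitioned.
# ---------------------------------------------------------------------------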
- - def __init__( - self, - config, - # scale_qk_by_layer_num=True, - # virtual_pp_degree=4, - ): - self.config = config - - self.use_recompute = self.config.use_recompute - self.recompute_granularity = self.config.recompute_granularity - self.pp_recompute_interval = self.config.pp_recompute_interval - self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] - if self.recompute_granularity == "full": - assert len(self.no_recompute_layers) == 0, "for pp with full recompute, no_recompute_layers is not support" - - # virtual_pp_degree = self.config.virtual_pp_degree - virtual_pp_degree = getattr(self.config, "virtual_pp_degree", 1) - - hcg = get_hcg() - tensor_parallel_degree = max(hcg.get_model_parallel_world_size(), 1) - tensor_parallel_rank = max(hcg.get_model_parallel_rank(), 0) - - config.tensor_parallel_degree = tensor_parallel_degree - config.tensor_parallel_rank = tensor_parallel_rank - - self.add_sequential_layer(LayerDesc(LlamaEmbeddingPipe, config=config), "llama") - for i in range(config.num_hidden_layers): - self.add_sequential_layer( - LayerDesc(LlamaDecoderLayerPipe, config=config, layerwise_recompute=i not in self.no_recompute_layers), - f"llama.layers.{i}", - ) - - self.add_sequential_layer(LayerDesc(LlamaRMSNormPipe, config=config), "llama.norm") - self.add_sequential_layer(LayerDesc(LlamaLMHead, config=config), "lm_head") - - recompute_interval = 0 - if self.use_recompute and self.recompute_granularity == "full": - assert self.config.pp_recompute_interval <= config.num_hidden_layers // ( - virtual_pp_degree * get_hcg().topology().get_dim_size("pipe") - ), "pp recompute interval should smaller than num layers of each pp chunk" - recompute_interval = self.config.pp_recompute_interval - - seg_method = "layer:LlamaDecoderLayer" - if config.num_hidden_layers % get_hcg().topology().get_dim_size("pipe") != 0: - seg_method = "uniform" - - PipelineLayer.__init__( - self, - layers=self.get_sequential_layers(), - loss_fn=LlamaPretrainingCriterion(config), - topology=get_hcg().topology(), - seg_method=seg_method, - recompute_interval=recompute_interval, - recompute_ctx={ - "mp_group": get_hcg().get_model_parallel_group(), - "offload": False, - "partition": False, - }, - num_virtual_pipeline_stages=virtual_pp_degree, - ) - self.apply(self._init_weights) - # DON'T init PipelinePretrainedModel - # PipelinePretrainedModel.__init__(self.super(), config=config) \ No newline at end of file diff --git a/training/benchmarks/llama1_7B/paddle/run_pretraining.py b/training/benchmarks/llama1_7B/paddle/run_pretraining.py index a446dd89e..53e3fa5a7 100644 --- a/training/benchmarks/llama1_7B/paddle/run_pretraining.py +++ b/training/benchmarks/llama1_7B/paddle/run_pretraining.py @@ -26,6 +26,7 @@ LinearAnnealingWithWarmupDecay, LlamaConfig, LlamaForCausalLM, + LlamaForCausalLMPipe, register_sequence_parallel_allreduce_hooks, ) @@ -39,7 +40,6 @@ from driver import Driver, Event, dist_paddle from driver.config_manager import get_properties_from_config from dataloaders.dataloader import create_pretrained_dataset, get_train_data_file -from model.models.modeling_pp import LlamaForCausalLMPipe from train.trainer import PretrainingTrainer from train.training_state import TrainingState @@ -232,7 +232,7 @@ def main(): model_args, data_args, training_args = parser.parse_dict( get_properties_from_config(config) ) - data_args.input_dir = gpt_driver.config.data_dir + data_args.input_dir = llama_driver.config.data_dir if model_args.tokenizer_name_or_path is None: 
model_args.tokenizer_name_or_path = model_args.model_name_or_path @@ -331,16 +331,12 @@ def main(): model.recompute_enable() # Create the learning_rate sheduler and optimizer - if training_args.decay_steps is None: - training_args.decay_steps = training_args.max_steps - warmup_steps = training_args.warmup_ratio * training_args.max_steps - lr_scheduler = None if training_args.lr_scheduler_type.value == "cosine": lr_scheduler = CosineAnnealingWithWarmupDecay( max_lr=training_args.learning_rate, min_lr=training_args.min_learning_rate, - warmup_step=warmup_steps, + warmup_step=training_args.warmup_steps, decay_step=training_args.decay_steps, last_epoch=0, ) @@ -348,7 +344,7 @@ def main(): lr_scheduler = LinearAnnealingWithWarmupDecay( max_lr=training_args.learning_rate, min_lr=training_args.min_learning_rate, - warmup_step=warmup_steps, + warmup_step=training_args.warmup_steps, decay_step=training_args.decay_steps, last_epoch=0, ) @@ -403,22 +399,20 @@ def main(): trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() - training_state.raw_train_time = train_metrics["train_runtime"] - training_state.training_sequences_per_second = train_metrics[ - "train_samples_per_second" - ] - training_state.loss = train_metrics["train_loss"] - training_state.effective_tokens_per_second = total_effective_tokens / train_metrics["train_runtime"] + training_state.raw_train_time = metrics["train_runtime"] + training_state.training_sequences_per_second = metrics["train_samples_per_second"] + training_state.loss = metrics["train_loss"] + training_state.effective_tokens_per_second = total_effective_tokens / metrics["train_runtime"] except: training_state.end_training = False # End Evaluation - dist_paddle.barrier() - eval_metrics = trainer.evaluate() - training_state.eval_loss = eval_metrics["eval_loss"] - training_state.eval_ppl = eval_metrics["eval_ppl"] - if eval_metrics["eval_ppl"] < config.target_ppl: - training_state.converged_success() + # dist_paddle.barrier() + # eval_metrics = trainer.evaluate() + # training_state.eval_loss = eval_metrics["eval_loss"] + # training_state.eval_ppl = eval_metrics["eval_ppl"] + # if eval_metrics["eval_ppl"] < config.target_ppl: + # training_state.converged_success() return training_args, training_state, llama_driver @@ -438,8 +432,8 @@ def main(): "training_sequences_per_second": state.training_sequences_per_second, "effective_tokens_per_second": state.effective_tokens_per_second, "converged": state.converged, - "final_loss": state.eval_loss, - "final_ppl": state.eval_ppl, + # "final_loss": state.eval_loss, + # "final_ppl": state.eval_ppl, "raw_train_time": state.raw_train_time, "init_time": state.init_time, } diff --git a/training/nvidia/llama1_7B-paddle/README.md b/training/nvidia/llama1_7B-paddle/README.md index d0bce7850..b7364d6f0 100644 --- a/training/nvidia/llama1_7B-paddle/README.md +++ b/training/nvidia/llama1_7B-paddle/README.md @@ -31,21 +31,27 @@ wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwe | 指标名称 | 指标值 | 特殊说明 | | -------------- | ------------------------------ | ------------------------------------------- | | 任务类别 | 文本分类、文本生成 | | -| 模型 | llama1_7B | | +| 模型 | llama1 | | | 数据集 | openwebtext | | | 配置文件 | config | | | 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | | 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | | 并行策略 | parallel_strategy,见“性能指标” | DP, TP, PP, SP | -| 硬件设备简称 | nvidia A100 (80G *8) | | +| 硬件设备简称 | nvidia A100 (80G * 8) and (40G * 8) | | | 硬件存储使用 | memory(actual/total),见“性能指标” | 
通常称为“显存”,单位为GiB | | 吞吐量 | throughput,见“性能指标” | 训练吞吐量 | * 性能指标 | 配置 | config | precision | fix_hp | parallel_strategy | throughput | memory | -| ------- | ------- | --------- | ------ | ---------------- | ------------ | ------ | -| A100单机8卡(1x8) | config_TP1PP1SH2SP8A100x1x8 | fp16, level="O2" | per_device_bs=4, accumulate=32, (global bs = 2M tokens) | flash_attention=True, recompute=False, use_fused_rms_norm=True, sharding="stage2", sharding_degree=8 | 15.70715 * 2048 / 8 = 4021 tokens/s | 76.98 * 8 GB | -| A100单机8卡(1x8) | config_TP2PP1SH1SP4A100x1x8 | fp16, level="O2" | per_device_bs=4, accumulate=64, (global bs = 2M tokens) | flash_attention=True, recompute=False, use_fused_rms_norm=True, sharding="stage1", sharding_degree=4, tensor_parallel_degree=2 | 14.27326 * 2048 / 8 = 3653 tokens/s | 62.11 * 8 GB | -| A100单机8卡(1x8) | config_TP2PP1SH2SP4A100x1x8 | fp16, level="O2" | per_device_bs=4, accumulate=64, (global bs = 2M tokens) | flash_attention=True, recompute=False, use_fused_rms_norm=True, sharding="stage2", sharding_degree=4, tensor_parallel_degree=2 | 13.48227 * 2048 / 8 = 3451 tokens/s | 57.63 * 8 GB | -| A100单机8卡(1x8) | config_TP2PP4SH1SP1A100x1x8 | fp16, level="O2" | per_device_bs=4, accumulate=64, (global bs = 2M tokens) | flash_attention=True, recompute=False, use_fused_rms_norm=True, sharding="stage2", sharding_degree=4, tensor_parallel_degree=2 | 13.644565 * 2048 / 8 = 3493 tokens/s | 58.62\*2 + 53.51\*2 + 49.46\*2 + 47.95\*2 GB | \ No newline at end of file +| ------- | ------- | --------- | ------ | ---------------- | ------------ | ------ | +| LLaMA-7B | ------- | --------- | ------ | ---------------- | ------------ | ------ | +| A100单机8卡(1x8*80G) | config_TP1PP1SH2SP8A10080Gx1x8 | fp16, level="O2" | per_device_bs=4, accumulate=64, (global bs = 4M tokens) | flash_attention=True, recompute=False, use_fused_rms_norm=True, sharding="stage2", sharding_degree=8 | 16.67 * 2048 / 8 = 4267 tokens/s | 70.09 * 8 GB | +| A100单机8卡(1x8*80G) | config_TP2PP1SH1SP4A10080Gx1x8 | fp16, level="O2" | per_device_bs=4, accumulate=128, (global bs = 4M tokens) | flash_attention=True, recompute=False, use_fused_rms_norm=True, sharding="stage1", sharding_degree=4, tensor_parallel_degree=2 | 15.19 * 2048 / 8 = 3888 tokens/s | 58.73 * 8 GB | +| A100单机8卡(1x8*80G) | config_TP2PP1SH2SP4A10080Gx1x8 | fp16, level="O2" | per_device_bs=4, accumulate=128, (global bs = 4M tokens) | flash_attention=True, recompute=False, use_fused_rms_norm=True, sharding="stage2", sharding_degree=4, tensor_parallel_degree=2 | 14.26 * 2048 / 8 = 3650 tokens/s | 54.01 * 8 GB | +| A100单机8卡(1x8*80G) | config_TP2PP4SH1SP1A10080Gx1x8 | fp16, level="O2" | per_device_bs=4, accumulate=512, (global bs = 4M tokens) | flash_attention=True, recompute=False, use_fused_rms_norm=True, sharding="stage1", tensor_parallel_degree=2, pipline_parallel_degree=4 | 14.54 * 2048 / 8 = 3722 tokens/s | 46.80\*2 + 38.93\*2 + 31.74\*2 + 26.92\*2 GB | +| LLaMA-7B | ------- | --------- | ------ | ---------------- | ------------ | ------ | +| A100单机8卡(1x8*40G) | config_TP1PP1SH2SP8A10040Gx1x8 | fp16, level="O2" | per_device_bs=2, accumulate=128, (global bs =4M tokens) | flash_attention=True, recompute=True, use_fused_rms_norm=False, sharding="stage2", sharding_degree=8 | 10.72 * 2048 / 8 = 2744 tokens/s | 33.55 * 8 GB | +| A100单机8卡(1x8*40G) | config_TP2PP1SH1SP4A10040Gx1x8 | fp16, level="O2" | per_device_bs=2, accumulate=256, (global bs = 4M tokens) | flash_attention=True, recompute=True, use_fused_rms_norm=False, sharding="stage1", sharding_degree=4, 
tensor_parallel_degree=2 | 8.45 * 2048 / 8 = 2163 tokens/s | 28.4 * 8 GB | +| A100单机8卡(1x8*40G) | config_TP2PP1SH2SP4A10040Gx1x8 | fp16, level="O2" | per_device_bs=2, accumulate=256, (global bs = 4M tokens) | flash_attention=True, recompute=True, use_fused_rms_norm=False, sharding="stage2", sharding_degree=4, tensor_parallel_degree=2 | 8.44 * 2048 / 8 = 2160 tokens/s | 25.8 * 8 GB | +| A100单机8卡(1x8*40G) | config_TP2PP4SH1SP1A10040Gx1x8 | fp16, level="O2" | per_device_bs=2, accumulate=1024, (global bs = 4M tokens) | flash_attention=True, recompute=True, use_fused_rms_norm=False, sharding="stage1", tensor_parallel_degree=2, pipline_parallel_degree=4 | 8.72 * 2048 / 8 = 2232 tokens/s | 20.41\*2 + 19.80\*2 + 19.41\*2 + 20.12\*2 GB | diff --git a/training/nvidia/llama1_7B-paddle/config/config_TP1PP1SH2SP8A10040Gx1x8.py b/training/nvidia/llama1_7B-paddle/config/config_TP1PP1SH2SP8A10040Gx1x8.py new file mode 100644 index 000000000..efe137584 --- /dev/null +++ b/training/nvidia/llama1_7B-paddle/config/config_TP1PP1SH2SP8A10040Gx1x8.py @@ -0,0 +1,43 @@ +# model info +model_name_or_path: str = "facebook/llama-7b" +tokenizer_name_or_path: str = "facebook/llama-7b" +continue_training = 0 +split = "998,1,1" +max_seq_length = 2048 + +# training info +dataloader_num_workers = 1 +max_steps = 100 +save_steps = 10000 +eval_steps = 10000 +learning_rate = 3e-4 +min_learning_rate = 3e-5 +warmup_steps = 2000 +weight_decay = 0.1 +lr_scheduler_type = "cosine" +adam_beta1 = 0.9 +adam_beta2 = 0.95 +adam_epsilon = 1e-06 +max_grad_norm = 1.0 +target_loss = 1.0 +target_ppl = 0.6 +logging_steps = 1 +log_freq = 1 +seed = 42 + +# for parallel +per_device_train_batch_size = 2 +per_device_eval_batch_size = 1 +tensor_parallel_degree = 1 +pipeline_parallel_degree = 1 +sharding_parallel_degree = 8 +gradient_accumulation_steps = 128 +use_flash_attention = 1 +fuse_attention_qkv = 0 +use_fused_rms_norm = 0 +fp16 = True +fp16_opt_level = "O2" +scale_loss = 1024 +sharding = "stage2" +recompute = True +recompute_granularity = "full" diff --git a/training/nvidia/llama1_7B-paddle/config/config_TP1PP1SH2SP8A100x1x8.py b/training/nvidia/llama1_7B-paddle/config/config_TP1PP1SH2SP8A10080Gx1x8.py similarity index 92% rename from training/nvidia/llama1_7B-paddle/config/config_TP1PP1SH2SP8A100x1x8.py rename to training/nvidia/llama1_7B-paddle/config/config_TP1PP1SH2SP8A10080Gx1x8.py index 0bb77c8c6..80530cd77 100644 --- a/training/nvidia/llama1_7B-paddle/config/config_TP1PP1SH2SP8A100x1x8.py +++ b/training/nvidia/llama1_7B-paddle/config/config_TP1PP1SH2SP8A10080Gx1x8.py @@ -7,7 +7,7 @@ # training info dataloader_num_workers = 1 -max_steps = 512 +max_steps = 100 save_steps = 10000 eval_steps = 10000 learning_rate = 3e-4 @@ -31,13 +31,13 @@ tensor_parallel_degree = 1 pipeline_parallel_degree = 1 sharding_parallel_degree = 8 -gradient_accumulation_steps = 32 +gradient_accumulation_steps = 64 use_flash_attention = 1 fuse_attention_qkv = 0 use_fused_rms_norm = 1 fp16 = True fp16_opt_level = "O2" -scale_loss = 32768 +scale_loss = 1024 sharding = "stage2" recompute = False recompute_granularity = "full" diff --git a/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH1SP4A10040Gx1x8.py b/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH1SP4A10040Gx1x8.py new file mode 100644 index 000000000..3daaaa8c2 --- /dev/null +++ b/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH1SP4A10040Gx1x8.py @@ -0,0 +1,43 @@ +# model info +model_name_or_path: str = "facebook/llama-7b" +tokenizer_name_or_path: str = "facebook/llama-7b" 
+continue_training = 0 +split = "998,1,1" +max_seq_length = 2048 + +# training info +dataloader_num_workers = 1 +max_steps = 100 +save_steps = 10000 +eval_steps = 10000 +learning_rate = 3e-4 +min_learning_rate = 3e-5 +warmup_steps = 2000 +weight_decay = 0.1 +lr_scheduler_type = "cosine" +adam_beta1 = 0.9 +adam_beta2 = 0.95 +adam_epsilon = 1e-06 +max_grad_norm = 1.0 +target_loss = 1.0 +target_ppl = 0.6 +logging_steps = 1 +log_freq = 1 +seed = 42 + +# for parallel +per_device_train_batch_size = 2 +per_device_eval_batch_size = 1 +tensor_parallel_degree = 2 +pipeline_parallel_degree = 1 +sharding_parallel_degree = 4 +gradient_accumulation_steps = 256 +use_flash_attention = 1 +fuse_attention_qkv = 0 +use_fused_rms_norm = 0 +fp16 = True +fp16_opt_level = "O2" +scale_loss = 1024 +sharding = "stage1" +recompute = True +recompute_granularity = "full" diff --git a/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH1SP4A100x1x8.py b/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH1SP4A10080Gx1x8.py similarity index 92% rename from training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH1SP4A100x1x8.py rename to training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH1SP4A10080Gx1x8.py index 876ea1c2d..8ce05aeb1 100644 --- a/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH1SP4A100x1x8.py +++ b/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH1SP4A10080Gx1x8.py @@ -7,7 +7,7 @@ # training info dataloader_num_workers = 1 -max_steps = 512 +max_steps = 100 save_steps = 10000 eval_steps = 10000 learning_rate = 3e-4 @@ -31,13 +31,13 @@ tensor_parallel_degree = 2 pipeline_parallel_degree = 1 sharding_parallel_degree = 4 -gradient_accumulation_steps = 64 +gradient_accumulation_steps = 128 use_flash_attention = 1 fuse_attention_qkv = 0 use_fused_rms_norm = 1 fp16 = True fp16_opt_level = "O2" -scale_loss = 32768 +scale_loss = 1024 sharding = "stage1" recompute = False recompute_granularity = "full" diff --git a/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH2SP4A10040Gx1x8.py b/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH2SP4A10040Gx1x8.py new file mode 100644 index 000000000..2ad136114 --- /dev/null +++ b/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH2SP4A10040Gx1x8.py @@ -0,0 +1,43 @@ +# model info +model_name_or_path: str = "facebook/llama-7b" +tokenizer_name_or_path: str = "facebook/llama-7b" +continue_training = 0 +split = "998,1,1" +max_seq_length = 2048 + +# training info +dataloader_num_workers = 1 +max_steps = 100 +save_steps = 10000 +eval_steps = 10000 +learning_rate = 3e-4 +min_learning_rate = 3e-5 +warmup_steps = 2000 +weight_decay = 0.1 +lr_scheduler_type = "cosine" +adam_beta1 = 0.9 +adam_beta2 = 0.95 +adam_epsilon = 1e-06 +max_grad_norm = 1.0 +target_loss = 1.0 +target_ppl = 0.6 +logging_steps = 1 +log_freq = 1 +seed = 42 + +# for parallel +per_device_train_batch_size = 2 +per_device_eval_batch_size = 1 +tensor_parallel_degree = 2 +pipeline_parallel_degree = 1 +sharding_parallel_degree = 4 +gradient_accumulation_steps = 256 +use_flash_attention = 1 +fuse_attention_qkv = 0 +use_fused_rms_norm = 0 +fp16 = True +fp16_opt_level = "O2" +scale_loss = 1024 +sharding = "stage2" +recompute = True +recompute_granularity = "full" diff --git a/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH2SP4A100x1x8.py b/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH2SP4A10080Gx1x8.py similarity index 92% rename from training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH2SP4A100x1x8.py rename to 
training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH2SP4A10080Gx1x8.py index 2352b5e7e..db0fa1bf1 100644 --- a/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH2SP4A100x1x8.py +++ b/training/nvidia/llama1_7B-paddle/config/config_TP2PP1SH2SP4A10080Gx1x8.py @@ -7,7 +7,7 @@ # training info dataloader_num_workers = 1 -max_steps = 512 +max_steps = 100 save_steps = 10000 eval_steps = 10000 learning_rate = 3e-4 @@ -31,13 +31,13 @@ tensor_parallel_degree = 2 pipeline_parallel_degree = 1 sharding_parallel_degree = 4 -gradient_accumulation_steps = 64 +gradient_accumulation_steps = 128 use_flash_attention = 1 fuse_attention_qkv = 0 use_fused_rms_norm = 1 fp16 = True fp16_opt_level = "O2" -scale_loss = 32768 +scale_loss = 1024 sharding = "stage2" recompute = False recompute_granularity = "full" diff --git a/training/nvidia/llama1_7B-paddle/config/config_TP2PP4SH1SP1A10040Gx1x8.py b/training/nvidia/llama1_7B-paddle/config/config_TP2PP4SH1SP1A10040Gx1x8.py new file mode 100644 index 000000000..5d09f794f --- /dev/null +++ b/training/nvidia/llama1_7B-paddle/config/config_TP2PP4SH1SP1A10040Gx1x8.py @@ -0,0 +1,44 @@ +# model info +model_name_or_path: str = "facebook/llama-7b" +tokenizer_name_or_path: str = "facebook/llama-7b" +continue_training = 0 +split = "998,1,1" +max_seq_length = 2048 + +# training info +dataloader_num_workers = 1 +max_steps = 100 +save_steps = 10000 +eval_steps = 10000 +learning_rate = 3e-4 +min_learning_rate = 3e-5 +warmup_steps = 2000 +weight_decay = 0.1 +lr_scheduler_type = "cosine" +adam_beta1 = 0.9 +adam_beta2 = 0.95 +adam_epsilon = 1e-06 +max_grad_norm = 1.0 +target_loss = 1.0 +target_ppl = 0.6 +logging_steps = 1 +log_freq = 1 +seed = 42 + +# for parallel +per_device_train_batch_size = 2 +per_device_eval_batch_size = 2048 +tensor_parallel_degree = 2 +pipeline_parallel_degree = 4 +virtual_pp_degree = 1 +sharding_parallel_degree = 1 +gradient_accumulation_steps = 1024 +use_flash_attention = 1 +fuse_attention_qkv = 0 +use_fused_rms_norm = 0 +fp16 = True +fp16_opt_level = "O2" +scale_loss = 1024 +sharding = "stage1" +recompute = True +recompute_granularity = "full" diff --git a/training/nvidia/llama1_7B-paddle/config/config_TP2PP4SH1SP1A100x1x8.py b/training/nvidia/llama1_7B-paddle/config/config_TP2PP4SH1SP1A10080Gx1x8.py similarity index 89% rename from training/nvidia/llama1_7B-paddle/config/config_TP2PP4SH1SP1A100x1x8.py rename to training/nvidia/llama1_7B-paddle/config/config_TP2PP4SH1SP1A10080Gx1x8.py index efeaa4e86..86ea6f756 100644 --- a/training/nvidia/llama1_7B-paddle/config/config_TP2PP4SH1SP1A100x1x8.py +++ b/training/nvidia/llama1_7B-paddle/config/config_TP2PP4SH1SP1A10080Gx1x8.py @@ -7,7 +7,7 @@ # training info dataloader_num_workers = 1 -max_steps = 512 +max_steps = 100 save_steps = 10000 eval_steps = 10000 learning_rate = 3e-4 @@ -27,18 +27,18 @@ # for parallel per_device_train_batch_size = 4 -per_device_eval_batch_size = 1 +per_device_eval_batch_size = 2048 tensor_parallel_degree = 2 pipeline_parallel_degree = 4 virtual_pp_degree = 1 sharding_parallel_degree = 1 -gradient_accumulation_steps = 256 +gradient_accumulation_steps = 512 use_flash_attention = 1 fuse_attention_qkv = 0 use_fused_rms_norm = 1 fp16 = True fp16_opt_level = "O2" -scale_loss = 32768 +scale_loss = 1024 sharding = "stage1" recompute = False recompute_granularity = "full" diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 93e141bf4..63fbaeb65 100644 --- a/training/run_benchmarks/config/test_conf.py +++ 
b/training/run_benchmarks/config/test_conf.py @@ -78,4 +78,9 @@ # "transformer_xl:pytorch_1.8:A100:1:8:1": "/raid/dataset/transformer_xl/", # "t5_small:pytorch_1.8:A100:1:8:1": "/home/datasets_ckpt/t5_small_train", # "gpt2:pytorch_1.12:A100:1:8:1": "/raid/dataset/gpt2", + + # "llama1_7B:paddle_2.5.1:TP1PP1SH2SP8A10040G:1:8:1":"/raid/dataset/llama/" + # "llama1_7B:paddle_2.5.1:TP2PP1SH1SP4A10040G:1:8:1":"/raid/dataset/llama/" + # "llama1_7B:paddle_2.5.1:TP2PP1SH2SP4A10040G:1:8:1":"/raid/dataset/llama/" + # "llama1_7B:paddle_2.5.1:TP2PP4SH1SP1A10040G:1:8:1":"/raid/dataset/llama/" } \ No newline at end of file
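# ---------------------------------------------------------------------------
# Sanity check for the derived numbers in the README table above (values copied
# from the first A100-80G row, config_TP1PP1SH2SP8A10080Gx1x8); it assumes the
# data-parallel degree equals the sharding degree when TP = PP = 1.

per_device_bs = 4        # per_device_train_batch_size
accumulate = 64          # gradient_accumulation_steps
dp_degree = 8            # sharding_degree=8, tensor/pipeline parallel = 1
seq_len = 2048           # max_seq_length
num_gpus = 8

global_tokens = per_device_bs * accumulate * dp_degree * seq_len
print(global_tokens)     # 4194304, i.e. the "global bs = 4M tokens" in the table

samples_per_sec = 16.67  # reported training samples/s for this row
print(int(samples_per_sec * seq_len / num_gpus))  # 4267 tokens/s per GPU
# ---------------------------------------------------------------------------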