From 251c86940600b932f9925d630e1a77b55cc24d02 Mon Sep 17 00:00:00 2001
From: Mu Tian
Date: Mon, 31 Aug 2020 23:02:05 -0700
Subject: [PATCH] hydra fairseq - add yaml files

Summary: hydra fairseq - add yaml files

Reviewed By: alexeib

Differential Revision: D22403786

fbshipit-source-id: 81fb5902c1fbcf7b03d111037327ab0f8bfb57f2
---
 config/config.yaml                            |   7 ++
 config/config_eval_lm.yaml                    |   7 ++
 config/criterion/adaptive_loss.yaml           |   3 +
 config/criterion/cross_entropy.yaml           |   3 +
 config/lr_scheduler/cosine.yaml               |   7 ++
 config/lr_scheduler/inverse_sqrt.yaml         |   3 +
 config/model/transformer_lm.yaml              |  36 ++++++
 config/model/transformer_lm_baevski_gbw.yaml  |  36 ++++++
 .../model/transformer_lm_baevski_wiki103.yaml |  36 ++++++
 config/model/transformer_lm_big.yaml          |  36 ++++++
 config/model/transformer_lm_gbw.yaml          |  36 ++++++
 config/model/transformer_lm_gpt.yaml          |  36 ++++++
 config/model/transformer_lm_gpt2_big.yaml     |  36 ++++++
 config/model/transformer_lm_gpt2_medium.yaml  |  36 ++++++
 config/model/transformer_lm_gpt2_small.yaml   |  36 ++++++
 config/model/transformer_lm_wiki103.yaml      |  36 ++++++
 config/optimizer/adam.yaml                    |   5 +
 config/optimizer/nag.yaml                     |   3 +
 config/params/eval_lm_params.yaml             | 106 ++++++++++++++++++
 config/params/training_params.yaml            |  96 ++++++++++++++++
 config/task/language_modeling.yaml            |  10 ++
 21 files changed, 610 insertions(+)
 create mode 100644 config/config.yaml
 create mode 100644 config/config_eval_lm.yaml
 create mode 100644 config/criterion/adaptive_loss.yaml
 create mode 100644 config/criterion/cross_entropy.yaml
 create mode 100644 config/lr_scheduler/cosine.yaml
 create mode 100644 config/lr_scheduler/inverse_sqrt.yaml
 create mode 100644 config/model/transformer_lm.yaml
 create mode 100644 config/model/transformer_lm_baevski_gbw.yaml
 create mode 100644 config/model/transformer_lm_baevski_wiki103.yaml
 create mode 100644 config/model/transformer_lm_big.yaml
 create mode 100644 config/model/transformer_lm_gbw.yaml
 create mode 100644 config/model/transformer_lm_gpt.yaml
 create mode 100644 config/model/transformer_lm_gpt2_big.yaml
 create mode 100644 config/model/transformer_lm_gpt2_medium.yaml
 create mode 100644 config/model/transformer_lm_gpt2_small.yaml
 create mode 100644 config/model/transformer_lm_wiki103.yaml
 create mode 100644 config/optimizer/adam.yaml
 create mode 100644 config/optimizer/nag.yaml
 create mode 100644 config/params/eval_lm_params.yaml
 create mode 100644 config/params/training_params.yaml
 create mode 100644 config/task/language_modeling.yaml

diff --git a/config/config.yaml b/config/config.yaml
new file mode 100644
index 0000000000..66723e706c
--- /dev/null
+++ b/config/config.yaml
@@ -0,0 +1,7 @@
+defaults:
+  - params: training_params
+  - task: language_modeling
+  - model: transformer_lm
+  - criterion: cross_entropy
+  - optimizer: adam
+  - lr_scheduler: inverse_sqrt
diff --git a/config/config_eval_lm.yaml b/config/config_eval_lm.yaml
new file mode 100644
index 0000000000..5a93cb5d92
--- /dev/null
+++ b/config/config_eval_lm.yaml
@@ -0,0 +1,7 @@
+defaults:
+  - params: eval_lm_params
+  - task: language_modeling
+  - model: transformer_lm
+  - criterion: cross_entropy
+  - optimizer: adam
+  - lr_scheduler: inverse_sqrt
diff --git a/config/criterion/adaptive_loss.yaml b/config/criterion/adaptive_loss.yaml
new file mode 100644
index 0000000000..a85a7eed1c
--- /dev/null
+++ b/config/criterion/adaptive_loss.yaml
@@ -0,0 +1,3 @@
+# @package _group_
+sentence_avg: ${params.optimization.sentence_avg}
+ddp_backend: ${params.distributed_training.ddp_backend}
diff --git a/config/criterion/cross_entropy.yaml b/config/criterion/cross_entropy.yaml
new file mode 100644
index 0000000000..a85a7eed1c
--- /dev/null
+++ b/config/criterion/cross_entropy.yaml
@@ -0,0 +1,3 @@
+# @package _group_
+sentence_avg: ${params.optimization.sentence_avg}
+ddp_backend: ${params.distributed_training.ddp_backend}
diff --git a/config/lr_scheduler/cosine.yaml b/config/lr_scheduler/cosine.yaml
new file mode 100644
index 0000000000..0f91e0d240
--- /dev/null
+++ b/config/lr_scheduler/cosine.yaml
@@ -0,0 +1,7 @@
+# @package _group_
+warmup_updates: 0
+warmup_init_lr: -1
+max_lr: 1.0
+t_mult: 1.0
+lr_period_updates: -1
+lr_shrink: 0.1
diff --git a/config/lr_scheduler/inverse_sqrt.yaml b/config/lr_scheduler/inverse_sqrt.yaml
new file mode 100644
index 0000000000..0eac7d88eb
--- /dev/null
+++ b/config/lr_scheduler/inverse_sqrt.yaml
@@ -0,0 +1,3 @@
+# @package _group_
+warmup_updates: 4000
+warmup_init_lr: -1
diff --git a/config/model/transformer_lm.yaml b/config/model/transformer_lm.yaml
new file mode 100644
index 0000000000..3837ea54e1
--- /dev/null
+++ b/config/model/transformer_lm.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.0
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 512
+decoder_output_dim: 512
+decoder_input_dim: 512
+decoder_ffn_embed_dim: 2048
+decoder_layers: 6
+decoder_attention_heads: 8
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_baevski_gbw.yaml b/config/model/transformer_lm_baevski_gbw.yaml
new file mode 100644
index 0000000000..30b1a4f1e0
--- /dev/null
+++ b/config/model/transformer_lm_baevski_gbw.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 512
+decoder_output_dim: 512
+decoder_input_dim: 512
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_baevski_wiki103.yaml b/config/model/transformer_lm_baevski_wiki103.yaml
new file mode 100644
index 0000000000..1154cfa660
--- /dev/null
+++ b/config/model/transformer_lm_baevski_wiki103.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.3
+attention_dropout: 0.1
+activation_dropout: 0.1
+relu_dropout: 0.1
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 16
+decoder_attention_heads: 8
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: "20000,60000"
+adaptive_softmax_dropout: 0.2
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: true
+adaptive_input_factor: 4
+adaptive_input_cutoff: "20000,60000"
+tie_adaptive_weights: true
+tie_adaptive_proj: true
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_big.yaml b/config/model/transformer_lm_big.yaml
new file mode 100644
index 0000000000..309575310b
--- /dev/null
+++ b/config/model/transformer_lm_big.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.0
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_gbw.yaml b/config/model/transformer_lm_gbw.yaml
new file mode 100644
index 0000000000..30b1a4f1e0
--- /dev/null
+++ b/config/model/transformer_lm_gbw.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 512
+decoder_output_dim: 512
+decoder_input_dim: 512
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_gpt.yaml b/config/model/transformer_lm_gpt.yaml
new file mode 100644
index 0000000000..2c6cb7be38
--- /dev/null
+++ b/config/model/transformer_lm_gpt.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 768
+decoder_output_dim: 768
+decoder_input_dim: 768
+decoder_ffn_embed_dim: 3072
+decoder_layers: 12
+decoder_attention_heads: 12
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_gpt2_big.yaml b/config/model/transformer_lm_gpt2_big.yaml
new file mode 100644
index 0000000000..a08769a178
--- /dev/null
+++ b/config/model/transformer_lm_gpt2_big.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1600
+decoder_output_dim: 1600
+decoder_input_dim: 1600
+decoder_ffn_embed_dim: 6400
+decoder_layers: 48
+decoder_attention_heads: 25
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_gpt2_medium.yaml b/config/model/transformer_lm_gpt2_medium.yaml
new file mode 100644
index 0000000000..64261d793c
--- /dev/null
+++ b/config/model/transformer_lm_gpt2_medium.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1280
+decoder_output_dim: 1280
+decoder_input_dim: 1280
+decoder_ffn_embed_dim: 5120
+decoder_layers: 36
+decoder_attention_heads: 20
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_gpt2_small.yaml b/config/model/transformer_lm_gpt2_small.yaml
new file mode 100644
index 0000000000..702e81f466
--- /dev/null
+++ b/config/model/transformer_lm_gpt2_small.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 24
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_wiki103.yaml b/config/model/transformer_lm_wiki103.yaml
new file mode 100644
index 0000000000..1154cfa660
--- /dev/null
+++ b/config/model/transformer_lm_wiki103.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.3
+attention_dropout: 0.1
+activation_dropout: 0.1
+relu_dropout: 0.1
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 16
+decoder_attention_heads: 8
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: "20000,60000"
+adaptive_softmax_dropout: 0.2
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: true
+adaptive_input_factor: 4
+adaptive_input_cutoff: "20000,60000"
+tie_adaptive_weights: true
+tie_adaptive_proj: true
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/optimizer/adam.yaml b/config/optimizer/adam.yaml
new file mode 100644
index 0000000000..e5264f895e
--- /dev/null
+++ b/config/optimizer/adam.yaml
@@ -0,0 +1,5 @@
+# @package _group_
+adam_betas: "(0.9, 0.999)"
+adam_eps: 1.0e-8
+weight_decay: 0
+use_old_adam: false
diff --git a/config/optimizer/nag.yaml b/config/optimizer/nag.yaml
new file mode 100644
index 0000000000..4ab2745686
--- /dev/null
+++ b/config/optimizer/nag.yaml
@@ -0,0 +1,3 @@
+# @package _group_
+momentum: 0.99
+weight_decay: 0.0
diff --git a/config/params/eval_lm_params.yaml b/config/params/eval_lm_params.yaml
new file mode 100644
index 0000000000..4a0259bca6
--- /dev/null
+++ b/config/params/eval_lm_params.yaml
@@ -0,0 +1,106 @@
+# @package _group_
+common:
+  no_progress_bar: false
+  log_interval: 100
+  log_format: null
+  tensorboard_logdir: null
+  seed: 1
+  cpu: false
+  fp16: false
+  memory_efficient_fp16: false
+  fp16_no_flatten_grads: false
+  fp16_init_scale: 128
+  fp16_scale_window: null
+  fp16_scale_tolerance: 0.0
+  min_loss_scale: 1.0e-4
+  threshold_loss_scale: null
+  user_dir: null
+  empty_cache_freq: 0
+  all_gather_list_size: 16384
+  model_parallel_size: 1
+  checkpoint_suffix: ""
+  quantization_config_path: null
+distributed_training:
+  distributed_rank: 0
+  distributed_backend: "nccl"
+  distributed_init_method: null
+  distributed_port: -1
+  device_id: 0
+  local_rank: 0
+  distributed_no_spawn: false
+  ddp_backend: "c10d"
+  bucket_cap_mb: 25
+  fix_batches_to_gpus: false
+  find_unused_parameters: false
+  fast_stat_sync: false
+  broadcast_buffers: false
+  distributed_wrapper: "DDP"
+  slowmo_momentum: null
+  slowmo_algorithm: "LocalSGD"
+  localsgd_frequency: 3
+dataset:
+  num_workers: 1
+  skip_invalid_size_inputs_valid_test: false
+  max_tokens: null
+  max_sentences: null
+  batch_size: ${params.dataset.max_sentences}
+  required_batch_size_multiple: 8
+  dataset_impl: null
+  data_buffer_size: 10
+  train_subset: "train"
+  valid_subset: "valid"
+  validate_interval: 1
+  fixed_validation_seed: null
+  disable_validation: false
+  curriculum: 0
+  gen_subset: "test"
+  num_shards: 1
+  shard_id: 0
+  max_tokens_valid: ${params.dataset.max_tokens}
+  max_sentences_valid: ${params.dataset.max_sentences}
+optimization:
+  max_epoch: 0
+  max_update: 0
+  clip_norm: 25.0
+  sentence_avg: false
+  update_freq: [1]
+  lr: [0.25]
+  min_lr: -1.0
+  use_bmuf: false
+checkpoint:
+  save_dir: "checkpoints"
+  restore_file: "checkpoint_last.pt"
+  reset_dataloader: false
+  reset_lr_scheduler: false
+  reset_meters: false
+  reset_optimizer: false
+  optimizer_overrides: "{}"
+  save_interval: 1
+  save_interval_updates: 0
+  keep_interval_updates: -1
+  keep_last_epochs: -1
+  keep_best_checkpoints: -1
+  no_save: false
+  no_epoch_checkpoints: false
+  no_last_checkpoints: false
+  no_save_optimizer_state: false
+  best_checkpoint_metric: "loss"
+  maximize_best_checkpoint_metric: false
+  patience: -1
+common_eval:
+  path: null
+  remove_bpe: null
+  quiet: false
+  model_overrides: "{}"
+  results_path: null
+eval_lm:
+  output_word_probs: false
+  output_word_stats: false
+  context_window: 0
+bmuf:
+  block_lr: 1
+  block_momentum: 0.875
+  global_sync_iter: 50
+  warmup_iterations: 500
+  use_nbm: false
+  average_sync: false
diff --git a/config/params/training_params.yaml b/config/params/training_params.yaml
new file mode 100644
index 0000000000..3d52a82ac4
--- /dev/null
+++ b/config/params/training_params.yaml
@@ -0,0 +1,96 @@
+# @package _group_
+common:
+  no_progress_bar: false
+  log_interval: 100
+  log_format: null
+  tensorboard_logdir: null
+  seed: 1
+  cpu: false
+  fp16: false
+  memory_efficient_fp16: false
+  fp16_no_flatten_grads: false
+  fp16_init_scale: 128
+  fp16_scale_window: null
+  fp16_scale_tolerance: 0.0
+  min_loss_scale: 1.0e-4
+  threshold_loss_scale: null
+  user_dir: null
+  empty_cache_freq: 0
+  all_gather_list_size: 16384
+  model_parallel_size: 1
+  checkpoint_suffix: ""
+  quantization_config_path: null
+distributed_training:
+  distributed_rank: 0
+  distributed_backend: "nccl"
+  distributed_init_method: null
+  distributed_port: -1
+  device_id: 0
+  local_rank: 0
+  distributed_no_spawn: false
+  ddp_backend: "c10d"
+  bucket_cap_mb: 25
+  fix_batches_to_gpus: false
+  find_unused_parameters: false
+  fast_stat_sync: false
+  broadcast_buffers: false
+  distributed_wrapper: "DDP"
+  slowmo_momentum: null
+  slowmo_algorithm: "LocalSGD"
+  localsgd_frequency: 3
+dataset:
+  num_workers: 1
+  skip_invalid_size_inputs_valid_test: false
+  max_tokens: null
+  max_sentences: null
+  batch_size: ${params.dataset.max_sentences}
+  required_batch_size_multiple: 8
+  dataset_impl: null
+  data_buffer_size: 10
+  train_subset: "train"
+  valid_subset: "valid"
+  validate_interval: 1
+  fixed_validation_seed: null
+  disable_validation: false
+  curriculum: 0
+  gen_subset: "test"
+  num_shards: 1
+  shard_id: 0
+  max_tokens_valid: ${params.dataset.max_tokens}
+  max_sentences_valid: ${params.dataset.max_sentences}
+optimization:
+  max_epoch: 0
+  max_update: 0
+  clip_norm: 25.0
+  sentence_avg: false
+  update_freq: [1]
+  lr: [0.25]
+  min_lr: -1.0
+  use_bmuf: false
+checkpoint:
+  save_dir: "checkpoints"
+  restore_file: "checkpoint_last.pt"
+  reset_dataloader: false
+  reset_lr_scheduler: false
+  reset_meters: false
+  reset_optimizer: false
+  optimizer_overrides: "{}"
+  save_interval: 1
+  save_interval_updates: 0
+  keep_interval_updates: -1
+  keep_last_epochs: -1
+  keep_best_checkpoints: -1
+  no_save: false
+  no_epoch_checkpoints: false
+  no_last_checkpoints: false
+  no_save_optimizer_state: false
+  best_checkpoint_metric: "loss"
+  maximize_best_checkpoint_metric: false
+  patience: -1
+bmuf:
+  block_lr: 1
+  block_momentum: 0.875
+  global_sync_iter: 50
+  warmup_iterations: 500
+  use_nbm: false
+  average_sync: false
diff --git a/config/task/language_modeling.yaml b/config/task/language_modeling.yaml
new file mode 100644
index 0000000000..58a2ad1358
--- /dev/null
+++ b/config/task/language_modeling.yaml
@@ -0,0 +1,10 @@
+# @package _group_
+data: ???
+sample_break_mode: "none"
+tokens_per_sample: 1024
+output_dictionary_size: -1
+self_target: false
+future_target: false
+past_target: false
+add_bos_token: false
+max_target_positions: null
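
Note on composition (editor's addition, not part of the patch): each subdirectory of config/ is a Hydra config group, and config/config.yaml selects one option per group through its defaults list, while the "# @package _group_" directive in the group files places their keys under the group's name in the composed config. As a hedged sketch, an alternative top-level config could compose a different combination simply by naming other files added in this patch; the file name config_gbw.yaml is hypothetical:

    # config/config_gbw.yaml (hypothetical example, not part of the patch)
    defaults:
      - params: training_params
      - task: language_modeling
      - model: transformer_lm_gbw      # any file under config/model/
      - criterion: adaptive_loss
      - optimizer: nag
      - lr_scheduler: cosine

Each entry refers to config/<group>/<name>.yaml, so swapping, say, the model or lr_scheduler option changes only that group while the rest of the composed configuration stays as defined here.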