From 251c86940600b932f9925d630e1a77b55cc24d02 Mon Sep 17 00:00:00 2001
From: Mu Tian
Date: Mon, 31 Aug 2020 23:02:05 -0700
Subject: [PATCH] hydra fairseq - add yaml files

Summary: hydra fairseq - add yaml files

Reviewed By: alexeib

Differential Revision: D22403786

fbshipit-source-id: 81fb5902c1fbcf7b03d111037327ab0f8bfb57f2
---
 config/config.yaml                            |   7 ++
 config/config_eval_lm.yaml                    |   7 ++
 config/criterion/adaptive_loss.yaml           |   3 +
 config/criterion/cross_entropy.yaml           |   3 +
 config/lr_scheduler/cosine.yaml               |   7 ++
 config/lr_scheduler/inverse_sqrt.yaml         |   3 +
 config/model/transformer_lm.yaml              |  36 ++++++
 config/model/transformer_lm_baevski_gbw.yaml  |  36 ++++++
 .../model/transformer_lm_baevski_wiki103.yaml |  36 ++++++
 config/model/transformer_lm_big.yaml          |  36 ++++++
 config/model/transformer_lm_gbw.yaml          |  36 ++++++
 config/model/transformer_lm_gpt.yaml          |  36 ++++++
 config/model/transformer_lm_gpt2_big.yaml     |  36 ++++++
 config/model/transformer_lm_gpt2_medium.yaml  |  36 ++++++
 config/model/transformer_lm_gpt2_small.yaml   |  36 ++++++
 config/model/transformer_lm_wiki103.yaml      |  36 ++++++
 config/optimizer/adam.yaml                    |   5 +
 config/optimizer/nag.yaml                     |   3 +
 config/params/eval_lm_params.yaml             | 106 ++++++++++++++++++
 config/params/training_params.yaml            |  96 ++++++++++++++++
 config/task/language_modeling.yaml            |  10 ++
 21 files changed, 610 insertions(+)
 create mode 100644 config/config.yaml
 create mode 100644 config/config_eval_lm.yaml
 create mode 100644 config/criterion/adaptive_loss.yaml
 create mode 100644 config/criterion/cross_entropy.yaml
 create mode 100644 config/lr_scheduler/cosine.yaml
 create mode 100644 config/lr_scheduler/inverse_sqrt.yaml
 create mode 100644 config/model/transformer_lm.yaml
 create mode 100644 config/model/transformer_lm_baevski_gbw.yaml
 create mode 100644 config/model/transformer_lm_baevski_wiki103.yaml
 create mode 100644 config/model/transformer_lm_big.yaml
 create mode 100644 config/model/transformer_lm_gbw.yaml
 create mode 100644 config/model/transformer_lm_gpt.yaml
 create mode 100644 config/model/transformer_lm_gpt2_big.yaml
 create mode 100644 config/model/transformer_lm_gpt2_medium.yaml
 create mode 100644 config/model/transformer_lm_gpt2_small.yaml
 create mode 100644 config/model/transformer_lm_wiki103.yaml
 create mode 100644 config/optimizer/adam.yaml
 create mode 100644 config/optimizer/nag.yaml
 create mode 100644 config/params/eval_lm_params.yaml
 create mode 100644 config/params/training_params.yaml
 create mode 100644 config/task/language_modeling.yaml

diff --git a/config/config.yaml b/config/config.yaml
new file mode 100644
index 0000000000..66723e706c
--- /dev/null
+++ b/config/config.yaml
@@ -0,0 +1,7 @@
+defaults:
+  - params: training_params
+  - task: language_modeling
+  - model: transformer_lm
+  - criterion: cross_entropy
+  - optimizer: adam
+  - lr_scheduler: inverse_sqrt
diff --git a/config/config_eval_lm.yaml b/config/config_eval_lm.yaml
new file mode 100644
index 0000000000..5a93cb5d92
--- /dev/null
+++ b/config/config_eval_lm.yaml
@@ -0,0 +1,7 @@
+defaults:
+  - params: eval_lm_params
+  - task: language_modeling
+  - model: transformer_lm
+  - criterion: cross_entropy
+  - optimizer: adam
+  - lr_scheduler: inverse_sqrt
diff --git a/config/criterion/adaptive_loss.yaml b/config/criterion/adaptive_loss.yaml
new file mode 100644
index 0000000000..a85a7eed1c
--- /dev/null
+++ b/config/criterion/adaptive_loss.yaml
@@ -0,0 +1,3 @@
+# @package _group_
+sentence_avg: ${params.optimization.sentence_avg}
+ddp_backend: ${params.distributed_training.ddp_backend}
diff --git a/config/criterion/cross_entropy.yaml b/config/criterion/cross_entropy.yaml
new file mode 100644
index 0000000000..a85a7eed1c
--- /dev/null
+++ b/config/criterion/cross_entropy.yaml
@@ -0,0 +1,3 @@
+# @package _group_
+sentence_avg: ${params.optimization.sentence_avg}
+ddp_backend: ${params.distributed_training.ddp_backend}
diff --git a/config/lr_scheduler/cosine.yaml b/config/lr_scheduler/cosine.yaml
new file mode 100644
index 0000000000..0f91e0d240
--- /dev/null
+++ b/config/lr_scheduler/cosine.yaml
@@ -0,0 +1,7 @@
+# @package _group_
+warmup_updates: 0
+warmup_init_lr: -1
+max_lr: 1.0
+t_mult: 1.0
+lr_period_updates: -1
+lr_shrink: 0.1
diff --git a/config/lr_scheduler/inverse_sqrt.yaml b/config/lr_scheduler/inverse_sqrt.yaml
new file mode 100644
index 0000000000..0eac7d88eb
--- /dev/null
+++ b/config/lr_scheduler/inverse_sqrt.yaml
@@ -0,0 +1,3 @@
+# @package _group_
+warmup_updates: 4000
+warmup_init_lr: -1
diff --git a/config/model/transformer_lm.yaml b/config/model/transformer_lm.yaml
new file mode 100644
index 0000000000..3837ea54e1
--- /dev/null
+++ b/config/model/transformer_lm.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.0
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 512
+decoder_output_dim: 512
+decoder_input_dim: 512
+decoder_ffn_embed_dim: 2048
+decoder_layers: 6
+decoder_attention_heads: 8
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_baevski_gbw.yaml b/config/model/transformer_lm_baevski_gbw.yaml
new file mode 100644
index 0000000000..30b1a4f1e0
--- /dev/null
+++ b/config/model/transformer_lm_baevski_gbw.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 512
+decoder_output_dim: 512
+decoder_input_dim: 512
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_baevski_wiki103.yaml b/config/model/transformer_lm_baevski_wiki103.yaml
new file mode 100644
index 0000000000..1154cfa660
--- /dev/null
+++ b/config/model/transformer_lm_baevski_wiki103.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.3
+attention_dropout: 0.1
+activation_dropout: 0.1
+relu_dropout: 0.1
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 16
+decoder_attention_heads: 8
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: "20000,60000"
+adaptive_softmax_dropout: 0.2
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: true
+adaptive_input_factor: 4
+adaptive_input_cutoff: "20000,60000"
+tie_adaptive_weights: true
+tie_adaptive_proj: true
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_big.yaml b/config/model/transformer_lm_big.yaml
new file mode 100644
index 0000000000..309575310b
--- /dev/null
+++ b/config/model/transformer_lm_big.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.0
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_gbw.yaml b/config/model/transformer_lm_gbw.yaml
new file mode 100644
index 0000000000..30b1a4f1e0
--- /dev/null
+++ b/config/model/transformer_lm_gbw.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 512
+decoder_output_dim: 512
+decoder_input_dim: 512
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_gpt.yaml b/config/model/transformer_lm_gpt.yaml
new file mode 100644
index 0000000000..2c6cb7be38
--- /dev/null
+++ b/config/model/transformer_lm_gpt.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 768
+decoder_output_dim: 768
+decoder_input_dim: 768
+decoder_ffn_embed_dim: 3072
+decoder_layers: 12
+decoder_attention_heads: 12
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_gpt2_big.yaml b/config/model/transformer_lm_gpt2_big.yaml
new file mode 100644
index 0000000000..a08769a178
--- /dev/null
+++ b/config/model/transformer_lm_gpt2_big.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1600
+decoder_output_dim: 1600
+decoder_input_dim: 1600
+decoder_ffn_embed_dim: 6400
+decoder_layers: 48
+decoder_attention_heads: 25
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_gpt2_medium.yaml b/config/model/transformer_lm_gpt2_medium.yaml
new file mode 100644
index 0000000000..64261d793c
--- /dev/null
+++ b/config/model/transformer_lm_gpt2_medium.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1280
+decoder_output_dim: 1280
+decoder_input_dim: 1280
+decoder_ffn_embed_dim: 5120
+decoder_layers: 36
+decoder_attention_heads: 20
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_gpt2_small.yaml b/config/model/transformer_lm_gpt2_small.yaml
new file mode 100644
index 0000000000..702e81f466
--- /dev/null
+++ b/config/model/transformer_lm_gpt2_small.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 24
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/model/transformer_lm_wiki103.yaml b/config/model/transformer_lm_wiki103.yaml
new file mode 100644
index 0000000000..1154cfa660
--- /dev/null
+++ b/config/model/transformer_lm_wiki103.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.3
+attention_dropout: 0.1
+activation_dropout: 0.1
+relu_dropout: 0.1
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 16
+decoder_attention_heads: 8
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: "20000,60000"
+adaptive_softmax_dropout: 0.2
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: true
+adaptive_input_factor: 4
+adaptive_input_cutoff: "20000,60000"
+tie_adaptive_weights: true
+tie_adaptive_proj: true
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/config/optimizer/adam.yaml b/config/optimizer/adam.yaml
new file mode 100644
index 0000000000..e5264f895e
--- /dev/null
+++ b/config/optimizer/adam.yaml
@@ -0,0 +1,5 @@
+# @package _group_
+adam_betas: "(0.9, 0.999)"
+adam_eps: 1.0e-8
+weight_decay: 0
+use_old_adam: false
diff --git a/config/optimizer/nag.yaml b/config/optimizer/nag.yaml
new file mode 100644
index 0000000000..4ab2745686
--- /dev/null
+++ b/config/optimizer/nag.yaml
@@ -0,0 +1,3 @@
+# @package _group_
+momentum: 0.99
+weight_decay: 0.0
diff --git a/config/params/eval_lm_params.yaml b/config/params/eval_lm_params.yaml
new file mode 100644
index 0000000000..4a0259bca6
--- /dev/null
+++ b/config/params/eval_lm_params.yaml
@@ -0,0 +1,106 @@
+# @package _group_
+common:
+  no_progress_bar: false
+  log_interval: 100
+  log_format: null
+  tensorboard_logdir: null
+  seed: 1
+  cpu: false
+  fp16: false
+  memory_efficient_fp16: false
+  fp16_no_flatten_grads: false
+  fp16_init_scale: 128
+  fp16_scale_window: null
+  fp16_scale_tolerance: 0.0
+  min_loss_scale: 1.0e-4
+  threshold_loss_scale: null
+  user_dir: null
+  empty_cache_freq: 0
+  all_gather_list_size: 16384
+  model_parallel_size: 1
+  checkpoint_suffix: ""
+  quantization_config_path: null
+distributed_training:
+  distributed_rank: 0
+  distributed_backend: "nccl"
+  distributed_init_method: null
+  distributed_port: -1
+  device_id: 0
+  local_rank: 0
+  distributed_no_spawn: false
+  ddp_backend: "c10d"
+  bucket_cap_mb: 25
+  fix_batches_to_gpus: false
+  find_unused_parameters: false
+  fast_stat_sync: false
+  broadcast_buffers: false
+  distributed_wrapper: "DDP"
+  slowmo_momentum: null
+  slowmo_algorithm: "LocalSGD"
+  localsgd_frequency: 3
+dataset:
+  num_workers: 1
+  skip_invalid_size_inputs_valid_test: false
+  max_tokens: null
+  max_sentences: null
+  batch_size: ${params.dataset.max_sentences}
+  required_batch_size_multiple: 8
+  dataset_impl: null
+  data_buffer_size: 10
+  train_subset: "train"
+  valid_subset: "valid"
+  validate_interval: 1
+  fixed_validation_seed: null
+  disable_validation: false
+  curriculum: 0
+  gen_subset: "test"
+  num_shards: 1
+  shard_id: 0
+  max_tokens_valid: ${params.dataset.max_tokens}
+  max_sentences_valid: ${params.dataset.max_sentences}
+optimization:
+  max_epoch: 0
+  max_update: 0
+  clip_norm: 25.0
+  sentence_avg: false
+  update_freq: [1]
+  lr: [0.25]
+  min_lr: -1.0
+  use_bmuf: false
+checkpoint:
+  save_dir: "checkpoints"
+  restore_file: "checkpoint_last.pt"
+  reset_dataloader: false
+  reset_lr_scheduler: false
+  reset_meters: false
+  reset_optimizer: false
+  optimizer_overrides: "{}"
+  save_interval: 1
+  save_interval_updates: 0
+  keep_interval_updates: -1
+  keep_last_epochs: -1
+  keep_best_checkpoints: -1
+  no_save: false
+  no_epoch_checkpoints: false
+  no_last_checkpoints: false
+  no_save_optimizer_state: false
+  best_checkpoint_metric: "loss"
+  maximize_best_checkpoint_metric: false
+  patience: -1
+common_eval:
+  path: null
+  remove_bpe: null
+  quiet: false
+  model_overrides: "{}"
+  results_path: null
+eval_lm:
+  output_word_probs: false
+  output_word_stats: false
+  context_window: 0
+bmuf:
+  block_lr: 1
+  block_momentum: 0.875
+  global_sync_iter: 50
+  warmup_iterations: 500
+  use_nbm: false
+  average_sync: false
diff --git a/config/params/training_params.yaml b/config/params/training_params.yaml
new file mode 100644
index 0000000000..3d52a82ac4
--- /dev/null
+++ b/config/params/training_params.yaml
@@ -0,0 +1,96 @@
+# @package _group_
+common:
+  no_progress_bar: false
+  log_interval: 100
+  log_format: null
+  tensorboard_logdir: null
+  seed: 1
+  cpu: false
+  fp16: false
+  memory_efficient_fp16: false
+  fp16_no_flatten_grads: false
+  fp16_init_scale: 128
+  fp16_scale_window: null
+  fp16_scale_tolerance: 0.0
+  min_loss_scale: 1.0e-4
+  threshold_loss_scale: null
+  user_dir: null
+  empty_cache_freq: 0
+  all_gather_list_size: 16384
+  model_parallel_size: 1
+  checkpoint_suffix: ""
+  quantization_config_path: null
+distributed_training:
+  distributed_rank: 0
+  distributed_backend: "nccl"
+  distributed_init_method: null
+  distributed_port: -1
+  device_id: 0
+  local_rank: 0
+  distributed_no_spawn: false
+  ddp_backend: "c10d"
+  bucket_cap_mb: 25
+  fix_batches_to_gpus: false
+  find_unused_parameters: false
+  fast_stat_sync: false
+  broadcast_buffers: false
+  distributed_wrapper: "DDP"
+  slowmo_momentum: null
+  slowmo_algorithm: "LocalSGD"
+  localsgd_frequency: 3
+dataset:
+  num_workers: 1
+  skip_invalid_size_inputs_valid_test: false
+  max_tokens: null
+  max_sentences: null
+  batch_size: ${params.dataset.max_sentences}
+  required_batch_size_multiple: 8
+  dataset_impl: null
+  data_buffer_size: 10
+  train_subset: "train"
+  valid_subset: "valid"
+  validate_interval: 1
+  fixed_validation_seed: null
+  disable_validation: false
+  curriculum: 0
+  gen_subset: "test"
+  num_shards: 1
+  shard_id: 0
+  max_tokens_valid: ${params.dataset.max_tokens}
+  max_sentences_valid: ${params.dataset.max_sentences}
+optimization:
+  max_epoch: 0
+  max_update: 0
+  clip_norm: 25.0
+  sentence_avg: false
+  update_freq: [1]
+  lr: [0.25]
+  min_lr: -1.0
+  use_bmuf: false
+checkpoint:
+  save_dir: "checkpoints"
+  restore_file: "checkpoint_last.pt"
+  reset_dataloader: false
+  reset_lr_scheduler: false
+  reset_meters: false
+  reset_optimizer: false
+  optimizer_overrides: "{}"
+  save_interval: 1
+  save_interval_updates: 0
+  keep_interval_updates: -1
+  keep_last_epochs: -1
+  keep_best_checkpoints: -1
+  no_save: false
+  no_epoch_checkpoints: false
+  no_last_checkpoints: false
+  no_save_optimizer_state: false
+  best_checkpoint_metric: "loss"
+  maximize_best_checkpoint_metric: false
+  patience: -1
+bmuf:
+  block_lr: 1
+  block_momentum: 0.875
+  global_sync_iter: 50
+  warmup_iterations: 500
+  use_nbm: false
+  average_sync: false
diff --git a/config/task/language_modeling.yaml b/config/task/language_modeling.yaml
new file mode 100644
index 0000000000..58a2ad1358
--- /dev/null
+++ b/config/task/language_modeling.yaml
@@ -0,0 +1,10 @@
+# @package _group_
+data: ???
+sample_break_mode: "none"
+tokens_per_sample: 1024
+output_dictionary_size: -1
+self_target: false
+future_target: false
+past_target: false
+add_bos_token: false
+max_target_positions: null
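
Note on composition (editor's addition, not part of the patch): each subdirectory of config/ is a Hydra config group, and config/config.yaml selects one option per group through its defaults list, while the "# @package _group_" directive in the group files places their keys under the group's name in the composed config. As a hedged sketch, an alternative top-level config could compose a different combination simply by naming other files added in this patch; the file name config_gbw.yaml is hypothetical:

    # config/config_gbw.yaml (hypothetical example, not part of the patch)
    defaults:
      - params: training_params
      - task: language_modeling
      - model: transformer_lm_gbw      # any file under config/model/
      - criterion: adaptive_loss
      - optimizer: nag
      - lr_scheduler: cosine

Each entry refers to config/<group>/<name>.yaml, so swapping, say, the model or lr_scheduler option changes only that group while the rest of the composed configuration stays as defined here.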