Run BERT with KungFu in the run_squad task.

luomai · luomai · commit 45ee88b37017 · 2019-10-30T17:40:29.000Z
diff --git a/README.md b/README.md
@@ -1,4 +1,15 @@
-# BERT
+# Distributed BERT with KungFu
+
+## Scaling out BERT with KungFu
+
+Install [KungFu](https://github.com/lsds/KungFu) first.
+Configure the relevant paths in `run_kungfu.sh`, then launch it:
+
+```bash
+./run_kungfu.sh
+```
+
+## BERT Releases
 
 **\*\*\*\*\* New May 31st, 2019: Whole Word Masking Models \*\*\*\*\***
 
diff --git a/optimization.py b/optimization.py
@@ -67,6 +67,9 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
   if use_tpu:
     optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
 
+  from kungfu.tensorflow.v1.optimizers import SynchronousSGDOptimizer
+  optimizer = SynchronousSGDOptimizer(optimizer)
+
   tvars = tf.trainable_variables()
   grads = tf.gradients(loss, tvars)
 
diff --git a/run_kungfu.sh b/run_kungfu.sh
@@ -0,0 +1,26 @@
+# Directory of the pre-trained model (vocab.txt, bert_config.json, bert_model.ckpt)
+BERT_BASE_DIR=/data/uncased_L-12_H-768_A-12
+
+# Directory of the SQuAD dataset (train-v1.1.json, dev-v1.1.json)
+SQUAD_DIR=/data/squad1
+
+# Output directory for checkpoints (passed as --output_dir)
+OUTPUT_DIR=./tmp/squad_base_kungfu
+
+# Path to the kungfu-run executable
+KUNGFU_RUN=$HOME/KungFu/bin/kungfu-run
+
+$KUNGFU_RUN -np 4 python3 run_squad.py \
+  --vocab_file=$BERT_BASE_DIR/vocab.txt \
+  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
+  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
+  --do_train=True \
+  --train_file=$SQUAD_DIR/train-v1.1.json \
+  --do_predict=True \
+  --predict_file=$SQUAD_DIR/dev-v1.1.json \
+  --train_batch_size=8 \
+  --learning_rate=3e-5 \
+  --num_train_epochs=2.0 \
+  --max_seq_length=384 \
+  --doc_stride=128 \
+  --output_dir=$OUTPUT_DIR
diff --git a/run_squad.py b/run_squad.py
@@ -29,6 +29,8 @@
 import six
 import tensorflow as tf
 
+from kungfu import current_rank, current_cluster_size
+
 flags = tf.flags
 
 FLAGS = flags.FLAGS
@@ -1141,11 +1143,15 @@ def main(_):
         FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
 
   is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+
+  # KungFu: only the rank-0 estimator saves checkpoints.
+  save_checkpoints_steps = None if current_rank() != 0 else FLAGS.save_checkpoints_steps
+
   run_config = tf.contrib.tpu.RunConfig(
       cluster=tpu_cluster_resolver,
       master=FLAGS.master,
       model_dir=FLAGS.output_dir,
-      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
+      save_checkpoints_steps=save_checkpoints_steps,
       tpu_config=tf.contrib.tpu.TPUConfig(
           iterations_per_loop=FLAGS.iterations_per_loop,
           num_shards=FLAGS.num_tpu_cores,
@@ -1166,6 +1172,10 @@ def main(_):
     rng = random.Random(12345)
     rng.shuffle(train_examples)
 
+    # KungFu: divide the training and warmup steps by the cluster size.
+    num_train_steps = num_train_steps // current_cluster_size()
+    num_warmup_steps = num_warmup_steps // current_cluster_size()
+
   model_fn = model_fn_builder(
       bert_config=bert_config,
       init_checkpoint=FLAGS.init_checkpoint,
@@ -1212,7 +1222,12 @@ def main(_):
         seq_length=FLAGS.max_seq_length,
         is_training=True,
         drop_remainder=True)
-    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
+
+    # KungFu: let the first estimator broadcast its global variables.
+    from kungfu.tensorflow.v1.initializer import BroadcastGlobalVariablesHook
+    hooks = [BroadcastGlobalVariablesHook()]
+
+    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=hooks)
 
   if FLAGS.do_predict:
     eval_examples = read_squad_examples(