[Example] Use .fuse() primitive when possible #42

Merged 36 commits on Feb 10, 2023.

Commits (all authored by chhzh123)
2564827  Support fuse bias layernorm (Feb 4, 2023)
59177d1  Add fuse_bias_gelu to bert (Feb 4, 2023)
9ed1f02  Fix bias_ln (Feb 4, 2023)
34f2b24  Add separate op_fusion (Feb 5, 2023)
7bb9562  Fix pylint (Feb 6, 2023)
87d03b4  Add fuse_bias_gelu to albert & Fix shape (Feb 7, 2023)
9a2b23d  Add flash_attn to list_envs (Feb 7, 2023)
71edf5e  Add gelu act to albert (Feb 7, 2023)
d7e2827  Add bias_gelu_fusion to roberta (Feb 7, 2023)
8b98681  Add separate_fusion to opt (Feb 7, 2023)
5138b75  Update README (Feb 7, 2023)
b21727b  Uncomment fuse_bias_gelu in albert (Feb 7, 2023)
63fb4f8  Add flag to albert (Feb 7, 2023)
4318a89  Fix albert (Feb 7, 2023)
2381d33  Disable fuse_bias_gelu (Feb 7, 2023)
4c4fe5d  Refactor opt schedule (Feb 7, 2023)
9239270  Remove timing test (Feb 7, 2023)
29e26d0  Fix format (Feb 8, 2023)
6f3ae92  Fix flag (Feb 8, 2023)
770019f  Reset default value of disable_fuse_bias_gelu (Feb 8, 2023)
1c8e277  Delete demo (Feb 9, 2023)
94bd8bf  Fix attention name and signature (Feb 10, 2023)
407814b  Use slapo.op to schedule albert (Feb 10, 2023)
c247d6f  Fix GPT schedule (Feb 10, 2023)
bec0c7d  Fix BERT (Feb 10, 2023)
0b7b202  Fix bert & roberta (Feb 10, 2023)
9d83269  Fix OPT (Feb 10, 2023)
7b0ff5c  Update OPT schedule (Feb 10, 2023)
6a54c62  Fix albert (Feb 10, 2023)
010361f  Fix bert (Feb 10, 2023)
aac2444  Fix gpt opt (Feb 10, 2023)
85be556  Fix roberta (Feb 10, 2023)
7bfd769  Fix format (Feb 10, 2023)
0437c09  Add mlp to opt (Feb 10, 2023)
eacf24c  Update op test (Feb 10, 2023)
86a8b8a  Fix gpt2 (Feb 10, 2023)
Files changed
1 change: 1 addition & 0 deletions benchmark/bench_single_node.py
@@ -369,6 +369,7 @@ def list_envs(append_to=None):
"epoi",
"transformers",
"xformers",
"flash_attn",
"megatron",
"deepspeed",
"triton",
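This change adds `flash_attn` to the list of packages whose versions `bench_single_node.py` reports. A minimal sketch of how such a version report could be collected with the standard library (illustrative only; the actual body of `list_envs` is not shown in this diff, and distribution names are assumptions):

```
from importlib.metadata import PackageNotFoundError, version

def report_versions(packages):
    # Distribution names may differ from import names (e.g. flash_attn vs. flash-attn),
    # so try both spellings before reporting the package as missing.
    for name in packages:
        for candidate in (name, name.replace("_", "-")):
            try:
                print(f"{name}: {version(candidate)}")
                break
            except PackageNotFoundError:
                continue
        else:
            print(f"{name}: not installed")

report_versions(["epoi", "transformers", "xformers", "flash_attn", "megatron", "deepspeed", "triton"])
```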
12 changes: 12 additions & 0 deletions examples/README.md
@@ -24,6 +24,18 @@ git submodule update --init --recursive
pip3 install -e ".[dev]"
```

Note that we currently need to apply the following patch to the `xformers` library:
```
XFORMER_PATH=`python3 -c "import xformers, pathlib; print(pathlib.Path(xformers.__path__[0]).parent)"`
cp scripts/xformers_patch $XFORMER_PATH
pushd $XFORMER_PATH
git config --global --add safe.directory $XFORMER_PATH
git reset --hard
git apply xformers_patch
git --no-pager diff
popd
```

- flash-attention:
```
git clone https://github.com/jfc4050/flash-attention.git
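After applying the `xformers` patch above, a quick import check can confirm the installation is still usable. A small sanity check (it only assumes the patch keeps the public `xformers` API importable; `memory_efficient_attention` is xformers' fused-attention entry point):

```
import xformers
import xformers.ops as xops

# Print where the patched package was imported from and whether the fused
# attention entry point is present.
print("xformers imported from:", xformers.__path__[0])
print("memory_efficient_attention available:", hasattr(xops, "memory_efficient_attention"))
```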
13 changes: 12 additions & 1 deletion examples/albert/deepspeed_hf.py
@@ -88,14 +88,15 @@ def train(args):
model,
config,
prefix="albert",
attn_op_name=args.attn_op_name,
ckpt_ratio=args.checkpoint,
bcast_input=True,
group=group,
pipeline_cuts=pipeline_cuts,
delay_init=enable_pipeline,
)
if SINGLE_DEVICE_FOR_DEBUG:
slapo.build(sch)
slapo.build(sch, init_weights=model._init_weights)
assert False

if enable_pipeline:
@@ -118,6 +119,7 @@ def loss_fn(outputs, labels):
target="deepspeed",
config=ds_config_dict,
loss_fn=loss_fn,
init_weights=model._init_weights,
)
else:
if batch_size is not None:
@@ -133,6 +135,7 @@ def loss_fn(outputs, labels):
topology=topology,
target="deepspeed",
config=ds_config_dict,
init_weights=model._init_weights,
)
model = model.to(device)
report_memory(msg="After building model")
@@ -216,6 +219,14 @@ def loss_fn(outputs, labels):
default=None,
help="Micro batch size per GPU",
)
parser.add_argument(
"--attn_op_name",
type=str,
default="cuda",
help="Attention op name {'native_xformers', 'cutlass', 'triton', 'cuda'}. "
"'cuda' and 'triton' only support sm_80+, and other archs will "
"fall back to 'cutlass'",
)
parser.add_argument(
"--seq_len",
type=int,
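The new `--attn_op_name` flag exposes the attention backend choice to the launcher. Per the help text, `'cuda'` and `'triton'` require sm_80+; a hypothetical helper (not part of this PR) that picks a safe default from the GPU's compute capability could look like:

```
import torch

def pick_attn_op_name() -> str:
    # Hypothetical default selection, mirroring the help text above:
    # 'cuda'/'triton' need sm_80+ (Ampere or newer); older GPUs fall back to 'cutlass'.
    if not torch.cuda.is_available():
        return "native_xformers"  # assumption: CPU-only runs use the plain xformers path
    major, _minor = torch.cuda.get_device_capability()
    return "cuda" if major >= 8 else "cutlass"

print(pick_attn_op_name())
```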
4 changes: 2 additions & 2 deletions examples/albert/megatron_hf.py
@@ -56,12 +56,12 @@ def get_model(
sch = schedule_model(
model,
config,
disable_flash_attn=disable_flash_attn,
attn_op_name="native_xformers" if disable_flash_attn else "cuda",
fp16=fp16,
ckpt_ratio=ckpt_ratio,
delay_init=delay_init,
)
model, _ = slapo.build(sch)
model, _ = slapo.build(sch, init_weights=model._init_weights)
report_memory()

elif impl == "torchscript":
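Both launcher scripts now forward the HuggingFace model's `_init_weights` to `slapo.build`, following the call shape in the hunks above. A minimal sketch of that pattern on a toy module (the model and initializer here are stand-ins, and `slapo.create_schedule` is assumed to be available as in Slapo's examples):

```
import torch.nn as nn
import slapo

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(16, 16)

    def _init_weights(self, module):
        # HuggingFace-style initializer, invoked per submodule when weights are materialized.
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, std=0.02)
            nn.init.zeros_(module.bias)

model = ToyModel()
sch = slapo.create_schedule(model)
# ... apply schedule primitives (e.g. .fuse(), .shard()) here ...
built_model, _ = slapo.build(sch, init_weights=model._init_weights)
```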
21 changes: 15 additions & 6 deletions examples/albert/model.py
@@ -10,6 +10,7 @@
broadcast_input,
checkpoint,
replace_and_shard_attention,
fuse_bias_gelu,
shard_mlp,
shard_word_embedding,
)
@@ -21,12 +22,13 @@ def schedule_model(
model,
config,
prefix="",
disable_flash_attn=False,
attn_op_name="cuda",
fp16=True,
ckpt_ratio=0.0,
group=None,
bcast_input=False,
pipeline_cuts=None,
disable_fuse_bias_gelu=True,
delay_init=True,
):
logger.info("Scheduling Albert", ranks=0)
@@ -39,15 +41,22 @@

# Replace self attention with flash attention, and shard QKV/output
# if MP group > 1.
if disable_flash_attn:
logger.info("Disabled Flash Attention", rank=0)
cnt = replace_and_shard_attention(
if attn_op_name == "native_xformers":
logger.info("Disabled Flash Attention", ranks=0)
cnt, applied_attn_op_name = replace_and_shard_attention(
sch[prefix],
config,
delay_init=delay_init,
disable_flash_attn=disable_flash_attn,
attn_op_name=attn_op_name,
)
logger.info(f"Replace {cnt} attention patterns", ranks=0)
logger.info(
f"Replace {cnt} attention layers with {applied_attn_op_name} op", ranks=0
)

# Operator fusion
if not disable_fuse_bias_gelu:
fuse_bias_gelu(sch[prefix], config)
logger.info(f"Fused Bias+GeLU", ranks=0)

# Shard other parameters if MP group > 1.
if sch.world_size > 1:
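The new `fuse_bias_gelu` step targets the bias-add followed by GeLU that typically sits after the intermediate (FFN) dense layer. As a self-contained illustration of the computation being fused (plain PyTorch with TorchScript, not Slapo's `.fuse()` primitive), the bias add and the tanh-approximated GeLU can be expressed as one scripted elementwise function applied to the bias-free matmul output:

```
import torch
import torch.nn.functional as F

@torch.jit.script
def bias_gelu(bias: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    # One elementwise pass over (x + bias) using the tanh approximation of GeLU.
    y = x + bias
    return 0.5 * y * (1.0 + torch.tanh(0.7978845608 * y * (1.0 + 0.044715 * y * y)))

dense = torch.nn.Linear(1024, 4096)
x = torch.randn(8, 1024)
# Run the matmul without the bias, then the fused bias+GeLU epilogue.
out = bias_gelu(dense.bias, F.linear(x, dense.weight))
print(out.shape)  # torch.Size([8, 4096])
```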