custom op working with compile now
Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
HDCharles committed Oct 8, 2024
1 parent e4cf9bd commit 923f657
Showing 6 changed files with 157 additions and 97 deletions.
21 changes: 12 additions & 9 deletions torchao/_models/llama/benchmark_results.txt
@@ -52,16 +52,19 @@ OTHER BENCHMARKS
20240910110958, tok/s=223.95, mem/s= 682.88 GB/s, peak_mem= 5.59 GB, model_size= 3.05 GB quant: sparse-marlin, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --top_k 200 --temperature 0.8

bs1
20241007221134, tok/s= 13.93, mem/s= 184.03 GB/s, peak_mem=13.64 GB, model_size=13.21 GB quant: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241007223402, tok/s= 0.32, mem/s= 1.18 GB/s, peak_mem= 5.55 GB, model_size= 3.72 GB quant: gemlite-4-64, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization gemlite-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.float16 --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241008092353, tok/s= 15.35, mem/s= 57.35 GB/s, peak_mem=15.56 GB, model_size= 3.74 GB quant: int4wo-64, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241008151940, tok/s= 94.38, mem/s=1416.56 GB/s, peak_mem=16.46 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241008152137, tok/s=181.84, mem/s= 767.71 GB/s, peak_mem= 6.57 GB, model_size= 4.22 GB quant: int4wo-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241008152538, tok/s= 49.68, mem/s= 211.10 GB/s, peak_mem= 7.40 GB, model_size= 4.25 GB quant: gemlite-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization gemlite-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
20241008153006, tok/s= 52.19, mem/s= 221.78 GB/s, peak_mem= 7.65 GB, model_size= 4.25 GB quant: gemlite-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization gemlite-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8

bs2
20241007221256, tok/s= 20.04, mem/s= 264.80 GB/s, peak_mem=13.78 GB, model_size=13.21 GB quant: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --num_samples 1 --max_new_tokens 200 --batch_size 2 --top_k 200 --temperature 0.8
20241007223928, tok/s= 0.92, mem/s= 3.43 GB/s, peak_mem= 5.57 GB, model_size= 3.72 GB quant: gemlite-4-64, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization gemlite-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.float16 --num_samples 1 --max_new_tokens 200 --batch_size 2 --top_k 200 --temperature 0.8
20241008092519, tok/s= 15.06, mem/s= 56.26 GB/s, peak_mem=15.58 GB, model_size= 3.74 GB quant: int4wo-64, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --num_samples 1 --max_new_tokens 200 --batch_size 2 --top_k 200 --temperature 0.8
20241008153347, tok/s= 84.89, mem/s=1274.15 GB/s, peak_mem=16.81 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 2 --top_k 200 --temperature 0.8
20241008153609, tok/s=173.71, mem/s= 733.37 GB/s, peak_mem= 6.92 GB, model_size= 4.22 GB quant: int4wo-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 2 --top_k 200 --temperature 0.8
20241008154149, tok/s= 49.57, mem/s= 211.96 GB/s, peak_mem= 7.75 GB, model_size= 4.28 GB quant: gemlite-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization gemlite-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 2 --top_k 200 --temperature 0.8
20241008154651, tok/s= 52.04, mem/s= 222.53 GB/s, peak_mem= 7.67 GB, model_size= 4.28 GB quant: gemlite-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization gemlite-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 2 --top_k 200 --temperature 0.8

bs4
20241007221421, tok/s= 19.03, mem/s= 251.42 GB/s, peak_mem=14.06 GB, model_size=13.21 GB quant: None, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --num_samples 1 --max_new_tokens 200 --batch_size 4 --top_k 200 --temperature 0.8
20241007224456, tok/s= 0.91, mem/s= 3.38 GB/s, peak_mem= 5.59 GB, model_size= 3.72 GB quant: gemlite-4-64, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization gemlite-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.float16 --num_samples 1 --max_new_tokens 200 --batch_size 4 --top_k 200 --temperature 0.8
20241008092656, tok/s= 12.32, mem/s= 46.04 GB/s, peak_mem=15.60 GB, model_size= 3.74 GB quant: int4wo-64, mod: Llama-2-7b-chat-hf, kv_quant: False, compile: False, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth --device cuda --precision torch.bfloat16 --num_samples 1 --max_new_tokens 200 --batch_size 4 --top_k 200 --temperature 0.8
20241008155034, tok/s= 83.37, mem/s=1251.36 GB/s, peak_mem=16.97 GB, model_size=15.01 GB quant: None, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 4 --top_k 200 --temperature 0.8
20241008155257, tok/s=141.60, mem/s= 597.82 GB/s, peak_mem= 6.95 GB, model_size= 4.22 GB quant: int4wo-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization int4wo-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 4 --top_k 200 --temperature 0.8
20241008155928, tok/s= 49.45, mem/s= 214.18 GB/s, peak_mem= 7.81 GB, model_size= 4.33 GB quant: gemlite-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization gemlite-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 4 --top_k 200 --temperature 0.8
20241008160515, tok/s= 51.74, mem/s= 224.09 GB/s, peak_mem= 7.79 GB, model_size= 4.33 GB quant: gemlite-4-64, mod: Meta-Llama-3-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --quantization gemlite-4-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 1 --max_new_tokens 200 --batch_size 4 --top_k 200 --temperature 0.8
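
The commit title says the custom op now works with torch.compile; the full diff is not reproduced here. For context, the usual pattern in recent PyTorch for letting the compiler trace through an external kernel (such as the gemlite GEMM benchmarked above) is to register it as a custom op with a fake (meta) implementation. A minimal sketch under that assumption follows; the op name "demo::quant_gemm", its signature, and the plain-matmul body are illustrative placeholders, not the actual torchao code.

import torch

# Wrap the kernel as a custom op so torch.compile sees an opaque,
# traceable call instead of graph-breaking on the external library.
@torch.library.custom_op("demo::quant_gemm", mutates_args=())
def quant_gemm(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # Placeholder eager implementation standing in for the external
    # quantized GEMM kernel.
    return x @ w.t()

@quant_gemm.register_fake
def _(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # Shape-only (meta) implementation used during tracing; no kernel runs.
    return x.new_empty(x.shape[0], w.shape[0])

@torch.compile(fullgraph=True)
def run(x, w):
    return quant_gemm(x, w)

x = torch.randn(4, 8)
w = torch.randn(16, 8)
print(run(x, w).shape)  # torch.Size([4, 16])

With the fake implementation registered, fullgraph=True compiles without graph breaks, which is what allows the compile: True gemlite rows in the results above to be collected at all.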