Skip to content

Commit

Permalink
Update pre-commit hooks (vllm-project#12475)
Browse files: browse the repository at this point in the history
Signed-off-by: Harry Mellor <[email protected]>
  • Loading branch information
hmellor authored and tjtanaa committed Jan 28, 2025
1 parent 411e0d2 commit 0ae8f3e
Show file tree
Hide file tree
Showing 64 changed files with 322 additions and 288 deletions.
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@ default_stages:
- manual # Run in CI
repos:
- repo: https://github.com/google/yapf
rev: v0.32.0
rev: v0.43.0
hooks:
- id: yapf
args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.5
rev: v0.9.3
hooks:
- id: ruff
args: [--output-format, github]
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
rev: v2.4.0
hooks:
- id: codespell
exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
Expand All @@ -23,7 +23,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.5
rev: v19.1.7
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
Expand All @@ -35,7 +35,7 @@ repos:
- id: pymarkdown
files: docs/.*
- repo: https://github.com/rhysd/actionlint
rev: v1.7.6
rev: v1.7.7
hooks:
- id: actionlint
- repo: local
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -926,8 +926,8 @@ def main(args: argparse.Namespace):
)

# Traffic
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf")
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency

Expand Down
8 changes: 6 additions & 2 deletions csrc/custom_all_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,13 @@ struct Signal {
alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
};

struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
struct __align__(16) RankData {
const void* __restrict__ ptrs[8];
};

struct __align__(16) RankSignals { Signal* signals[8]; };
struct __align__(16) RankSignals {
Signal* signals[8];
};

// like std::array, but aligned
template <typename T, int sz>
Expand Down
8 changes: 4 additions & 4 deletions csrc/moe/marlin_kernels/marlin_moe_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
Expand Down Expand Up @@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);

const int SUB = 0x64006400;
const int MUL = 0x2c002c00;
Expand Down
16 changes: 8 additions & 8 deletions csrc/quantization/gptq_marlin/gptq_marlin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
Expand All @@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) {

// Guarantee that the `(a & b) | c` operations are LOP3s.

int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
q >>= 4;
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);

typename ScalarType<nv_bfloat16>::FragB frag_b;
static constexpr uint32_t MUL = 0x3F803F80;
Expand All @@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);

const int SUB = 0x64006400;
const int MUL = 0x2c002c00;
Expand All @@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) {

// Guarantee that the `(a & b) | c` operations are LOP3s.

int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
q >>= 4;
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);

typename ScalarType<nv_bfloat16>::FragB frag_b;
static constexpr uint32_t MUL = 0x3F803F80;
Expand Down
4 changes: 2 additions & 2 deletions csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
Expand Down
4 changes: 2 additions & 2 deletions csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
static constexpr uint32_t HI = 0x00f000f0;
static constexpr uint32_t EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
static constexpr uint32_t SUB = 0x64086408;
Expand Down
4 changes: 2 additions & 2 deletions csrc/quantization/marlin/sparse/common/mma.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
Expand Down
4 changes: 3 additions & 1 deletion csrc/rocm/attention.cu
Original file line number Diff line number Diff line change
Expand Up @@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
// max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_partitions){UNREACHABLE_CODE}
const int max_num_partitions) {
UNREACHABLE_CODE
}

#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def get_rocm_version():

if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
ctypes.byref(patch)) == 0):
return "%d.%d.%d" % (major.value, minor.value, patch.value)
return f"{major.value}.{minor.value}.{patch.value}"
return None
except Exception:
return None
Expand Down
25 changes: 14 additions & 11 deletions tests/kernels/test_block_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A,
A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
]
B_tiles = [[
B[j * block_n:min((j + 1) * block_n, N),
i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles)
B[
j * block_n:min((j + 1) * block_n, N),
i * block_k:min((i + 1) * block_k, K),
] for i in range(k_tiles)
] for j in range(n_tiles)]
C_tiles = [
C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
Expand Down Expand Up @@ -157,9 +159,9 @@ def setup_cuda():
torch.set_default_device("cuda")


@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed",
itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE,
SEEDS))
@pytest.mark.parametrize(
"num_tokens,d,dtype,group_size,seed",
itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS))
@torch.inference_mode()
def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
torch.manual_seed(seed)
Expand All @@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
assert torch.allclose(scale, ref_scale)


@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed",
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES,
SEEDS))
@pytest.mark.parametrize(
"M,N,K,block_size,out_dtype,seed",
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
@torch.inference_mode()
def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
torch.manual_seed(seed)
Expand Down Expand Up @@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
assert rel_diff < 0.001


@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed",
itertools.product(M_moe, N_moe, K_moe, E, TOP_KS,
BLOCK_SIZE, DTYPES, SEEDS))
@pytest.mark.parametrize(
"M,N,K,E,topk,block_size,dtype,seed",
itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES,
SEEDS))
@torch.inference_mode()
def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
torch.manual_seed(seed)
Expand Down
10 changes: 5 additions & 5 deletions tests/kv_transfer/test_lookup_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device):
assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0

print("My rank: %d, device: %s" % (my_rank, device))
print(f"My rank: {my_rank}, device: {device}")

# insert
tokens = torch.tensor([1, 2, 3]).to(device)
Expand Down Expand Up @@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0

print("My rank: %d, Test run passed!" % (my_rank))
print(f"My rank: {my_rank}, Test run passed!")


def stress_test(my_rank, buf, device):
Expand Down Expand Up @@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device):
assert torch.allclose(k, k_)
assert torch.allclose(v, v_)
assert torch.allclose(h, h_)
print('Rank %d done' % my_rank)
print(f"Rank {my_rank} done")
torch.distributed.barrier()

if my_rank == 0:
Expand All @@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
else:
torch.distributed.send(torch.tensor([n]), 0)

print("My rank: %d, Passed stress test!" % (my_rank))
print(f"My rank: {my_rank}, Passed stress test!")


if __name__ == "__main__":
Expand All @@ -122,7 +122,7 @@ def stress_test(my_rank, buf, device):
rank=my_rank,
)

print("initialized! My rank is %d" % my_rank)
print(f"initialized! My rank is {my_rank}")

config = KVTransferConfig(
kv_connector='PyNcclConnector',
Expand Down
6 changes: 3 additions & 3 deletions tests/lora/test_qwen2vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts


@pytest.mark.xfail(current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm"
)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm")
def test_qwen2vl_lora(qwen2vl_lora_files):
llm = vllm.LLM(
MODEL_PATH,
Expand Down
Loading

0 comments on commit 0ae8f3e

Please sign in to comment.