
Commit

Merge branch 'main' into apple-silicon-merge2
rickardp committed Jun 7, 2023
2 parents 5254caf + ac5550a commit 2b77064
Showing 26 changed files with 2,958 additions and 1,020 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -228,3 +228,14 @@ Deprecated:
Features:
- Added Int8 SwitchBack layers
- Added Fake FP8 layers for research purposes (available under `bnb.research.nn. ...`)


### 0.39.0


Features:
- 4-bit matrix multiplication for Float4 and NormalFloat4 data types.
- Added 4-bit quantization routines
- Double quantization routines for 4-bit quantization (quantizing the quantization constants)
- Paged optimizers for Adam and Lion.
- bfloat16 gradient / weight support for Adam and Lion with 8 or 32-bit states.
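
To make the 0.39.0 entry above concrete, here is a minimal sketch of how the new 4-bit pieces might fit together, using `bnb.functional.quantize_fp4` together with the `matmul_4bit` entry point added in this commit. The tensor shapes, the `.t()` on the packed weight, and the CUDA/fp16 requirements are assumptions inferred from the diff below, not statements from the changelog.

```python
# Hedged sketch (not from this commit): quantize a weight to FP4, then multiply with it.
# Assumes a CUDA device and that F.quantize_fp4 returns (packed_weight, quant_state).
import torch
import bitsandbytes as bnb
import bitsandbytes.functional as F

x = torch.randn(4, 128, dtype=torch.float16, device="cuda")    # activations
w = torch.randn(256, 128, dtype=torch.float16, device="cuda")  # fp16 weight of shape (out, in)

w_fp4, quant_state = F.quantize_fp4(w)            # packed 4-bit weight + quantization state

# matmul_4bit (added in this commit) dequantizes the packed weight on the fly
out = bnb.matmul_4bit(x, w_fp4.t(), quant_state)
print(out.shape)                                  # expected: torch.Size([4, 256])
```

The paged Adam/Lion optimizers and bfloat16 state support listed in the same release are not exercised in this sketch.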
32 changes: 7 additions & 25 deletions Makefile.previous
@@ -2,6 +2,7 @@ MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH)))

GPP:= /usr/bin/g++
#GPP:= /sw/gcc/11.2.0/bin/g++
ifeq ($(CUDA_HOME),)
CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev)
endif
@@ -39,6 +40,7 @@ ifneq ($(TARGET_ARCH),$(NATIVE_ARCH))
endif



NVCC := $(CUDA_HOME)/bin/nvcc

###########################################
@@ -50,26 +52,19 @@ FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu
FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.cpp

INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
INCLUDE_10x := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/dependencies/cub -I $(ROOT_DIR)/include
LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcurand -lcusparse -L $(CONDA_PREFIX)/lib
LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib

# NVIDIA NVCC compilation flags
COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta

CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler
CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler

# Later versions of CUDA support the new architectures
CC_CUDA10x += -gencode arch=compute_75,code=sm_75

CC_CUDA110 := -gencode arch=compute_75,code=sm_75
CC_CUDA110 += -gencode arch=compute_80,code=sm_80

CC_CUDA11x := -gencode arch=compute_75,code=sm_75
CC_CUDA11x += -gencode arch=compute_80,code=sm_80
CC_CUDA11x += -gencode arch=compute_86,code=sm_86
@@ -86,23 +81,10 @@ CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89
CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90


all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
	echo "Specify a target to make (arch=$(TARGET_ARCH), flags:$(EXTRA_FLAGS))" && false

cuda: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
	$(NVCC) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
	$(NVCC) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	$(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)$(SHLIB_EXTENSION) $(LIB)

cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	$(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt$(SHLIB_EXTENSION) $(LIB)

cuda10x_nomatmul: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE_10x) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	$(GPP) $(EXTRA_FLAGS) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt$(SHLIB_EXTENSION) $(LIB)
all: $(BUILD_DIR) env
	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)

cuda110_nomatmul: $(BUILD_DIR) env
	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
1 change: 1 addition & 0 deletions bitsandbytes/__init__.py
@@ -10,6 +10,7 @@
    matmul,
    matmul_cublas,
    mm_cublas,
    matmul_4bit
)
from .cextension import COMPILED_WITH_CUDA
from .nn import modules
72 changes: 68 additions & 4 deletions bitsandbytes/autograd/_functions.py
@@ -2,7 +2,7 @@
import warnings
from dataclasses import dataclass
from functools import reduce # Required in Python 3
from typing import Tuple, Optional
from typing import Tuple, Optional, List

import torch

@@ -427,10 +427,10 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
        ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype

        if any(ctx.needs_input_grad[:2]):
            ctx.tensors = (CAt, subA)
            ctx.tensors = (CAt, subA, A)
            ctx.tensor_states = (SCAt, state.idx)
        else:
            ctx.tensors = [None, None]
            ctx.tensors = [None, None, A]
            ctx.tensor_states = (None, None)
            ctx.save_for_backward(None, None)

@@ -443,7 +443,7 @@ def backward(ctx, grad_output):
            bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias)
            return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None
        req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
        CAt, subA = ctx.tensors
        CAt, subA, A = ctx.tensors
        SCAt, idx = ctx.tensor_states
        formatB = ctx.formatB
        state = ctx.state
@@ -489,6 +489,65 @@ def backward(ctx, grad_output):

        return grad_A, grad_B, None, grad_bias, None


class MatMul4Bit(torch.autograd.Function):
    # forward is the same, but we added the fallback for pre-turing GPUs
    # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None")

    @staticmethod
    def forward(ctx, A, B, out=None, bias=None, state=None):
        # default of pytorch behavior if inputs are empty
        ctx.is_empty = False
        if prod(A.shape) == 0:
            ctx.is_empty = True
            ctx.A = A
            ctx.B = B
            ctx.bias = bias
            B_shape = state[1]
            if A.shape[-1] == B_shape[0]:
                return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device)
            else:
                return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device)


        # 1. Dequantize
        # 2. MatmulnN
        output = torch.nn.functional.linear(A, F.dequantize_fp4(B, state).to(A.dtype).t(), bias)

        # 3. Save state
        ctx.state = state
        ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype

        if any(ctx.needs_input_grad[:2]):
            ctx.tensors = (A, B)
        else:
            ctx.tensors = (None, None)

        return output

    @staticmethod
    def backward(ctx, grad_output):
        if ctx.is_empty:
            bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias)
            return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None

        req_gradA, _, _, req_gradBias, _ = ctx.needs_input_grad
        A, B = ctx.tensors
        state = ctx.state

        grad_A, grad_B, grad_bias = None, None, None

        if req_gradBias:
            # compute grad_bias first before changing grad_output dtype
            grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias)

        # not supported by PyTorch. TODO: create work-around
        #if req_gradB: grad_B = torch.matmul(grad_output.t(), A)
        if req_gradA: grad_A = torch.matmul(grad_output, F.dequantize_fp4(B, ctx.state).to(grad_output.dtype).t())

        return grad_A, grad_B, None, grad_bias, None


def matmul(
    A: tensor,
    B: tensor,
@@ -501,3 +501,8 @@ def matmul(
    if threshold > 0.0:
        state.threshold = threshold
    return MatMul8bitLt.apply(A, B, out, bias, state)


def matmul_4bit(A: tensor, B: tensor, quant_state: List, out: tensor = None, bias=None):
    assert quant_state is not None
    return MatMul4Bit.apply(A, B, out, bias, quant_state)
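
Since `grad_B` is left unimplemented in `MatMul4Bit.backward` (see the commented-out line above), gradients flow only to the input `A` through the dequantized weight. Below is a small hedged check of that behaviour, reusing the assumed `quantize_fp4` API from the earlier sketch; it is illustrative, not part of the commit.

```python
# Hedged sketch: only the activations receive a gradient through matmul_4bit.
import torch
import bitsandbytes as bnb
import bitsandbytes.functional as F

x = torch.randn(8, 64, dtype=torch.float16, device="cuda", requires_grad=True)
w = torch.randn(32, 64, dtype=torch.float16, device="cuda")
w_fp4, quant_state = F.quantize_fp4(w)            # assumed to return (packed_weight, quant_state)

out = bnb.matmul_4bit(x, w_fp4.t(), quant_state)  # forward: dequantize + linear
out.sum().backward()

print(x.grad.shape)   # torch.Size([8, 64]): grad_A = grad_output @ dequantized_weight.t()
print(w_fp4.grad)     # None: grad_B is not computed by MatMul4Bit.backward
```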
10 changes: 4 additions & 6 deletions bitsandbytes/cextension.py
@@ -30,15 +30,13 @@
Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues''')
    lib.cadam32bit_g32
    lib.cadam32bit_grad_fp32 # runs on an error if the library could not be found -> COMPILED_WITH_CUDA=False
    lib.get_context.restype = ct.c_void_p
    lib.get_cusparse.restype = ct.c_void_p
    lib.cget_managed_ptr.restype = ct.c_void_p
    COMPILED_WITH_CUDA = True
except AttributeError:
except AttributeError as ex:
    warn("The installed version of bitsandbytes was compiled without GPU support. "
        "8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.")
    COMPILED_WITH_CUDA = False

# print the setup details after checking for errors so we do not print twice
if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0':
    setup.print_log_stack()
    print(str(ex))
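
The `cextension.py` change above renames the probed symbol to `cadam32bit_grad_fp32` and prints the caught exception. The underlying pattern is generic: access an expected symbol on the loaded shared library and treat an `AttributeError` as "compiled without CUDA". Here is a minimal hedged sketch of that pattern; the library path and symbol name are placeholders, not the real bitsandbytes ones.

```python
# Hedged sketch of the symbol-probe pattern; 'libexample.so' and the symbol are placeholders.
import ctypes as ct
from warnings import warn

def load_and_probe(lib_path: str, symbol: str):
    """Load a shared library and check that a known symbol is exported."""
    lib = ct.cdll.LoadLibrary(lib_path)
    try:
        getattr(lib, symbol)        # ctypes raises AttributeError if the symbol is missing
        compiled_with_cuda = True
    except AttributeError as ex:
        warn("The library was built without GPU support.")
        compiled_with_cuda = False
        print(str(ex))
    return lib, compiled_with_cuda

# usage (placeholder names):
# lib, COMPILED_WITH_CUDA = load_and_probe("libexample.so", "cadam32bit_grad_fp32")
```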
4 changes: 4 additions & 0 deletions bitsandbytes/cuda_setup/main.py
@@ -44,6 +44,9 @@ def __init__(self):
        raise RuntimeError("Call get_instance() instead")

    def generate_instructions(self):
        if getattr(self, 'error', False): return
        print(self.error)
        self.error = True
        if self.cuda is None:
            self.add_log_entry('CUDA SETUP: Problem: The main issue seems to be that the main CUDA library was not detected.')
            self.add_log_entry('CUDA SETUP: Solution 1): Your paths are probably not up-to-date. You can update them via: sudo ldconfig.')
@@ -93,6 +96,7 @@ def initialize(self):
        self.has_printed = False
        self.lib = None
        self.initialized = False
        self.error = False
        self.cuda_setup_log = []

    def run_cuda_setup(self):
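
The `cuda_setup/main.py` hunk above adds an `error` flag so that `generate_instructions` emits its advice only once per `CUDASetup` instance. Below is a simplified, hedged sketch of that run-once guard in isolation; the class name and log messages are illustrative, not the real setup code.

```python
# Hedged sketch of the run-once guard added to generate_instructions (illustrative only).
class SetupReporter:
    def __init__(self):
        self.error = False
        self.log = []

    def generate_instructions(self):
        if getattr(self, 'error', False):   # instructions were already generated -> do nothing
            return
        self.error = True                   # remember that we have reported once
        self.log.append("CUDA SETUP: Problem: the main CUDA library was not detected.")
        self.log.append("CUDA SETUP: Solution 1): update your loader paths, e.g. via: sudo ldconfig.")

reporter = SetupReporter()
reporter.generate_instructions()
reporter.generate_instructions()   # second call is a no-op thanks to the guard
print(len(reporter.log))           # 2: the entries were only added once
```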
(Diff content for the remaining 20 of the 26 changed files was not loaded on this page.)
