Skip to content

Commit

Permalink
Re-integrate HPU after upstream refactors (#20)
Browse files Browse the repository at this point in the history
* Fix setup.py for HPU

* Fix  vllm._C import ops -> vllm.hpu import ops

* more of the same thing

* re-add hpex rmsnorm and rope; but rope is crashing

* remove unnecessary comments

* add vllm/hpu files

* add hpu autodetection

* Add HabanaAttention stub

* revert accidental changes

* revert non-habana backend attention changes

* add habana attention/worker/executor, sampling fails now

* Restore unnecessarily changed files

* enable HabanaMemoryProfiler

* Make sampler pass

* restore habana fused rope

* prefill is now working!!!

* fix prefill padding; decode is now working!!!!!

* revert accidental changes

* remove unused stuff in habana_paged_attn.py

* remove diagnostic stuff from llm_engine.py

* use HabanaExecutorAsync in async_llm_engine.py

* add habana copyright headers to habana_*.py files

* fix prefill attention conformance

* minor naming fixes

* remove naive attention from habana_attn (it never worked anyway)

* re-enable profile run

* Add fake HPUGraph support

* add more metrics

* indentation fix

* ~~recipe cache metrics don't work lalalala~~

* i'm done with metrics for now

* fix corner case in which hl-smi is not available but synapse is

* FIXME: temporary setup.py workaround

* WIP: add tensor parallelism stubs

* habana worker cleanup

* tensor parallelism is now working

* remove unused files

* remove unused func

* add hpugraphrunner

* improve hpu layernorm

* Port pipelined PA

* Port context length bucketing

* remove cudagraphrunner from hpu runner

* restore HPUGraphRunner back from FakeHPUGraphRunner

* handle rotary embeddings properly on gaudi3

* oopsie! captured_block_counts was incorrect!

* captured_block_counts.append doesn't do anything

* Restore habana_main KV cache memory layout

* fix memory profiler

* overhaul hpugraph capture

* memory profiling overhaul

* format memory properly in model warmup

* add graph compilation profiler for graph capture phase

* roll back log level on graph capture message

* Remove unnecessary view on residual connection in RMSNorm (#25)

---------

Co-authored-by: madamczykhabana <[email protected]>
  • Loading branch information
kzawora-intel and madamczykhabana authored May 8, 2024
1 parent 01bfb22 commit a115250
Show file tree
Hide file tree
Showing 36 changed files with 4,045 additions and 113 deletions.
57 changes: 0 additions & 57 deletions pyproject.toml

This file was deleted.

15 changes: 15 additions & 0 deletions requirements-hpu.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
cmake>=3.21
ninja # For faster builds.
psutil
ray == 2.9.3
sentencepiece # Required for LLaMA tokenizer.
numpy
fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
pynvml == 11.5.0
triton >= 2.1.0
outlines == 0.0.34
pandas
tabulate
40 changes: 35 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,19 @@ def build_extensions(self) -> None:
subprocess.check_call(['cmake', *build_args], cwd=self.build_temp)


def _is_hpu() -> bool:
return True
is_hpu_available = True
try:
subprocess.run(["hl-smi"], capture_output=True, check=True)
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
if not os.path.exists('/dev/hl0') and not os.path.exists('/dev/hl_controlD0'):
is_hpu_available = False
return is_hpu_available


def _is_cuda() -> bool:
    """Return True when building against CUDA (and not Neuron or HPU).

    The diff artifact left two consecutive ``return`` statements; the old
    one (without the HPU check) shadowed the new one and is removed here.
    """
    return torch.version.cuda is not None and not _is_neuron() and not _is_hpu()


def _is_hip() -> bool:
Expand All @@ -190,7 +201,6 @@ def _is_neuron() -> bool:
torch_neuronx_installed = False
return torch_neuronx_installed


def _install_punica() -> bool:
return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))

Expand Down Expand Up @@ -265,6 +275,17 @@ def find_version(filepath: str) -> str:
return version_match.group(1)
raise RuntimeError("Unable to find version string.")

def get_gaudi_sw_version():
    """
    Return the Intel Gaudi Software Suite version reported by ``hl-smi``,
    or "0.0.0" when hl-smi is unavailable or fails.
    """
    try:
        # ENABLE_CONSOLE makes hl-smi print to stdout. Merge it into the
        # current environment instead of replacing it wholesale: the
        # original env={...} wiped PATH and everything else, which could
        # itself prevent hl-smi from being found. List-form invocation
        # (shell=False) avoids an unnecessary shell.
        output = subprocess.run(
            ["hl-smi"],
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env={**os.environ, "ENABLE_CONSOLE": "true"},
        )
    except (FileNotFoundError, PermissionError):
        # Without a shell, a missing binary raises instead of returning a
        # non-zero exit code; treat both the same way.
        return "0.0.0"
    if output.returncode == 0 and output.stdout:
        # hl-smi's third output line looks like "...: <version>-<build>";
        # strip spaces, take the value after ':', drop the trailing char
        # and the "-<build>" suffix to get the dotted version.
        return output.stdout.split("\n")[2].replace(" ", "").split(":")[1][:-1].split("-")[0]
    return "0.0.0"  # when hl-smi is not available

def get_vllm_version() -> str:
version = find_version(get_path("vllm", "__init__.py"))
Expand All @@ -286,6 +307,12 @@ def get_vllm_version() -> str:
if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"+neuron{neuron_version_str}"
elif _is_hpu():
# Get the Intel Gaudi Software Suite version
gaudi_sw_version = str(get_gaudi_sw_version())
if gaudi_sw_version != MAIN_CUDA_VERSION:
gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
version += f"+gaudi{gaudi_sw_version}"
else:
raise RuntimeError("Unknown runtime environment")

Expand Down Expand Up @@ -318,9 +345,12 @@ def get_requirements() -> List[str]:
elif _is_neuron():
with open(get_path("requirements-neuron.txt")) as f:
requirements = f.read().strip().split("\n")
elif _is_hpu():
with open(get_path("requirements-hpu.txt")) as f:
requirements = f.read().strip().split("\n")
else:
raise ValueError(
"Unsupported platform, please use CUDA, ROCM or Neuron.")
"Unsupported platform, please use CUDA, ROCM, Neuron or HPU.")

return requirements

Expand All @@ -333,7 +363,7 @@ def get_requirements() -> List[str]:
if _install_punica():
ext_modules.append(CMakeExtension(name="vllm._punica_C"))

if not _is_neuron():
if not (_is_neuron() or _is_hpu()):
ext_modules.append(CMakeExtension(name="vllm._C"))

package_data = {
Expand Down Expand Up @@ -369,6 +399,6 @@ def get_requirements() -> List[str]:
python_requires=">=3.8",
install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() or _is_hpu() else {},
package_data=package_data,
)
Loading

0 comments on commit a115250

Please sign in to comment.