Skip to content

Commit

Permalink
Re-integrate HPU after upstream refactors (#20)
Browse files Browse the repository at this point in the history
* Fix setup.py for HPU

* Fix  vllm._C import ops -> vllm.hpu import ops

* more of the same thing

* re-add hpex rmsnorm and rope; but rope is crashing

* remove unnecessary comments

* add vllm/hpu files

* add hpu autodetection

* Add HabanaAttention stub

* revert accidental changes

* revert non-habana backend attention changes

* add habana attention/worker/executor, sampling fails now

* Restore unnecessarily changed files

* enable HabanaMemoryProfiler

* Make sampler pass

* restore habana fused rope

* prefill is now working!!!

* fix prefill padding; decode is now working!!!!!

* revert accidental changes

* remove unused stuff in habana_paged_attn.py

* remove diagnostic stuff from llm_engine.py

* use HabanaExecutorAsync in async_llm_engine.py

* add habana copyright headers to habana_*.py files

* fix prefill attention conformance

* minor naming fixes

* remove naive attention from habana_attn (it never worked anyway)

* re-enable profile run

* Add fake HPUGraph support

* add more metrics

* indentation fix

* ~~recipe cache metrics don't work lalalala~~

* i'm done with metrics for now

* fix corner case in which hl-smi is not available but synapse is

* FIXME: temporary setup.py workaround

* WIP: add tensor parallelism stubs

* habana worker cleanup

* tensor parallelism is now working

* remove unused files

* remove unused func

* add hpugraphrunner

* improve hpu layernorm

* Port pipelined PA

* Port context length bucketing

* remove cudagraphrunner from hpu runner

* restore HPUGraphRunner back from FakeHPUGraphRunner

* handle rotary embeddings properly on gaudi3

* oopsie! captured_block_counts was incorrect!

* captured_block_counts.append doesn't do anything

* Restore habana_main KV cache memory layout

* fix memory profiler

* overhaul hpugraph capture

* memory profiling overhaul

* format memory properly in model warmup

* add graph compilation profiler for graph capture phase

* roll back log level on graph capture message

* Remove unnecessary view on residual connection in RMSNorm (#25)

---------

Co-authored-by: madamczykhabana <[email protected]>
  • Loading branch information
kzawora-intel and madamczykhabana authored May 8, 2024
1 parent 01bfb22 commit a115250
Show file tree
Hide file tree
Showing 36 changed files with 4,045 additions and 113 deletions.
57 changes: 0 additions & 57 deletions pyproject.toml

This file was deleted.

15 changes: 15 additions & 0 deletions requirements-hpu.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
cmake>=3.21
ninja # For faster builds.
psutil
ray == 2.9.3
sentencepiece # Required for LLaMA tokenizer.
numpy
fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
pynvml == 11.5.0
triton >= 2.1.0
outlines == 0.0.34
pandas
tabulate
40 changes: 35 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,19 @@ def build_extensions(self) -> None:
subprocess.check_call(['cmake', *build_args], cwd=self.build_temp)


def _is_hpu() -> bool:
return True
is_hpu_available = True
try:
subprocess.run(["hl-smi"], capture_output=True, check=True)
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
if not os.path.exists('/dev/hl0') and not os.path.exists('/dev/hl_controlD0'):
is_hpu_available = False
return is_hpu_available


def _is_cuda() -> bool:
    """Return True when building against CUDA (and not Neuron or HPU).

    The diff artifact left two consecutive ``return`` statements; the old
    one (without the HPU check) shadowed the new one and is removed here.
    """
    return torch.version.cuda is not None and not _is_neuron() and not _is_hpu()


def _is_hip() -> bool:
Expand All @@ -190,7 +201,6 @@ def _is_neuron() -> bool:
torch_neuronx_installed = False
return torch_neuronx_installed


def _install_punica() -> bool:
return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))

Expand Down Expand Up @@ -265,6 +275,17 @@ def find_version(filepath: str) -> str:
return version_match.group(1)
raise RuntimeError("Unable to find version string.")

def get_gaudi_sw_version():
    """
    Return the Intel Gaudi Software Suite version reported by ``hl-smi``,
    or "0.0.0" when hl-smi is unavailable or fails.
    """
    try:
        # ENABLE_CONSOLE makes hl-smi print to stdout. Merge it into the
        # current environment instead of replacing it wholesale: the
        # original env={...} wiped PATH and everything else, which could
        # itself prevent hl-smi from being found. List-form invocation
        # (shell=False) avoids an unnecessary shell.
        output = subprocess.run(
            ["hl-smi"],
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env={**os.environ, "ENABLE_CONSOLE": "true"},
        )
    except (FileNotFoundError, PermissionError):
        # Without a shell, a missing binary raises instead of returning a
        # non-zero exit code; treat both the same way.
        return "0.0.0"
    if output.returncode == 0 and output.stdout:
        # hl-smi's third output line looks like "...: <version>-<build>";
        # strip spaces, take the value after ':', drop the trailing char
        # and the "-<build>" suffix to get the dotted version.
        return output.stdout.split("\n")[2].replace(" ", "").split(":")[1][:-1].split("-")[0]
    return "0.0.0"  # when hl-smi is not available

def get_vllm_version() -> str:
version = find_version(get_path("vllm", "__init__.py"))
Expand All @@ -286,6 +307,12 @@ def get_vllm_version() -> str:
if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"+neuron{neuron_version_str}"
elif _is_hpu():
# Get the Intel Gaudi Software Suite version
gaudi_sw_version = str(get_gaudi_sw_version())
if gaudi_sw_version != MAIN_CUDA_VERSION:
gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
version += f"+gaudi{gaudi_sw_version}"
else:
raise RuntimeError("Unknown runtime environment")

Expand Down Expand Up @@ -318,9 +345,12 @@ def get_requirements() -> List[str]:
elif _is_neuron():
with open(get_path("requirements-neuron.txt")) as f:
requirements = f.read().strip().split("\n")
elif _is_hpu():
with open(get_path("requirements-hpu.txt")) as f:
requirements = f.read().strip().split("\n")
else:
raise ValueError(
"Unsupported platform, please use CUDA, ROCM or Neuron.")
"Unsupported platform, please use CUDA, ROCM, Neuron or HPU.")

return requirements

Expand All @@ -333,7 +363,7 @@ def get_requirements() -> List[str]:
if _install_punica():
ext_modules.append(CMakeExtension(name="vllm._punica_C"))

if not _is_neuron():
if not (_is_neuron() or _is_hpu()):
ext_modules.append(CMakeExtension(name="vllm._C"))

package_data = {
Expand Down Expand Up @@ -369,6 +399,6 @@ def get_requirements() -> List[str]:
python_requires=">=3.8",
install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() or _is_hpu() else {},
package_data=package_data,
)
Loading

0 comments on commit a115250

Please sign in to comment.