From 151f4365ebb810c8c0340d682022ba09e50abc0f Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen"
Date: Wed, 12 May 2021 16:46:13 +0200
Subject: [PATCH] docs: jit-unspilling

---
 docs/source/examples/spilling.rst | 32 -----------
 docs/source/index.rst             |  2 +-
 docs/source/spilling.rst          | 95 +++++++++++++++++++++++++++++++
 3 files changed, 96 insertions(+), 33 deletions(-)
 delete mode 100644 docs/source/examples/spilling.rst
 create mode 100644 docs/source/spilling.rst

diff --git a/docs/source/examples/spilling.rst b/docs/source/examples/spilling.rst
deleted file mode 100644
index 29a9add44..000000000
--- a/docs/source/examples/spilling.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-Spilling from device
-====================
-
-By default, Dask-CUDA enables spilling from GPU to host memory when a GPU reaches a memory utilization of 80%.
-This can be changed to suit the needs of a workload, or disabled altogether, by explicitly setting ``device_memory_limit``.
-This parameter accepts an integer or string memory size, or a float representing a percentage of the GPU's total memory:
-
-.. code-block:: python
-
-    from dask_cuda import LocalCUDACluster
-
-    cluster = LocalCUDACluster(device_memory_limit=50000)  # spilling after 50000 bytes
-    cluster = LocalCUDACluster(device_memory_limit="5GB")  # spilling after 5 GB
-    cluster = LocalCUDACluster(device_memory_limit=0.3)    # spilling after 30% memory utilization
-
-Memory spilling can be disabled by setting ``device_memory_limit`` to 0:
-
-.. code-block:: python
-
-    cluster = LocalCUDACluster(device_memory_limit=0)  # spilling disabled
-
-The same applies for ``dask-cuda-worker``, and spilling can be controlled by setting ``--device-memory-limit``:
-
-.. code-block::
-
-    $ dask-scheduler
-    distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:8786
-
-    $ dask-cuda-worker --device-memory-limit 50000
-    $ dask-cuda-worker --device-memory-limit 5GB
-    $ dask-cuda-worker --device-memory-limit 0.3
-    $ dask-cuda-worker --device-memory-limit 0
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
index f479760c6..e8fa1b747 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -32,11 +32,11 @@ Contents
    ucx
    explicit_comms
+   spilling
 
 .. toctree::
    :maxdepth: 1
    :caption: Examples
 
    examples/worker_count
-   examples/spilling
    examples/ucx
diff --git a/docs/source/spilling.rst b/docs/source/spilling.rst
new file mode 100644
index 000000000..277f634f1
--- /dev/null
+++ b/docs/source/spilling.rst
@@ -0,0 +1,95 @@
+Spilling from device
+====================
+
+By default, Dask-CUDA enables spilling from GPU to host memory when a GPU reaches a memory utilization of 80%.
+This can be changed to suit the needs of a workload, or disabled altogether, by explicitly setting ``device_memory_limit``.
+This parameter accepts an integer or string memory size, or a float representing a percentage of the GPU's total memory:
+
+.. code-block:: python
+
+    from dask_cuda import LocalCUDACluster
+
+    cluster = LocalCUDACluster(device_memory_limit=50000)  # spilling after 50000 bytes
+    cluster = LocalCUDACluster(device_memory_limit="5GB")  # spilling after 5 GB
+    cluster = LocalCUDACluster(device_memory_limit=0.3)    # spilling after 30% memory utilization
+
+Memory spilling can be disabled by setting ``device_memory_limit`` to 0:
+
+.. code-block:: python
+
+    cluster = LocalCUDACluster(device_memory_limit=0)  # spilling disabled
+
+The same applies to ``dask-cuda-worker``, where spilling can be controlled with the ``--device-memory-limit`` option:
+
+.. code-block::
+
+    $ dask-scheduler
+    distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:8786
+
+    $ dask-cuda-worker --device-memory-limit 50000
+    $ dask-cuda-worker --device-memory-limit 5GB
+    $ dask-cuda-worker --device-memory-limit 0.3
+    $ dask-cuda-worker --device-memory-limit 0
+
+
+JIT-Unspill
+-----------
+The regular spilling in Dask and Dask-CUDA has some significant issues: instead of tracking individual objects, it tracks task outputs.
+This means that a task returning a collection of CUDA objects will either spill all of the CUDA objects or none of them.
+Other issues include *object duplication*, *wrong spilling order*, and *failure to track shared device buffers*
+(see https://github.com/dask/distributed/issues/4568#issuecomment-805049321).
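+
+As a rough illustration of this all-or-nothing behavior, consider a hypothetical task that
+returns several cuDF DataFrames (the function below is only a sketch for illustration, not
+part of Dask-CUDA):
+
+.. code-block:: python
+
+    import cudf
+    import cupy
+    import dask
+
+    @dask.delayed
+    def make_frames(n):
+        # The task output is a single list holding two DataFrames.  The default
+        # spilling mechanism tracks the list as one task output, so the two
+        # DataFrames can only be spilled together or kept on the GPU together.
+        # With JIT-Unspill, each DataFrame is wrapped in its own ProxyObject
+        # and can be spilled individually.
+        return [
+            cudf.DataFrame({"a": cupy.arange(n)}),
+            cudf.DataFrame({"b": cupy.arange(n)}),
+        ]
+
+    frames = make_frames(1_000_000)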
+
+
+To address these issues, Dask-CUDA introduces JIT-Unspilling, which can be enabled with the
+``jit_unspill`` argument to ``LocalCUDACluster`` or through the ``jit_unspill`` Dask
+configuration option:
+
+.. code-block:: python
+
+    >>> import dask
+    >>> from distributed import Client
+    >>> from dask_cuda import LocalCUDACluster
+
+    >>> cluster = LocalCUDACluster(n_workers=10, device_memory_limit="1GB", jit_unspill=True)
+    >>> client = Client(cluster)
+
+    >>> # Alternatively, enable JIT-Unspill through Dask's configuration system
+    >>> with dask.config.set(jit_unspill=True):
+    ...     cluster = LocalCUDACluster(n_workers=10, device_memory_limit="1GB")
+    ...     client = Client(cluster)
+
+The same can be done for ``dask-cuda-worker``, either with the ``--enable-jit-unspill`` flag
+or the ``DASK_JIT_UNSPILL`` environment variable:
+
+.. code-block::
+
+    $ dask-scheduler
+    distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:8786
+
+    $ dask-cuda-worker --enable-jit-unspill
+    $ DASK_JIT_UNSPILL=True dask-cuda-worker
+
+
+JIT-Unspill wraps all CUDA objects, such as ``cudf.DataFrame``, in a ``ProxyObject``:
+
+.. code-block:: python
+
+    class ProxyObject:
+        """Object wrapper/proxy for serializable objects
+
+        This is used by ProxifyHostFile to delay deserialization of returned objects.
+
+        Objects proxied by an instance of this class will be JIT-deserialized when
+        accessed. The instance behaves as the proxied object and can be accessed/used
+        just like the proxied object.
+
+        ProxyObject has some limitations and doesn't mimic the proxied object perfectly.
+        Thus, if encountering problems, remember that it is always possible to use
+        unproxy() to access the proxied object directly, or to disable JIT deserialization
+        completely with `jit_unspill=False`.
+
+        Type checking using isinstance() works as expected but direct type checking
+        doesn't:
+
+        >>> import numpy as np
+        >>> from dask_cuda.proxy_object import asproxy
+        >>> x = np.arange(3)
+        >>> isinstance(asproxy(x), type(x))
+        True
+        >>> type(asproxy(x)) is type(x)
+        False
+        """
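+
+The following is a small usage sketch based on the docstring above, assuming ``unproxy`` is
+importable from ``dask_cuda.proxy_object`` as the docstring suggests. In practice the worker's
+``ProxifyHostFile`` creates the proxies automatically; ``asproxy`` is used here only to
+demonstrate the behavior:
+
+.. code-block:: python
+
+    import numpy as np
+
+    from dask_cuda.proxy_object import asproxy, unproxy
+
+    x = np.arange(3)
+    px = asproxy(x)  # wrap the array in a ProxyObject
+
+    print(px.sum())                     # attribute access is delegated to the wrapped array
+    print(isinstance(px, np.ndarray))   # True: isinstance() sees the proxied type
+    print(type(px) is np.ndarray)       # False: direct type checks see the ProxyObject
+
+    y = unproxy(px)  # access the proxied object directly
+    print(type(y) is np.ndarray)        # True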