From 840df4dc460f48f263b5975f6f50695b7cb6aa35 Mon Sep 17 00:00:00 2001
From: Dan Bailey
Date: Fri, 6 Dec 2024 11:10:51 -0800
Subject: [PATCH 01/59] Meeting notes

Signed-off-by: Dan Bailey
---
 tsc/meetings/2024-12-03.md | 80 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 tsc/meetings/2024-12-03.md

diff --git a/tsc/meetings/2024-12-03.md b/tsc/meetings/2024-12-03.md
new file mode 100644
index 0000000000..e864d84bdd
--- /dev/null
+++ b/tsc/meetings/2024-12-03.md
@@ -0,0 +1,80 @@
+Minutes from OpenVDB TSC meeting, December 3rd, 2024
+
+Attendees: *Ken* M., *Andre* P., *Dan* B.
+
+Additional Attendees: Jonathan Swartz (NVIDIA), Dhruv Govil (Apple)
+
+Regrets: *Jeff* L., *Richard* J., *Nick* A., *Greg* H.
+
+Agenda:
+
+1) Confirm quorum
+2) Secretary
+3) PR1971
+4) Binary Distribution
+5) PR1977
+6) Fluid Example
+7) TBB Threading
+8) ASWF Code Build
+9) Next meeting
+
+------------
+
+1) Confirm quorum
+
+No quorum.
+
+2) Secretary
+
+Secretary is Dan Bailey.
+
+3) PR1971
+
+A bug was discovered with vdb_tool that wasn't caught by CI. This has now been
+addressed and the CI extended to test this case.
+
+4) Binary Distribution
+
+Brief discussion about binary distribution. Still want to support package
+manager maintainers, but hesitant to take this on ourselves. A bigger ROI might
+be to focus our efforts on migrating towards CMake config files to make using
+OpenVDB as a dependency easier. The autogenerated version.h is a step in that
+direction, but could be improved with CMake configs. Other ASWF projects such
+as OpenEXR have successfully gone in this direction.
+
+Anaconda integration has been attempted, but is challenging as it only supports
+a single library per module by default.
+
+5) PR1977
+
+Dhruv has submitted this PR to address issues with compiling OpenVDB using Clang
+19. This new version is stricter and is catching a few valid issues.
+
+6) Fluid Example
+
+All would like this to go in. Andre says that it is blocked on an issue with the
+Poisson solver which needs to be resolved. Also, the OpenVDB Remove Divergence
+SOP has a bug in how the matrices are initialized that needs to be addressed.
+
+There is a new NanoVDB Poisson solver which is of interest to the group. All
+agreed that it would potentially be a good fit for NanoVDB. Dan would also
+like an example of how to call it directly from OpenVDB.
+
+7) TBB Threading
+
+Question about our use of TBB (#1973). Previously discussed making TBB optional
+by wrapping all the data structures and function calls in
+openvdb/util/Threading.h. This could provide a route to running
+single-threaded or providing another threading implementation like Apple's
+Grand Central Dispatch. A lot of work is needed and it is not clear how
+general this approach would be (see the sketch after these minutes).
+
+8) ASWF Code Build
+
+Jonathan to reach out to the CI working group to try to move this forward to
+get GPU runtime testing. Ken can escalate to the TSC if need be.
+
+9) Next meeting
+
+Next meeting is Tuesday December 17, 2024 at 11:00 PST. This will be the last
+meeting of the year.
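To ground item 7 above: the shim discussed for openvdb/util/Threading.h could take roughly the following shape, hiding TBB behind a neutral interface with a serial fallback. This is a hypothetical sketch of the idea under discussion, not actual OpenVDB code; the OPENVDB_USE_TBB guard and the parallelFor name are assumptions.

```cpp
// Hypothetical sketch only -- illustrates the wrapper idea from item 7.
#include <cstddef>
#ifdef OPENVDB_USE_TBB
#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#endif

namespace openvdb { namespace util {

// Apply func(i) for every i in [begin, end), in parallel when TBB is enabled.
template<typename FuncT>
inline void parallelFor(size_t begin, size_t end, const FuncT& func)
{
#ifdef OPENVDB_USE_TBB
    tbb::parallel_for(tbb::blocked_range<size_t>(begin, end),
        [&](const tbb::blocked_range<size_t>& r) {
            for (size_t i = r.begin(); i != r.end(); ++i) func(i);
        });
#else
    for (size_t i = begin; i != end; ++i) func(i); // single-threaded fallback
#endif
}

}} // namespace openvdb::util
```

An alternative backend such as Grand Central Dispatch would slot into the same #ifdef chain, which is where the open question about how general the approach can be comes in.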
\ No newline at end of file From ca4aec122b793c2ec53ed4ee01ec07b79007b1d2 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Wed, 11 Dec 2024 18:21:55 +1300 Subject: [PATCH 02/59] Changed GH action upload-artifact to v4; fixes #1982 Signed-off-by: Jonathan Swartz --- .github/workflows/ax.yml | 2 +- .github/workflows/weekly.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ax.yml b/.github/workflows/ax.yml index be512fbc2c..4e56d3d721 100644 --- a/.github/workflows/ax.yml +++ b/.github/workflows/ax.yml @@ -160,7 +160,7 @@ jobs: - name: build run: ./ci/build.sh -v --components=axgr --target=openvdb_ax_grammar --cargs=\"-DOPENVDB_AX_GRAMMAR_NO_LINES=ON\" - name: upload grammar - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: ax_grammar path: ./build/openvdb_ax/openvdb_ax/openvdb_ax/grammar diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index 747c8af744..57dec07ec8 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -589,7 +589,7 @@ jobs: -strict -extended - name: upload_report - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: always() with: name: abi_report From 38adbab1bc788c0ec6a58f83fecc5ed73be3463b Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Fri, 13 Dec 2024 11:40:50 +1300 Subject: [PATCH 03/59] Updating the versions of checkout action; add .gitmodules to ignore list Signed-off-by: Jonathan Swartz --- .github/workflows/docs.yml | 2 +- .github/workflows/whitespace.yml | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 617ae20a12..bdd4b3e2c1 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -110,7 +110,7 @@ jobs: env: CXX: g++ steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: install_gcovr run: pip install gcovr - name: build diff --git a/.github/workflows/whitespace.yml b/.github/workflows/whitespace.yml index fb2e0187b1..57c3f74b00 100644 --- a/.github/workflows/whitespace.yml +++ b/.github/workflows/whitespace.yml @@ -1,4 +1,3 @@ - name: Whitespace on: @@ -21,7 +20,7 @@ jobs: trailingspaces: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: test run: | set +e @@ -32,9 +31,9 @@ jobs: spacesnottabs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: test run: | set +e - git grep -n " " -- ':!*/whitespace.yml' ':!tsc/meetings/*' ':!*.svg' ':!*.cmd' ':!*.png' ':!pendingchanges/*' ':!*.wlt' ':!*.jpg' ':!*.gif' ':!*.mp4' ':!*.pt' ':!*.pth' ':!*.nvdb' ':!*.npz' + git grep -n " " -- ':!*/whitespace.yml' ':!tsc/meetings/*' ':!*.svg' ':!*.cmd' ':!*.png' ':!pendingchanges/*' ':!*.wlt' ':!*.jpg' ':!*.gif' ':!*.mp4' ':!*.pt' ':!*.pth' ':!*.nvdb' ':!*.npz' ':!*.gitmodules' test $? 
-eq 1
From 43a52207007b4048384573a36fb3239ae49eb8bd Mon Sep 17 00:00:00 2001
From: Jonathan Swartz
Date: Fri, 13 Dec 2024 12:26:18 +1300
Subject: [PATCH 04/59] Adding fvdb CI unit tests to github workflows

Signed-off-by: Jonathan Swartz
---
 .github/workflows/fvdb.yml | 236 +++++++++++++++++++++++++++++++++++++
 1 file changed, 236 insertions(+)
 create mode 100644 .github/workflows/fvdb.yml

diff --git a/.github/workflows/fvdb.yml b/.github/workflows/fvdb.yml
new file mode 100644
index 0000000000..cdfad89045
--- /dev/null
+++ b/.github/workflows/fvdb.yml
@@ -0,0 +1,236 @@
+name: fVDB Unit Tests
+
+on:
+  pull_request:
+    branches:
+      - 'master'
+      - 'feature/**'
+      - 'pr/**'
+    paths-ignore:
+      - 'CHANGES'
+      - 'CODEOWNERS'
+      - 'doc/**'
+      - 'openvdb/**'
+      - 'openvdb_cmd/**'
+      - 'openvdb_ax/**'
+      - 'openvdb_maya/**'
+      - 'openvdb_houdini/**'
+      - 'nanovdb/**'
+      - 'pendingchanges/**'
+      - '**.md'
+      - 'fvdb/debug/**'
+      - 'fvdb/docs/**'
+      - 'fvdb/examples/**'
+      - 'fvdb/notebooks/**'
+      - 'fvdb/scripts/**'
+
+# Allow subsequent pushes to the same PR or REF to cancel any previous jobs.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: write
+  deployments: write
+
+jobs:
+  fvdb-build:
+    if: ${{ !startsWith(github.event.pull_request.title, 'Draft:') }}
+    name: fVDB Build
+    runs-on:
+      - self-hosted
+    container:
+      image: aswf/ci-openvdb:2024
+      env:
+        PYTHONPATH: ""
+      options: --rm
+    defaults:
+      run:
+        shell: bash -el {0}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up fvdb_build Conda env
+        uses: conda-incubator/setup-miniconda@v3
+        with:
+          miniforge-version: latest
+          conda-remove-defaults: "true"
+          activate-environment: fvdb_build
+          environment-file: fvdb/env/build_environment.yml
+
+
+      - name: Build fvdb
+        run: |
+          cd fvdb;
+          TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6+PTX" MAX_JOBS=$(($(nproc) < $(free -g | awk '/^Mem:/{jobs=int($7/2.5); if(jobs<1) jobs=1; print jobs}') ?
$(nproc) : $(free -g | awk '/^Mem:/{jobs=int($7/2.5); if(jobs<1) jobs=1; print jobs}'))) conda run --no-capture-output -n fvdb_build python setup.py bdist_wheel --dist-dir=dist + + - name: Upload package + uses: actions/upload-artifact@v4 + with: + name: fvdb-test-package + path: dist/*.whl + retention-days: 2 + + - name: Cleanup + if: always() + run: | + echo "Cleaning up /__w/_temp directory" + sudo rm -rf /__w/_temp/* + echo "Cleanup completed" + + + fvdb-unit-test: + needs: [fvdb-build] + name: fVDB Unit Tests + runs-on: + - self-hosted + container: + image: aswf/ci-openvdb:2024 + env: + PYTHONPATH: "" + options: --rm + defaults: + run: + shell: bash -el {0} + steps: + - uses: actions/checkout@v4 + - name: Set up fvdb_test Conda env + uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-version: latest + conda-remove-defaults: "true" + activate-environment: fvdb_test + environment-file: fvdb/env/test_environment.yml + + - name: Download package + uses: actions/download-artifact@v4 + with: + name: fvdb-test-package + path: ./dist + + - name: Install package + run: | + conda activate fvdb_test + pip install ./dist/*.whl + + - name: Run tests + run: | + cd fvdb/tests; + pytest -v unit + + - name: Cleanup + if: always() + run: | + echo "Cleaning up /__w/_temp directory" + sudo rm -rf /__w/_temp/* + echo "Cleanup completed" + + fvdb-docs-test: + needs: [fvdb-build] + name: fVDB Documentation Tests + runs-on: + - self-hosted + container: + image: aswf/ci-openvdb:2024 + env: + PYTHONPATH: "" + options: --rm + defaults: + run: + shell: bash -el {0} + steps: + - uses: actions/checkout@v4 + - name: Set up fvdb_test Conda env + uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-version: latest + conda-remove-defaults: "true" + activate-environment: fvdb_test + environment-file: fvdb/env/test_environment.yml + + - name: Download package + uses: actions/download-artifact@v4 + with: + name: fvdb-test-package + path: ./dist + + - name: Install package + run: | + conda activate fvdb_test + pip install ./dist/*.whl + + - name: Run tests + run: | + pytest --markdown-docs fvdb/docs + + - name: Cleanup + if: always() + run: | + echo "Cleaning up /__w/_temp directory" + sudo rm -rf /__w/_temp/* + echo "Cleanup completed" + + fvdb-benchmarks: + needs: [fvdb-build] + name: fVDB Continuous Benchmarking + runs-on: + - self-hosted + container: + image: aswf/ci-openvdb:2024 + env: + PYTHONPATH: "" + options: --rm + defaults: + run: + shell: bash -el {0} + steps: + - uses: actions/checkout@v4 + - name: Set up fvdb_test Conda env + uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-version: latest + conda-remove-defaults: "true" + activate-environment: fvdb_test + environment-file: fvdb/env/test_environment.yml + + - name: Download package + uses: actions/download-artifact@v4 + with: + name: fvdb-test-package + path: ./dist + + - name: Install package + run: | + conda activate fvdb_test + pip install ./dist/*.whl + + - name: Disable git ownership verification + run: | + git config --global --add safe.directory "$(pwd)" + + - name: Run benchmarks + run: | + cd fvdb/tests; + pytest benchmark --benchmark-json benchmark/output.json + + - name: Store benchmark result + uses: benchmark-action/github-action-benchmark@v1 + with: + name: Python Benchmark with pytest-benchmark + tool: 'pytest' + output-file-path: fvdb/fvdb/tests/benchmark/output.json + # Use personal access token instead of GITHUB_TOKEN due to https://github.community/t/github-action-not-triggering-gh-pages-upon-push/16096 + 
github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+          # Show alert with commit comment on detecting possible performance regression
+          alert-threshold: '200%'
+          comment-on-alert: true
+          fail-on-alert: true
+          alert-comment-cc-users: '@swahtz'
+
+      - name: Cleanup
+        if: always()
+        run: |
+          echo "Cleaning up /__w/_temp directory"
+          sudo rm -rf /__w/_temp/*
+          echo "Cleanup completed"
\ No newline at end of file

From f088f173d11a8ac34d0a3d69bba427ec29fa4096 Mon Sep 17 00:00:00 2001
From: Jonathan Swartz
Date: Mon, 16 Dec 2024 13:07:12 +1300
Subject: [PATCH 05/59] Fixing fvdb wheel upload path

Signed-off-by: Jonathan Swartz
---
 .github/workflows/fvdb.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/fvdb.yml b/.github/workflows/fvdb.yml
index cdfad89045..e7f6912fb7 100644
--- a/.github/workflows/fvdb.yml
+++ b/.github/workflows/fvdb.yml
@@ -68,7 +68,7 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: fvdb-test-package
-          path: dist/*.whl
+          path: fvdb/dist/*.whl
           retention-days: 2

       - name: Cleanup

From 128be93972fe8b6433fa99423f0fb2a517ad4761 Mon Sep 17 00:00:00 2001
From: Jonathan Swartz
Date: Mon, 16 Dec 2024 13:46:57 +1300
Subject: [PATCH 06/59] fVDB continuous benchmarking results path fix

Signed-off-by: Jonathan Swartz
---
 .github/workflows/fvdb.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/fvdb.yml b/.github/workflows/fvdb.yml
index e7f6912fb7..3f67b8e062 100644
--- a/.github/workflows/fvdb.yml
+++ b/.github/workflows/fvdb.yml
@@ -218,7 +218,7 @@ jobs:
         with:
           name: Python Benchmark with pytest-benchmark
           tool: 'pytest'
-          output-file-path: fvdb/fvdb/tests/benchmark/output.json
+          output-file-path: fvdb/tests/benchmark/output.json
           # Use personal access token instead of GITHUB_TOKEN due to https://github.community/t/github-action-not-triggering-gh-pages-upon-push/16096
           github-token: ${{ secrets.GITHUB_TOKEN }}
           auto-push: true

From f70b1a570fe8aa7f540bb219178d4a42466cf9dc Mon Sep 17 00:00:00 2001
From: Jonathan Swartz
Date: Mon, 16 Dec 2024 21:56:45 +1300
Subject: [PATCH 07/59] Turn off explicit instantiation for debug linux build

Signed-off-by: Jonathan Swartz
---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6d479e57a5..ed4b12d8fb 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -78,7 +78,7 @@ jobs:
         config:
           - { cxx: clang++, image: '2024', abi: '12', build: 'Release', cmake: '' }
           - { cxx: g++, image: '2024', abi: '12', build: 'Release', cmake: '' }
-          - { cxx: clang++, image: '2024', abi: '12', build: 'Debug', cmake: '' }
+          - { cxx: clang++, image: '2024', abi: '12', build: 'Debug', cmake: '-DUSE_EXPLICIT_INSTANTIATION=OFF' }
           - { cxx: clang++, image: '2023', abi: '11', build: 'Release', cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON' }
           - { cxx: g++, image: '2023', abi: '11', build: 'Release', cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON' }
         fail-fast: false

From a035c14b90430018849eb4e995f87763bcb40614 Mon Sep 17 00:00:00 2001
From: Jonathan Swartz
Date: Tue, 17 Dec 2024 09:04:33 +1300
Subject: [PATCH 08/59] Add fVDB code style check actions

Signed-off-by: Jonathan Swartz
---
 .github/workflows/fvdb_codestyle.yml | 77 ++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 .github/workflows/fvdb_codestyle.yml

diff --git a/.github/workflows/fvdb_codestyle.yml b/.github/workflows/fvdb_codestyle.yml
new file mode 100644
index 0000000000..081757fe88
---
/dev/null +++ b/.github/workflows/fvdb_codestyle.yml @@ -0,0 +1,77 @@ +name: fVDB Code Style +on: + pull_request: + branches: + - 'master' + - 'feature/**' + - 'pr/**' + paths-ignore: + - 'CHANGES' + - 'CODEOWNERS' + - 'doc/**' + - 'ci/**' + - 'openvdb/**' + - 'openvdb_cmd/**' + - 'openvdb_ax/**' + - 'openvdb_maya/**' + - 'openvdb_houdini/**' + - 'openvdb_wolfram/**' + - 'tsc/**' + - 'nanovdb/**' + - 'pendingchanges/**' + - '**.md' + - 'fvdb/docs/**' + - 'fvdb/env/**' + - 'fvdb/notebooks/**' + + + +# Allow subsequent pushes to the same PR or REF to cancel any previous jobs. +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + test-python-black-lint: + name: Check Python code style with black + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: psf/black@stable + with: + options: "--check --verbose --target-version=py311 --line-length=120" + src: "." + version: "~= 24.0" + + test-cpp-clang-format-lint: + name: Check C++ code style with clang-format + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: DoozyX/clang-format-lint-action@v0.18.2 + with: + source: 'src/' + extensions: 'h,cpp,cc,cu,ch' + clangFormatVersion: 18 + style: file + + include-guards: + name: Check include guards + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: swahtz/include-guards-check-action@master + with: + path: 'src/' + pattern: 'FVDB_{path}' + + check-spdx-identifiers: + name: Check SPDX identifiers + runs-on: ubuntu-latest + steps: + - name: checkout + uses: actions/checkout@v2 + - uses: enarx/spdx@master + with: + licenses: |- + Apache-2.0 \ No newline at end of file From 7053bfd0d283b0ca0c648b116d7aed6f83c4919d Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Tue, 17 Dec 2024 14:31:07 +1300 Subject: [PATCH 09/59] Fix codestyle source paths Signed-off-by: Jonathan Swartz --- .github/workflows/fvdb_codestyle.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/fvdb_codestyle.yml b/.github/workflows/fvdb_codestyle.yml index 081757fe88..5741ad6f52 100644 --- a/.github/workflows/fvdb_codestyle.yml +++ b/.github/workflows/fvdb_codestyle.yml @@ -40,7 +40,7 @@ jobs: - uses: psf/black@stable with: options: "--check --verbose --target-version=py311 --line-length=120" - src: "." + src: "fvdb/" version: "~= 24.0" test-cpp-clang-format-lint: @@ -50,7 +50,7 @@ jobs: - uses: actions/checkout@v4 - uses: DoozyX/clang-format-lint-action@v0.18.2 with: - source: 'src/' + source: 'fvdb/src/' extensions: 'h,cpp,cc,cu,ch' clangFormatVersion: 18 style: file @@ -62,7 +62,7 @@ jobs: - uses: actions/checkout@v4 - uses: swahtz/include-guards-check-action@master with: - path: 'src/' + path: 'fvdb/src/' pattern: 'FVDB_{path}' check-spdx-identifiers: @@ -70,7 +70,7 @@ jobs: runs-on: ubuntu-latest steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - uses: enarx/spdx@master with: licenses: |- From caa3969681d019f782916ce2a7c8d8ccf8509974 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Tue, 17 Dec 2024 15:21:47 +1300 Subject: [PATCH 10/59] Added spdx license identifiers to files missing them. 
Signed-off-by: Jonathan Swartz --- ci/build.sh | 2 ++ ci/build_sonar.sh | 2 ++ ci/download_houdini.sh | 2 ++ ci/download_vdb_caches.py | 2 ++ ci/extract_test_examples.sh | 2 ++ ci/install_blosc.sh | 2 ++ ci/install_cppunit.sh | 2 ++ ci/install_doxygen.sh | 2 ++ ci/install_glfw.sh | 2 ++ ci/install_gtest.sh | 2 ++ ci/install_llvm_windows.sh | 2 ++ ci/install_macos.sh | 2 ++ ci/install_nanobind.sh | 2 ++ ci/install_tbb_macos.sh | 2 ++ ci/test_install.sh | 2 ++ ci/test_sonar.sh | 2 ++ nanovdb/nanovdb/cmd/updateFiles.py | 2 ++ nanovdb/nanovdb/cmd/updateFiles.sh | 2 ++ nanovdb/nanovdb/docs/CMakeLists.txt | 2 ++ .../read_nanovdb_sphere_accessor.cc | 2 ++ nanovdb/nanovdb/python/CMakeLists.txt | 2 ++ nanovdb/nanovdb/python/NanoVDBModule.cc | 2 ++ nanovdb/nanovdb/python/PyCreateNanoGrid.cc | 2 ++ nanovdb/nanovdb/python/PyCreateNanoGrid.h | 2 ++ nanovdb/nanovdb/python/PyGridChecksum.cc | 2 ++ nanovdb/nanovdb/python/PyGridChecksum.h | 2 ++ nanovdb/nanovdb/python/PyGridHandle.cc | 2 ++ nanovdb/nanovdb/python/PyGridHandle.h | 2 ++ nanovdb/nanovdb/python/PyGridStats.cc | 2 ++ nanovdb/nanovdb/python/PyGridStats.h | 2 ++ nanovdb/nanovdb/python/PyGridValidator.cc | 2 ++ nanovdb/nanovdb/python/PyGridValidator.h | 2 ++ nanovdb/nanovdb/python/PyHostBuffer.cc | 2 ++ nanovdb/nanovdb/python/PyHostBuffer.h | 2 ++ nanovdb/nanovdb/python/PyIO.cc | 2 ++ nanovdb/nanovdb/python/PyIO.h | 2 ++ nanovdb/nanovdb/python/PyMath.cc | 2 ++ nanovdb/nanovdb/python/PyMath.h | 2 ++ nanovdb/nanovdb/python/PyNanoToOpenVDB.cc | 2 ++ nanovdb/nanovdb/python/PyNanoToOpenVDB.h | 2 ++ nanovdb/nanovdb/python/PyPrimitives.cc | 2 ++ nanovdb/nanovdb/python/PyPrimitives.h | 2 ++ nanovdb/nanovdb/python/PySampleFromVoxels.cc | 2 ++ nanovdb/nanovdb/python/PySampleFromVoxels.h | 2 ++ nanovdb/nanovdb/python/PyTools.cc | 2 ++ nanovdb/nanovdb/python/PyTools.h | 2 ++ nanovdb/nanovdb/python/__init__.py | 2 ++ nanovdb/nanovdb/python/cuda/PyDeviceBuffer.cc | 2 ++ nanovdb/nanovdb/python/cuda/PyDeviceBuffer.h | 2 ++ nanovdb/nanovdb/python/cuda/PyDeviceGridHandle.cu | 2 ++ nanovdb/nanovdb/python/cuda/PyPointsToGrid.cu | 2 ++ nanovdb/nanovdb/python/cuda/PyPointsToGrid.h | 2 ++ nanovdb/nanovdb/python/cuda/PySampleFromVoxels.cu | 2 ++ nanovdb/nanovdb/python/cuda/PySampleFromVoxels.h | 2 ++ nanovdb/nanovdb/python/cuda/PySignedFloodFill.cu | 2 ++ nanovdb/nanovdb/python/cuda/PySignedFloodFill.h | 2 ++ nanovdb/nanovdb/python/test/TestNanoVDB.py | 4 +++- openvdb/openvdb/python/__init__.py | 2 ++ openvdb/openvdb/python/pyTypeCasters.h | 2 ++ openvdb_cmd/vdb_tool/CMakeLists.txt | 2 ++ 60 files changed, 121 insertions(+), 1 deletion(-) diff --git a/ci/build.sh b/ci/build.sh index 110455c881..b0b3c8dfb1 100755 --- a/ci/build.sh +++ b/ci/build.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -e diff --git a/ci/build_sonar.sh b/ci/build_sonar.sh index 10012a5c43..1e720bc626 100755 --- a/ci/build_sonar.sh +++ b/ci/build_sonar.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -ex diff --git a/ci/download_houdini.sh b/ci/download_houdini.sh index 77aa8e3bd9..0f63a4eceb 100755 --- a/ci/download_houdini.sh +++ b/ci/download_houdini.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -ex diff --git a/ci/download_vdb_caches.py b/ci/download_vdb_caches.py index 8fa6dadb74..55edaaca74 100755 --- a/ci/download_vdb_caches.py +++ 
b/ci/download_vdb_caches.py @@ -1,4 +1,6 @@ #!/usr/bin/env python3 +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 import os import sys diff --git a/ci/extract_test_examples.sh b/ci/extract_test_examples.sh index cc892c12b7..5cdf559863 100755 --- a/ci/extract_test_examples.sh +++ b/ci/extract_test_examples.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 ################################################################################# # This script extracts all code blocks from AX documentation which are NOT # # marked as cpp/sh/unparsed and attempts to parse or compile them through the # diff --git a/ci/install_blosc.sh b/ci/install_blosc.sh index 0436fa875e..e40d233f5b 100755 --- a/ci/install_blosc.sh +++ b/ci/install_blosc.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -ex diff --git a/ci/install_cppunit.sh b/ci/install_cppunit.sh index 7f64238e8a..da7cc300b3 100755 --- a/ci/install_cppunit.sh +++ b/ci/install_cppunit.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -ex CURL_VERSION="$1" diff --git a/ci/install_doxygen.sh b/ci/install_doxygen.sh index 5c3ab3136d..d89ac38510 100755 --- a/ci/install_doxygen.sh +++ b/ci/install_doxygen.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -ex diff --git a/ci/install_glfw.sh b/ci/install_glfw.sh index 33108657f4..faeb243ae7 100755 --- a/ci/install_glfw.sh +++ b/ci/install_glfw.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -ex GLFW_VERSION="$1" diff --git a/ci/install_gtest.sh b/ci/install_gtest.sh index 71d7fbd7bb..9084b31cf4 100755 --- a/ci/install_gtest.sh +++ b/ci/install_gtest.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -ex diff --git a/ci/install_llvm_windows.sh b/ci/install_llvm_windows.sh index c174c50296..d8866520f0 100644 --- a/ci/install_llvm_windows.sh +++ b/ci/install_llvm_windows.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -ex diff --git a/ci/install_macos.sh b/ci/install_macos.sh index 80b71d4864..a581229035 100755 --- a/ci/install_macos.sh +++ b/ci/install_macos.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -x diff --git a/ci/install_nanobind.sh b/ci/install_nanobind.sh index 50b6766b81..8a885a6b4a 100755 --- a/ci/install_nanobind.sh +++ b/ci/install_nanobind.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -ex diff --git a/ci/install_tbb_macos.sh b/ci/install_tbb_macos.sh index 4b71fe9e93..cb8d8d9595 100755 --- a/ci/install_tbb_macos.sh +++ b/ci/install_tbb_macos.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -x diff --git a/ci/test_install.sh b/ci/test_install.sh index da4bc0f004..5ff44aecd4 100755 --- a/ci/test_install.sh +++ b/ci/test_install.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 
set -e # Various tests to test the FindOpenVDB CMake modules and diff --git a/ci/test_sonar.sh b/ci/test_sonar.sh index 15cc571d23..c8f72a0c86 100755 --- a/ci/test_sonar.sh +++ b/ci/test_sonar.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 set -ex diff --git a/nanovdb/nanovdb/cmd/updateFiles.py b/nanovdb/nanovdb/cmd/updateFiles.py index e4041c91f6..16fe6b8806 100644 --- a/nanovdb/nanovdb/cmd/updateFiles.py +++ b/nanovdb/nanovdb/cmd/updateFiles.py @@ -1,3 +1,5 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 import argparse import os from pathlib import Path diff --git a/nanovdb/nanovdb/cmd/updateFiles.sh b/nanovdb/nanovdb/cmd/updateFiles.sh index 87613c3a44..b75a65656f 100755 --- a/nanovdb/nanovdb/cmd/updateFiles.sh +++ b/nanovdb/nanovdb/cmd/updateFiles.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 #Usage process all files in this directory or optionally specify a target directory # Define directory in which to find files diff --git a/nanovdb/nanovdb/docs/CMakeLists.txt b/nanovdb/nanovdb/docs/CMakeLists.txt index 6a131e68e7..81910cbe3b 100644 --- a/nanovdb/nanovdb/docs/CMakeLists.txt +++ b/nanovdb/nanovdb/docs/CMakeLists.txt @@ -1,3 +1,5 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 # find_package(doxygen REQUIRED dot ) if(WIN32) set(DOXYGEN_EXECUTABLE "C:/Program Files/doxygen/bin/doxygen.exe") diff --git a/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere_accessor/read_nanovdb_sphere_accessor.cc b/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere_accessor/read_nanovdb_sphere_accessor.cc index 91010b6cf7..dc763ddc8a 100644 --- a/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere_accessor/read_nanovdb_sphere_accessor.cc +++ b/nanovdb/nanovdb/examples/ex_read_nanovdb_sphere_accessor/read_nanovdb_sphere_accessor.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include // this is required to read (and write) NanoVDB files on the host /// @brief Read a NanoVDB grid from a file and print out multiple values. diff --git a/nanovdb/nanovdb/python/CMakeLists.txt b/nanovdb/nanovdb/python/CMakeLists.txt index d5c50792ee..f13cf10116 100644 --- a/nanovdb/nanovdb/python/CMakeLists.txt +++ b/nanovdb/nanovdb/python/CMakeLists.txt @@ -1,3 +1,5 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 option(NANOVDB_BUILD_PYTHON_UNITTESTS [=[ "Include the NanoVDB Python unit test. 
Requires a python interpreter]=] ${NANOVDB_BUILD_UNITTESTS}) diff --git a/nanovdb/nanovdb/python/NanoVDBModule.cc b/nanovdb/nanovdb/python/NanoVDBModule.cc index 583b5464f9..89af0ec095 100644 --- a/nanovdb/nanovdb/python/NanoVDBModule.cc +++ b/nanovdb/nanovdb/python/NanoVDBModule.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include #include #include diff --git a/nanovdb/nanovdb/python/PyCreateNanoGrid.cc b/nanovdb/nanovdb/python/PyCreateNanoGrid.cc index c776865306..eed931766c 100644 --- a/nanovdb/nanovdb/python/PyCreateNanoGrid.cc +++ b/nanovdb/nanovdb/python/PyCreateNanoGrid.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PyCreateNanoGrid.h" #include diff --git a/nanovdb/nanovdb/python/PyCreateNanoGrid.h b/nanovdb/nanovdb/python/PyCreateNanoGrid.h index 863ca1b396..3402ca67b8 100644 --- a/nanovdb/nanovdb/python/PyCreateNanoGrid.h +++ b/nanovdb/nanovdb/python/PyCreateNanoGrid.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_PYCREATENANOGRID_HAS_BEEN_INCLUDED #define NANOVDB_PYCREATENANOGRID_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/PyGridChecksum.cc b/nanovdb/nanovdb/python/PyGridChecksum.cc index ba6e709b10..281fd05c0b 100644 --- a/nanovdb/nanovdb/python/PyGridChecksum.cc +++ b/nanovdb/nanovdb/python/PyGridChecksum.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PyGridChecksum.h" #include diff --git a/nanovdb/nanovdb/python/PyGridChecksum.h b/nanovdb/nanovdb/python/PyGridChecksum.h index f8c2048b83..dd988f871c 100644 --- a/nanovdb/nanovdb/python/PyGridChecksum.h +++ b/nanovdb/nanovdb/python/PyGridChecksum.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_PYGRIDCHECKSUM_HAS_BEEN_INCLUDED #define NANOVDB_PYGRIDCHECKSUM_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/PyGridHandle.cc b/nanovdb/nanovdb/python/PyGridHandle.cc index 4f4e8ff72d..efd4253337 100644 --- a/nanovdb/nanovdb/python/PyGridHandle.cc +++ b/nanovdb/nanovdb/python/PyGridHandle.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PyGridHandle.h" #include diff --git a/nanovdb/nanovdb/python/PyGridHandle.h b/nanovdb/nanovdb/python/PyGridHandle.h index a15c588c83..190c6ab15f 100644 --- a/nanovdb/nanovdb/python/PyGridHandle.h +++ b/nanovdb/nanovdb/python/PyGridHandle.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_PYGRIDHANDLE_HAS_BEEN_INCLUDED #define NANOVDB_PYGRIDHANDLE_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/PyGridStats.cc b/nanovdb/nanovdb/python/PyGridStats.cc index 7f8598a7ac..fbd8caec15 100644 --- a/nanovdb/nanovdb/python/PyGridStats.cc +++ b/nanovdb/nanovdb/python/PyGridStats.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PyGridStats.h" #include diff --git a/nanovdb/nanovdb/python/PyGridStats.h b/nanovdb/nanovdb/python/PyGridStats.h index c53af4567d..90254bc23b 100644 --- a/nanovdb/nanovdb/python/PyGridStats.h +++ b/nanovdb/nanovdb/python/PyGridStats.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_PYGRIDSTATS_HAS_BEEN_INCLUDED #define 
NANOVDB_PYGRIDSTATS_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/PyGridValidator.cc b/nanovdb/nanovdb/python/PyGridValidator.cc index 278588b466..8e4f20df64 100644 --- a/nanovdb/nanovdb/python/PyGridValidator.cc +++ b/nanovdb/nanovdb/python/PyGridValidator.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PyGridValidator.h" #include diff --git a/nanovdb/nanovdb/python/PyGridValidator.h b/nanovdb/nanovdb/python/PyGridValidator.h index 6725f88b4d..659dede241 100644 --- a/nanovdb/nanovdb/python/PyGridValidator.h +++ b/nanovdb/nanovdb/python/PyGridValidator.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_PYGRIDVALIDATOR_HAS_BEEN_INCLUDED #define NANOVDB_PYGRIDVALIDATOR_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/PyHostBuffer.cc b/nanovdb/nanovdb/python/PyHostBuffer.cc index 4af48c2040..e8fdfb3946 100644 --- a/nanovdb/nanovdb/python/PyHostBuffer.cc +++ b/nanovdb/nanovdb/python/PyHostBuffer.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PyHostBuffer.h" #include diff --git a/nanovdb/nanovdb/python/PyHostBuffer.h b/nanovdb/nanovdb/python/PyHostBuffer.h index f1bf704edc..29b5a917ce 100644 --- a/nanovdb/nanovdb/python/PyHostBuffer.h +++ b/nanovdb/nanovdb/python/PyHostBuffer.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_PYHOSTBUFFER_HAS_BEEN_INCLUDED #define NANOVDB_PYHOSTBUFFER_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/PyIO.cc b/nanovdb/nanovdb/python/PyIO.cc index d77cb40f86..b46a52ba74 100644 --- a/nanovdb/nanovdb/python/PyIO.cc +++ b/nanovdb/nanovdb/python/PyIO.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PyIO.h" #include diff --git a/nanovdb/nanovdb/python/PyIO.h b/nanovdb/nanovdb/python/PyIO.h index 907ef0f7f9..de857a5148 100644 --- a/nanovdb/nanovdb/python/PyIO.h +++ b/nanovdb/nanovdb/python/PyIO.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_PYIO_HAS_BEEN_INCLUDED #define NANOVDB_PYIO_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/PyMath.cc b/nanovdb/nanovdb/python/PyMath.cc index 06a4ad5606..337865ed8c 100644 --- a/nanovdb/nanovdb/python/PyMath.cc +++ b/nanovdb/nanovdb/python/PyMath.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PyMath.h" #include diff --git a/nanovdb/nanovdb/python/PyMath.h b/nanovdb/nanovdb/python/PyMath.h index e9a2cde791..13c288e110 100644 --- a/nanovdb/nanovdb/python/PyMath.h +++ b/nanovdb/nanovdb/python/PyMath.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_PYMATH_HAS_BEEN_INCLUDED #define NANOVDB_PYMATH_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/PyNanoToOpenVDB.cc b/nanovdb/nanovdb/python/PyNanoToOpenVDB.cc index 8225cd78ca..49dc464a05 100644 --- a/nanovdb/nanovdb/python/PyNanoToOpenVDB.cc +++ b/nanovdb/nanovdb/python/PyNanoToOpenVDB.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PyNanoToOpenVDB.h" #include diff --git a/nanovdb/nanovdb/python/PyNanoToOpenVDB.h b/nanovdb/nanovdb/python/PyNanoToOpenVDB.h index ac60fdbf96..54c38501e5 100644 --- 
a/nanovdb/nanovdb/python/PyNanoToOpenVDB.h +++ b/nanovdb/nanovdb/python/PyNanoToOpenVDB.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_PYNANOTOOPENVDB_HAS_BEEN_INCLUDED #define NANOVDB_PYNANOTOOPENVDB_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/PyPrimitives.cc b/nanovdb/nanovdb/python/PyPrimitives.cc index 3b7f11c0d9..29053d4e68 100644 --- a/nanovdb/nanovdb/python/PyPrimitives.cc +++ b/nanovdb/nanovdb/python/PyPrimitives.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PyPrimitives.h" #include diff --git a/nanovdb/nanovdb/python/PyPrimitives.h b/nanovdb/nanovdb/python/PyPrimitives.h index 8be1e0208c..a930ee5d8d 100644 --- a/nanovdb/nanovdb/python/PyPrimitives.h +++ b/nanovdb/nanovdb/python/PyPrimitives.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_PYPRIMITIVES_HAS_BEEN_INCLUDED #define NANOVDB_PYPRIMITIVES_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/PySampleFromVoxels.cc b/nanovdb/nanovdb/python/PySampleFromVoxels.cc index 2a3b7e5b6e..82dc9a26d1 100644 --- a/nanovdb/nanovdb/python/PySampleFromVoxels.cc +++ b/nanovdb/nanovdb/python/PySampleFromVoxels.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PySampleFromVoxels.h" #include diff --git a/nanovdb/nanovdb/python/PySampleFromVoxels.h b/nanovdb/nanovdb/python/PySampleFromVoxels.h index 140db2f3be..6c5733f714 100644 --- a/nanovdb/nanovdb/python/PySampleFromVoxels.h +++ b/nanovdb/nanovdb/python/PySampleFromVoxels.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_PYSAMPLEFROMVOXELS_HAS_BEEN_INCLUDED #define NANOVDB_PYSAMPLEFROMVOXELS_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/PyTools.cc b/nanovdb/nanovdb/python/PyTools.cc index de74380987..4df996d557 100644 --- a/nanovdb/nanovdb/python/PyTools.cc +++ b/nanovdb/nanovdb/python/PyTools.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PyTools.h" #include diff --git a/nanovdb/nanovdb/python/PyTools.h b/nanovdb/nanovdb/python/PyTools.h index be6e49ff53..bfb9d42ef8 100644 --- a/nanovdb/nanovdb/python/PyTools.h +++ b/nanovdb/nanovdb/python/PyTools.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_PYTOOLS_HAS_BEEN_INCLUDED #define NANOVDB_PYTOOLS_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/__init__.py b/nanovdb/nanovdb/python/__init__.py index 7bce6a9559..4f955e4a02 100644 --- a/nanovdb/nanovdb/python/__init__.py +++ b/nanovdb/nanovdb/python/__init__.py @@ -1,3 +1,5 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 import sys if sys.platform == "win32": import os diff --git a/nanovdb/nanovdb/python/cuda/PyDeviceBuffer.cc b/nanovdb/nanovdb/python/cuda/PyDeviceBuffer.cc index 58a6d984f3..523e4b5834 100644 --- a/nanovdb/nanovdb/python/cuda/PyDeviceBuffer.cc +++ b/nanovdb/nanovdb/python/cuda/PyDeviceBuffer.cc @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifdef NANOVDB_USE_CUDA #include "PyDeviceBuffer.h" diff --git a/nanovdb/nanovdb/python/cuda/PyDeviceBuffer.h b/nanovdb/nanovdb/python/cuda/PyDeviceBuffer.h index c65239cb69..87a081f638 100644 --- 
a/nanovdb/nanovdb/python/cuda/PyDeviceBuffer.h +++ b/nanovdb/nanovdb/python/cuda/PyDeviceBuffer.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_CUDA_PYDEVICEBUFFER_HAS_BEEN_INCLUDED #define NANOVDB_CUDA_PYDEVICEBUFFER_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/cuda/PyDeviceGridHandle.cu b/nanovdb/nanovdb/python/cuda/PyDeviceGridHandle.cu index a8dc924d7f..983caebd05 100644 --- a/nanovdb/nanovdb/python/cuda/PyDeviceGridHandle.cu +++ b/nanovdb/nanovdb/python/cuda/PyDeviceGridHandle.cu @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifdef NANOVDB_USE_CUDA #include "../PyGridHandle.h" diff --git a/nanovdb/nanovdb/python/cuda/PyPointsToGrid.cu b/nanovdb/nanovdb/python/cuda/PyPointsToGrid.cu index 82a5826fc4..933f5570b6 100644 --- a/nanovdb/nanovdb/python/cuda/PyPointsToGrid.cu +++ b/nanovdb/nanovdb/python/cuda/PyPointsToGrid.cu @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PyPointsToGrid.h" #include diff --git a/nanovdb/nanovdb/python/cuda/PyPointsToGrid.h b/nanovdb/nanovdb/python/cuda/PyPointsToGrid.h index 4bb530ac22..4af07f5657 100644 --- a/nanovdb/nanovdb/python/cuda/PyPointsToGrid.h +++ b/nanovdb/nanovdb/python/cuda/PyPointsToGrid.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_CUDA_PYPOINTSTOGRID_HAS_BEEN_INCLUDED #define NANOVDB_CUDA_PYPOINTSTOGRID_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/cuda/PySampleFromVoxels.cu b/nanovdb/nanovdb/python/cuda/PySampleFromVoxels.cu index 8bab06e51e..aaebf66772 100644 --- a/nanovdb/nanovdb/python/cuda/PySampleFromVoxels.cu +++ b/nanovdb/nanovdb/python/cuda/PySampleFromVoxels.cu @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PySampleFromVoxels.h" #include diff --git a/nanovdb/nanovdb/python/cuda/PySampleFromVoxels.h b/nanovdb/nanovdb/python/cuda/PySampleFromVoxels.h index c23a97e849..22cc4aec4d 100644 --- a/nanovdb/nanovdb/python/cuda/PySampleFromVoxels.h +++ b/nanovdb/nanovdb/python/cuda/PySampleFromVoxels.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_CUDA_PYSAMPLEFROMVOXELS_HAS_BEEN_INCLUDED #define NANOVDB_CUDA_PYSAMPLEFROMVOXELS_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/cuda/PySignedFloodFill.cu b/nanovdb/nanovdb/python/cuda/PySignedFloodFill.cu index 2b68ca38df..ddbd5c67cb 100644 --- a/nanovdb/nanovdb/python/cuda/PySignedFloodFill.cu +++ b/nanovdb/nanovdb/python/cuda/PySignedFloodFill.cu @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #include "PySignedFloodFill.h" #include diff --git a/nanovdb/nanovdb/python/cuda/PySignedFloodFill.h b/nanovdb/nanovdb/python/cuda/PySignedFloodFill.h index 7ae16defe7..d5f19bcbe2 100644 --- a/nanovdb/nanovdb/python/cuda/PySignedFloodFill.h +++ b/nanovdb/nanovdb/python/cuda/PySignedFloodFill.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef NANOVDB_CUDA_PYSIGNEDFLOODFILL_HAS_BEEN_INCLUDED #define NANOVDB_CUDA_PYSIGNEDFLOODFILL_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/python/test/TestNanoVDB.py b/nanovdb/nanovdb/python/test/TestNanoVDB.py index fca793295e..03660c6342 100644 --- a/nanovdb/nanovdb/python/test/TestNanoVDB.py +++ 
b/nanovdb/nanovdb/python/test/TestNanoVDB.py @@ -1,4 +1,6 @@ -# /usr/bin/env python +#!/usr/bin/env python +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 import nanovdb import unittest diff --git a/openvdb/openvdb/python/__init__.py b/openvdb/openvdb/python/__init__.py index 1b25ddfcc5..39242a9737 100644 --- a/openvdb/openvdb/python/__init__.py +++ b/openvdb/openvdb/python/__init__.py @@ -1 +1,3 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 from .lib.openvdb import * diff --git a/openvdb/openvdb/python/pyTypeCasters.h b/openvdb/openvdb/python/pyTypeCasters.h index c9faf3f0bd..56159666bd 100644 --- a/openvdb/openvdb/python/pyTypeCasters.h +++ b/openvdb/openvdb/python/pyTypeCasters.h @@ -1,3 +1,5 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 #ifndef OPENVDB_PYTYPECASTERS_HAS_BEEN_INCLUDED #define OPENVDB_PYTYPECASTERS_HAS_BEEN_INCLUDED diff --git a/openvdb_cmd/vdb_tool/CMakeLists.txt b/openvdb_cmd/vdb_tool/CMakeLists.txt index c0a2d64277..8101a78816 100644 --- a/openvdb_cmd/vdb_tool/CMakeLists.txt +++ b/openvdb_cmd/vdb_tool/CMakeLists.txt @@ -1,3 +1,5 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 cmake_minimum_required(VERSION 3.20) # CMP0091 allows for MSVC ABI targetting via CMAKE_MSVC_RUNTIME_LIBRARY From 1d9c7345b6c443df76dc37aa518f70650f9193af Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Tue, 17 Dec 2024 17:56:40 +1300 Subject: [PATCH 11/59] Addressing review notes Signed-off-by: Jonathan Swartz --- .github/workflows/fvdb_codestyle.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/fvdb_codestyle.yml b/.github/workflows/fvdb_codestyle.yml index 5741ad6f52..90983770d4 100644 --- a/.github/workflows/fvdb_codestyle.yml +++ b/.github/workflows/fvdb_codestyle.yml @@ -51,7 +51,7 @@ jobs: - uses: DoozyX/clang-format-lint-action@v0.18.2 with: source: 'fvdb/src/' - extensions: 'h,cpp,cc,cu,ch' + extensions: 'h,cpp,cc,cu,cuh' clangFormatVersion: 18 style: file @@ -74,4 +74,4 @@ jobs: - uses: enarx/spdx@master with: licenses: |- - Apache-2.0 \ No newline at end of file + Apache-2.0 From b78b48204e89a54b36f88fd3802950716eef42c0 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Wed, 18 Dec 2024 13:08:04 +1300 Subject: [PATCH 12/59] Fix typo in SPDX license identifier Signed-off-by: Jonathan Swartz --- openvdb_ax/openvdb_ax/compiler/AttributeBindings.h | 2 +- openvdb_ax/openvdb_ax/compiler/AttributeRegistry.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/openvdb_ax/openvdb_ax/compiler/AttributeBindings.h b/openvdb_ax/openvdb_ax/compiler/AttributeBindings.h index ebd7bba23b..dd295b2501 100644 --- a/openvdb_ax/openvdb_ax/compiler/AttributeBindings.h +++ b/openvdb_ax/openvdb_ax/compiler/AttributeBindings.h @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: Apache-2.0/ +// SPDX-License-Identifier: Apache-2.0 /// @file compiler/AttributeBindings.h /// diff --git a/openvdb_ax/openvdb_ax/compiler/AttributeRegistry.h b/openvdb_ax/openvdb_ax/compiler/AttributeRegistry.h index 6f40790d2d..1f03655f54 100644 --- a/openvdb_ax/openvdb_ax/compiler/AttributeRegistry.h +++ b/openvdb_ax/openvdb_ax/compiler/AttributeRegistry.h @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: Apache-2.0/ +// SPDX-License-Identifier: Apache-2.0 /// @file compiler/AttributeRegistry.h /// From 
5ac52485b1b7239af6c862a3950252543bf6d73b Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Wed, 18 Dec 2024 14:09:10 +1300 Subject: [PATCH 13/59] Disabling docs action for fvdb changes Signed-off-by: Jonathan Swartz --- .github/workflows/docs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index bdd4b3e2c1..eb0be11615 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -12,6 +12,7 @@ on: - 'openvdb_maya/**' - 'pendingchanges/**' - '**.md' + - 'fvdb/**' pull_request: branches: - '**' @@ -20,6 +21,7 @@ on: - 'openvdb_maya/**' - 'pendingchanges/**' - '**.md' + - 'fvdb/**' workflow_dispatch: inputs: deploy: From 29606d6a74fa958f45912777fe20c25289d5cd6e Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Wed, 18 Dec 2024 16:48:33 +1300 Subject: [PATCH 14/59] Update SPDX identifier action with ignore-paths for source files/dirs that have different licenses Signed-off-by: Jonathan Swartz --- .github/workflows/fvdb_codestyle.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/fvdb_codestyle.yml b/.github/workflows/fvdb_codestyle.yml index 90983770d4..d6ccd5c31c 100644 --- a/.github/workflows/fvdb_codestyle.yml +++ b/.github/workflows/fvdb_codestyle.yml @@ -71,7 +71,12 @@ jobs: steps: - name: checkout uses: actions/checkout@v4 - - uses: enarx/spdx@master + - uses: swahtz/spdx@feature/ignore_paths with: licenses: |- Apache-2.0 + ignore-paths: |- + openvdb/openvdb/math/Half.cc + openvdb/openvdb/math/Half.h + openvdb_wolfram/OpenVDBLink + openvdb_ax/openvdb_ax/grammar/generated \ No newline at end of file From 6ccc92f0c6d044516e890aa19b6b7d29f393eac1 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Thu, 19 Dec 2024 09:41:24 +1300 Subject: [PATCH 15/59] newline Signed-off-by: Jonathan Swartz --- .github/workflows/fvdb_codestyle.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fvdb_codestyle.yml b/.github/workflows/fvdb_codestyle.yml index d6ccd5c31c..c3cb436dd4 100644 --- a/.github/workflows/fvdb_codestyle.yml +++ b/.github/workflows/fvdb_codestyle.yml @@ -79,4 +79,4 @@ jobs: openvdb/openvdb/math/Half.cc openvdb/openvdb/math/Half.h openvdb_wolfram/OpenVDBLink - openvdb_ax/openvdb_ax/grammar/generated \ No newline at end of file + openvdb_ax/openvdb_ax/grammar/generated From 200cf246ee2a3e31d4330fc61d9cb1ae7ed4b151 Mon Sep 17 00:00:00 2001 From: Francis Williams Date: Thu, 19 Dec 2024 16:40:46 -0500 Subject: [PATCH 16/59] Enhance JCat0 kernel to handle empty JIdx tensor case and add unit test for concatenation of JaggedTensors --- fvdb/src/detail/ops/JCat0.cu | 17 ++++++++++------ fvdb/tests/unit/test_jagged_tensor.py | 29 +++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/fvdb/src/detail/ops/JCat0.cu b/fvdb/src/detail/ops/JCat0.cu index c9dd2e6995..39c494e7e5 100644 --- a/fvdb/src/detail/ops/JCat0.cu +++ b/fvdb/src/detail/ops/JCat0.cu @@ -2,11 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Ops.h" - #include #include #include + #include #include @@ -40,20 +40,24 @@ template __global__ void computeIndexPutArg( const size_t jti, const JOffsetsType *__restrict__ const *__restrict__ offsets, - const size_t numOffsets, + const size_t numOffsets, const size_t numElements, const TorchRAcc32 inJIdxI, // Jidx of the i^th input tensor const TorchRAcc32 inJoffsetsI, // JOffsets of the i^th input tensor const TorchRAcc32 outJOffsets, // Output JOffsets (already computed earlier) 
TorchRAcc32 outSelIdx,     // Output selection indices
-    TorchRAcc32 outJIdx) {     // Output Jidx
-    int32_t       idx         = blockIdx.x * blockDim.x + threadIdx.x;
-    const int64_t numElements = inJIdxI.size(0);
+    TorchRAcc32 outJIdx // Output Jidx
+) {
+    int32_t idx = blockIdx.x * blockDim.x + threadIdx.x;

     if (idx >= numElements) {
         return;
     }

-    const JIdxType jidx = inJIdxI[idx]; // Which tensor this element belongs to
+    // When you have a JaggedTensor that has only one tensor in it, the JIdx tensor is empty to
+    // save memory (it's effectively all zeros anyway). We need to handle this case, which is
+    // what this flag is doing.
+    const bool     emptyJidx = inJIdxI.size(0) == 0 && numElements != 0;
+    const JIdxType jidx = emptyJidx ? 0 : inJIdxI[idx]; // Which tensor this element belongs to

     // Where in the output tensor we're going to write to
     JOffsetsType tensorWriteOffset = 0;
@@ -146,6 +150,7 @@ dispatchJCat0(const std::vector &vec) {
             GET_BLOCKS(numElements, numThreadsComputeIndexPutArg);
         computeIndexPutArg<<>>(
             jti, thrust::raw_pointer_cast(offsets_d.data()), offsets_d.size(),
+            jt.jdata().size(0),
             jt.jidx().packed_accessor32(),
             jt.joffsets().packed_accessor32(),
             outJOffsets.packed_accessor32(),
diff --git a/fvdb/tests/unit/test_jagged_tensor.py b/fvdb/tests/unit/test_jagged_tensor.py
index 6972b6a02a..f99053fb03 100644
--- a/fvdb/tests/unit/test_jagged_tensor.py
+++ b/fvdb/tests/unit/test_jagged_tensor.py
@@ -85,6 +85,35 @@ def check_lshape(self, jt: fvdb.JaggedTensor, lt: List[torch.Tensor] | List[List
         else:
             assert False, "jagged tensor ldim should be 1 or 2"

+    @parameterized.expand(all_device_dtype_combos)
+    def test_jcat_along_dim_0_with_one_tensor(self, device, dtype):
+        batch_size = 1
+
+        # Make a point cloud with a random number of points
+        def get_pc(num_pc_list: list):
+            pc_list = []
+            for num_pc in num_pc_list:
+                pc_list.append(torch.rand((num_pc, 3)).to(device))
+            return pc_list
+
+        num_pc_list = torch.randint(low=50, high=1000, size=(batch_size,), device=device).cpu().tolist()
+
+        pc1_tensor_list = get_pc(num_pc_list)
+        pc2_tensor_list = get_pc(num_pc_list)
+
+        pc1_jagged = fvdb.JaggedTensor(pc1_tensor_list)
+        pc2_jagged = fvdb.JaggedTensor(pc2_tensor_list)
+
+        cat_dim = 0
+        concat_tensor_list = [
+            torch.cat([pc1_tensor_list[i], pc2_tensor_list[i]], dim=cat_dim) for i in range(batch_size)
+        ]
+
+        jagged_from_concat_list = fvdb.JaggedTensor(concat_tensor_list)
+        jcat_result = fvdb.jcat([pc1_jagged, pc2_jagged], dim=cat_dim)
+
+        self.assertTrue(torch.equal(jagged_from_concat_list.jdata, jcat_result.jdata))
+
     @parameterized.expand(all_device_dtype_combos)
     def test_pickle(self, device, dtype):
         jt, _ = self.mklol(7, 4, 8, device, dtype)

From ce544bd2ffd8daab16ba6d92064368d268daef44 Mon Sep 17 00:00:00 2001
From: Jonathan Swartz
Date: Sat, 21 Dec 2024 11:26:13 +1300
Subject: [PATCH 17/59] fix clang-format issue with JCat0.cu

Signed-off-by: Jonathan Swartz
---
 fvdb/src/detail/ops/JCat0.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fvdb/src/detail/ops/JCat0.cu b/fvdb/src/detail/ops/JCat0.cu
index 39c494e7e5..5e1b58b05b 100644
--- a/fvdb/src/detail/ops/JCat0.cu
+++ b/fvdb/src/detail/ops/JCat0.cu
@@ -1,9 +1,9 @@
 // Copyright Contributors to the OpenVDB Project
 // SPDX-License-Identifier: Apache-2.0
 //
-#include "Ops.h"
 #include 
 #include 
+#include "Ops.h"

 #include 

@@ -227,4 +227,4 @@ dispatchJCat0(const std::vector &vec) {

 } // namespace ops
 } // namespace detail
-} // namespace fvdb
\ No newline at end of file
+} // namespace fvdb
From faa644ab1
Mon Sep 17 00:00:00 2001 From: Matthew Cong Date: Wed, 18 Dec 2024 17:36:33 -0800 Subject: [PATCH 18/59] Merge changes from latest NanoVDB Signed-off-by: Matthew Cong --- .github/workflows/build.yml | 2 +- .github/workflows/nanovdb.yml | 2 + nanovdb/nanovdb/CMakeLists.txt | 4 + nanovdb/nanovdb/GridHandle.h | 55 +- nanovdb/nanovdb/NanoVDB.h | 34 ++ nanovdb/nanovdb/NodeManager.h | 2 +- nanovdb/nanovdb/cuda/DeviceBuffer.h | 337 +++++++---- nanovdb/nanovdb/cuda/DeviceStreamMap.h | 120 ++++ nanovdb/nanovdb/cuda/GridHandle.cuh | 8 +- nanovdb/nanovdb/cuda/NodeManager.cuh | 6 +- nanovdb/nanovdb/cuda/UnifiedBuffer.h | 327 +++++++++++ nanovdb/nanovdb/examples/CMakeLists.txt | 5 + .../examples/ex_collide_level_set/main.cc | 6 +- .../make_custom_nanovdb_cuda.cc | 1 + .../examples/ex_make_mgpu_nanovdb/main.cu | 306 ++++++++++ .../modify_nanovdb_thrust.cc | 6 +- .../examples/ex_raytrace_fog_volume/main.cc | 3 +- .../examples/ex_raytrace_level_set/main.cc | 3 +- nanovdb/nanovdb/io/IO.h | 29 +- nanovdb/nanovdb/math/Math.h | 8 +- nanovdb/nanovdb/python/PyGridHandle.h | 2 +- nanovdb/nanovdb/tools/GridStats.h | 1 + nanovdb/nanovdb/tools/cuda/IndexToGrid.cuh | 4 +- nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh | 201 ++++--- nanovdb/nanovdb/unittest/TestNanoVDB.cc | 18 +- nanovdb/nanovdb/unittest/TestNanoVDB.cu | 536 +++++++++++++++++- nanovdb/nanovdb/unittest/TestOpenVDB.cc | 25 +- nanovdb/nanovdb/util/Timer.h | 28 +- nanovdb/nanovdb/util/cuda/Timer.h | 73 ++- nanovdb/nanovdb/util/cuda/Util.h | 39 +- pendingchanges/nanovdb.txt | 8 + 31 files changed, 1924 insertions(+), 275 deletions(-) create mode 100644 nanovdb/nanovdb/cuda/DeviceStreamMap.h create mode 100644 nanovdb/nanovdb/cuda/UnifiedBuffer.h create mode 100644 nanovdb/nanovdb/examples/ex_make_mgpu_nanovdb/main.cu create mode 100644 pendingchanges/nanovdb.txt diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6d479e57a5..ed4b12d8fb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -78,7 +78,7 @@ jobs: config: - { cxx: clang++, image: '2024', abi: '12', build: 'Release', cmake: '' } - { cxx: g++, image: '2024', abi: '12', build: 'Release', cmake: '' } - - { cxx: clang++, image: '2024', abi: '12', build: 'Debug', cmake: '' } + - { cxx: clang++, image: '2024', abi: '12', build: 'Debug', cmake: '-DUSE_EXPLICIT_INSTANTIATION=OFF' } - { cxx: clang++, image: '2023', abi: '11', build: 'Release', cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON' } - { cxx: g++, image: '2023', abi: '11', build: 'Release', cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON' } fail-fast: false diff --git a/.github/workflows/nanovdb.yml b/.github/workflows/nanovdb.yml index 0efc4580a8..fdaa3d3a36 100644 --- a/.github/workflows/nanovdb.yml +++ b/.github/workflows/nanovdb.yml @@ -83,6 +83,7 @@ jobs: --cargs=\' -DUSE_EXPLICIT_INSTANTIATION=OFF -DNANOVDB_USE_CUDA=ON + -DCMAKE_CUDA_ARCHITECTURES="80" -DNANOVDB_USE_OPENVDB=ON -DCMAKE_INSTALL_PREFIX=`pwd` -DUSE_BLOSC=OFF @@ -127,6 +128,7 @@ jobs: -DMSVC_COMPRESS_PDB=ON -DUSE_EXPLICIT_INSTANTIATION=OFF -DNANOVDB_USE_CUDA=ON + -DCMAKE_CUDA_ARCHITECTURES="80" -DNANOVDB_USE_OPENVDB=ON -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET} -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\" diff --git a/nanovdb/nanovdb/CMakeLists.txt b/nanovdb/nanovdb/CMakeLists.txt index 4e0284ecbf..45e1e13dc6 100644 --- a/nanovdb/nanovdb/CMakeLists.txt +++ b/nanovdb/nanovdb/CMakeLists.txt @@ -113,6 +113,8 @@ if(NANOVDB_USE_CUDA) get_target_property(VDB_MSVC_RUNTIME_SELECTION openvdb 
MSVC_RUNTIME_LIBRARY) endif() endif() + + find_package(CUDAToolkit) endif() if(NANOVDB_USE_OPENVDB) @@ -169,8 +171,10 @@ set(NANOVDB_INCLUDE_FILES # NanoVDB cuda header files set(NANOVDB_INCLUDE_CUDA_FILES cuda/DeviceBuffer.h + cuda/DeviceStreamMap.h cuda/GridHandle.cuh cuda/NodeManager.cuh + cuda/UnifiedBuffer.h ) # NanoVDB io header files diff --git a/nanovdb/nanovdb/GridHandle.h b/nanovdb/nanovdb/GridHandle.h index 05e492046b..d6c4521f70 100644 --- a/nanovdb/nanovdb/GridHandle.h +++ b/nanovdb/nanovdb/GridHandle.h @@ -45,13 +45,14 @@ class GridHandle public: using BufferType = BufferT; - /// @brief Move constructor from a host buffer + /// @brief Move constructor from a dual host-device buffer /// @param buffer buffer containing one or more NanoGrids that will be moved into this GridHandle /// @throw Will throw and error with the buffer does not contain a valid NanoGrid! + /// @note The implementation of this template specialization is in nanovdb/cuda/GridHandle.cuh since it requires CUDA template::hasDeviceDual, int>::type = 0> GridHandle(T&& buffer); - /// @brief Move constructor from a dual host-device buffer + /// @brief Move constructor from a host buffer /// @param buffer buffer containing one or more NanoGrids that will be moved into this GridHandle /// @throw Will throw and error with the buffer does not contain a valid NanoGrid! template::hasDeviceDual, int>::type = 0> @@ -112,17 +113,23 @@ class GridHandle template typename util::enable_if::hasDeviceDual, void*>::type deviceData() { return mBuffer.deviceData(); } + template + typename util::enable_if::hasDeviceDual, void*>::type + deviceData(int device) { return mBuffer.deviceData(device); } + //@{ /// @brief Returns the size in bytes of the raw memory buffer managed by this GridHandle. - uint64_t size() const { return mBuffer.size(); } + [[deprecated("Use GridHandle::bufferSize instead.")]] uint64_t size() const { return mBuffer.size(); } + uint64_t bufferSize() const { return mBuffer.size(); } + //@} //@{ /// @brief Return true if this handle is empty, i.e. has no allocated memory - bool empty() const { return this->size() == 0; } - bool isEmpty() const { return this->size() == 0; } + bool empty() const { return mBuffer.size() == 0; } + bool isEmpty() const { return mBuffer.size() == 0; } //@} - /// @brief Return true if this handle contains any grids + /// @brief Return true if this handle is not empty, i.e. contains at least one grid operator bool() const { return !this->empty(); } /// @brief Returns a const host pointer to the @a n'th NanoVDB grid encoded in this GridHandle. @@ -152,7 +159,7 @@ class GridHandle /// @brief Return a const pointer to the @a n'th grid encoded in this GridHandle on the device, e.g. GPU /// @tparam ValueT Value type of the grid point to be returned - /// @param n Index if of the grid pointer to be returned + /// @param n Index of the grid pointer to be returned /// @param verbose if non-zero error messages will be printed in case something failed /// @warning Note that the return pointer can be NULL if the GridHandle was not initialized, @a n is invalid, /// or if the template parameter does not match the specified grid. 
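Taken together with the per-device deviceUpload/deviceDownload overloads introduced in the next hunk, the accessors above imply a host/device round trip along the following lines. This is a hypothetical usage sketch, not code from the patch; the level-set sphere factory is merely a convenient grid source, and device index 0 assumes a single-GPU machine.

```cpp
#include <nanovdb/GridHandle.h>
#include <nanovdb/cuda/DeviceBuffer.h>
#include <nanovdb/tools/CreatePrimitives.h>

void roundTrip()
{
    using BufferT = nanovdb::cuda::DeviceBuffer;
    // Build a grid in a dual host/device buffer (any grid source would do)
    auto handle = nanovdb::tools::createLevelSetSphere<float, BufferT>(100.0);

    // Copy the host grid to GPU 0 using the new per-device overload
    handle.deviceUpload(/*device=*/0, /*stream=*/nullptr, /*sync=*/true);

    if (const nanovdb::NanoGrid<float>* d_grid = handle.deviceGrid<float>(0)) {
        // d_grid is a device pointer -- pass it to CUDA kernels here
    }

    // Copy (possibly modified) device data back into the pinned host buffer
    handle.deviceDownload(/*device=*/0, /*stream=*/nullptr, /*sync=*/true);
}
```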
@@ -164,13 +171,25 @@ class GridHandle /// @note This method is only available if the buffer supports devices template<typename U = BufferT> typename util::enable_if<BufferTraits<U>::hasDeviceDual, void>::type - deviceUpload(void* stream = nullptr, bool sync = true) { mBuffer.deviceUpload(stream, sync); } + deviceUpload(void* stream, bool sync = true) { mBuffer.deviceUpload(stream, sync); } + + /// @brief Upload the host buffer to a specific device buffer. If the device buffer doesn't exist it's created first + /// @param device Device to upload host data to + /// @param stream cuda stream + /// @param sync if false the memory copy is asynchronous + template<typename U = BufferT> + typename util::enable_if<BufferTraits<U>::hasDeviceDual, void>::type + deviceUpload(int device = 0, void* stream = nullptr, bool sync = true) { mBuffer.deviceUpload(device, stream, sync); } /// @brief Download the grid from the device, e.g. from GPU to CPU /// @note This method is only available if the buffer supports devices template<typename U = BufferT> typename util::enable_if<BufferTraits<U>::hasDeviceDual, void>::type - deviceDownload(void* stream = nullptr, bool sync = true) { mBuffer.deviceDownload(stream, sync); } + deviceDownload(void* stream, bool sync = true) { mBuffer.deviceDownload(stream, sync); } + + template<typename U = BufferT> + typename util::enable_if<BufferTraits<U>::hasDeviceDual, void>::type + deviceDownload(int device = 0, void* stream = nullptr, bool sync = true) { mBuffer.deviceDownload(device, stream, sync); } /// @brief Check if the buffer in this handle has any padding, i.e. if the buffer is larger than the combined size of all its grids /// @return true if the combined size of all grids is smaller than the buffer size @@ -184,6 +203,23 @@ class GridHandle /// @return Return the byte size of the specified grid uint64_t gridSize(uint32_t n = 0) const {return mMetaData[n].size; } + /// @brief compute the total sum of memory footprints of all the grids in this buffer + /// @return the number of bytes occupied by all grids associated with this buffer + uint64_t totalGridSize() const { + uint64_t sum = 0; + for (auto &m : mMetaData) sum += m.size; + NANOVDB_ASSERT(sum <= mBuffer.size()); + return sum; + } + + /// @brief compute the size of unused storage in this buffer + /// @return the number of unused bytes in this buffer. + uint64_t freeSize() const {return mBuffer.size() - this->totalGridSize();} + + /// @brief Test if this buffer is fully occupied by grids, i.e. has no unused storage + /// @return true if there is no extra storage left in this buffer, i.e.
empty or fully occupied with grids + bool isFull() const { return this->totalGridSize() == mBuffer.size(); } + /// @brief Return the GridType of the @a n'th grid in this GridHandle /// @param n index of the grid (assumed to be less than gridCount()) /// @return Return the GridType of the specified grid @@ -315,6 +351,7 @@ inline __hostdev__ void cpyGridHandleMeta(const GridData *data, GridHandleMetaDa } }// void cpyGridHandleMeta(const GridData *data, GridHandleMetaData *meta) +// template specialization of move constructor from a host buffer template template::hasDeviceDual, int>::type> GridHandle::GridHandle(T&& buffer) diff --git a/nanovdb/nanovdb/NanoVDB.h b/nanovdb/nanovdb/NanoVDB.h index 36b412b0e4..dd10599b7e 100644 --- a/nanovdb/nanovdb/NanoVDB.h +++ b/nanovdb/nanovdb/NanoVDB.h @@ -1159,6 +1159,21 @@ class Mask __hostdev__ uint64_t* words() { return mWords; } __hostdev__ const uint64_t* words() const { return mWords; } + template + __hostdev__ WordT getWord(uint32_t n) const + { + static_assert(util::is_same::value); + NANOVDB_ASSERT(n*8*sizeof(WordT) < WORD_COUNT); + return reinterpret_cast(mWords)[n]; + } + template + __hostdev__ void setWord(WordT w, uint32_t n) + { + static_assert(util::is_same::value); + NANOVDB_ASSERT(n*8*sizeof(WordT) < WORD_COUNT); + reinterpret_cast(mWords)[n] = w; + } + /// @brief Assignment operator that works with openvdb::util::NodeMask template __hostdev__ typename util::enable_if::value, Mask&>::type operator=(const MaskT& other) @@ -1228,6 +1243,25 @@ class Mask { on ? this->setOnAtomic(n) : this->setOffAtomic(n); } +/* + template + __device__ inline void setWordAtomic(WordT w, uint32_t n) + { + static_assert(util::is_same::value); + NANOVDB_ASSERT(n*8*sizeof(WordT) < WORD_COUNT); + if constexpr(util::is_same::value) { + mask <<= x; + } else if constexpr(util::is_same::value) { + unsigned int mask = w; + if (n >> 1) mask <<= 16; + atomicOr(reinterpret_cast(this) + n, mask); + } else if constexpr(util::is_same::value) { + atomicOr(reinterpret_cast(this) + n, w); + } else { + atomicOr(reinterpret_cast(this) + n, w); + } + } +*/ #endif /// @brief Set the specified bit on or off. __hostdev__ void set(uint32_t n, bool on) diff --git a/nanovdb/nanovdb/NodeManager.h b/nanovdb/nanovdb/NodeManager.h index 0d7686eb2d..486b5fda1b 100644 --- a/nanovdb/nanovdb/NodeManager.h +++ b/nanovdb/nanovdb/NodeManager.h @@ -316,7 +316,7 @@ NodeManagerHandle createNodeManager(const NanoGrid &grid, } return handle;// // is converted to r-value so return value is move constructed! -} +}// createNodeManager } // namespace nanovdb diff --git a/nanovdb/nanovdb/cuda/DeviceBuffer.h b/nanovdb/nanovdb/cuda/DeviceBuffer.h index 465bd9dc6b..7d304e86d1 100644 --- a/nanovdb/nanovdb/cuda/DeviceBuffer.h +++ b/nanovdb/nanovdb/cuda/DeviceBuffer.h @@ -8,7 +8,7 @@ \date January 8, 2020 - \brief Implements a simple dual (host/device) CUDA buffer. + \brief DeviceBuffer has one pinned host buffer and multiple device CUDA buffers \note This file has no device-only kernel functions, which explains why it's a .h and not .cuh file. 
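Since the revised DeviceBuffer defined below pairs one pinned host buffer with any number of per-device CUDA buffers, a hedged sketch of the intended workflow may help; the buffer size and device IDs are illustrative assumptions only:

    #include <nanovdb/cuda/DeviceBuffer.h>

    void dualBufferSketch()
    {
        nanovdb::cuda::DeviceBuffer buf(1 << 20);     // pinned 1 MiB host buffer (device defaults to cudaCpuDeviceId)
        // ... fill buf.data() on the host ...
        buf.deviceUpload(0, cudaStream_t(0), true);   // create and populate the copy on device 0
        buf.deviceUpload(1, cudaStream_t(0), true);   // ...and on device 1
        float* f1 = buf.data<float>(0, 1);            // typed pointer into device 1's copy
        buf.deviceDownload(1, cudaStream_t(0), true); // copy device 1's buffer back to the pinned host buffer
    }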
@@ -17,6 +17,8 @@ #ifndef NANOVDB_CUDA_DEVICEBUFFER_H_HAS_BEEN_INCLUDED #define NANOVDB_CUDA_DEVICEBUFFER_H_HAS_BEEN_INCLUDED +#include +#include // for std::shared_ptr #include // for BufferTraits #include // for cudaMalloc/cudaMallocManaged/cudaFree @@ -34,44 +36,80 @@ namespace cuda {// ============================================================= class DeviceBuffer { uint64_t mSize; // total number of bytes managed by this buffer (assumed to be identical for host and device) - void *mCpuData, *mGpuData; // raw pointers to the host and device buffers - bool mManaged; + void *mCpuData, **mGpuData; // raw pointers to the host and device buffers + int mDeviceCount, mManaged;// if mManaged is non-zero this class is responsible for allocating and freeing memory buffers. Otherwise this is assumed to be handled externally -public: - /// @brief Static factory method that return an instance of this buffer + /// @brief Initialize buffer /// @param size byte size of buffer to be initialized - /// @param dummy this argument is currently ignored but required to match the API of the HostBuffer /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU - /// @param stream optional stream argument (defaults to stream NULL) - /// @return An instance of this class using move semantics - static DeviceBuffer create(uint64_t size, const DeviceBuffer* dummy = nullptr, bool host = true, void* stream = nullptr); + /// @note All existing buffers are first cleared + /// @warning size is expected to be non-zero. Use clear() clear buffer! + void init(uint64_t size, int device, cudaStream_t stream); - /// @brief Static factory method that return an instance of this buffer that wraps externally managed memory - /// @param size byte size of buffer specified by external memory - /// @param cpuData pointer to externally managed host memory - /// @param gpuData pointer to externally managed device memory - /// @return An instance of this class using move semantics - static DeviceBuffer create(uint64_t size, void* cpuData, void* gpuData); +public: + + using PtrT = std::shared_ptr; + + /// @brief Default constructor of an empty buffer + DeviceBuffer() : mSize(0), mCpuData(nullptr), mGpuData(nullptr), mDeviceCount(0), mManaged(0){} + + /// @brief Constructor with a specified device and size + /// @param size byte size of buffer to be initialized + /// @param device id of the device on which to initialize the buffer + /// @param stream cuda stream + DeviceBuffer(uint64_t size, int device = cudaCpuDeviceId, cudaStream_t stream = 0) : DeviceBuffer() + { + this->init(size, device, stream); + } /// @brief Constructor /// @param size byte size of buffer to be initialized - /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU + /// @param host If true buffer is initialized only on the host/CPU, else on the current device/GPU /// @param stream optional stream argument (defaults to stream NULL) - DeviceBuffer(uint64_t size = 0, bool host = true, void* stream = nullptr) - : mSize(0) - , mCpuData(nullptr) - , mGpuData(nullptr) - , mManaged(false) + DeviceBuffer(uint64_t size, bool host, void* stream) : DeviceBuffer() { - if (size > 0) this->init(size, host, stream); + int device = cudaCpuDeviceId; + if (!host) cudaCheck(cudaGetDevice(&device)); + this->init(size, device, reinterpret_cast(stream)); } + /// @brief Constructor for externally managed host and device buffers + /// @param size byte size of the two external buffers + /// @param cpuData host buffer, 
assumed to NOT be NULL + /// @param gpuData device buffer, assumed to NOT be NULL; + /// @note The device buffer, @c gpuData, will be associated + /// with the current device ID given by cudaGetDevice DeviceBuffer(uint64_t size, void* cpuData, void* gpuData) : mSize(size) , mCpuData(cpuData) - , mGpuData(gpuData) - , mManaged(false) + , mManaged(0) + { + cudaCheck(cudaGetDeviceCount(&mDeviceCount)); + mGpuData = new void*[mDeviceCount]();// NULL initialization + NANOVDB_ASSERT(cpuData); + NANOVDB_ASSERT(gpuData); + int device = 0; + cudaCheck(cudaGetDevice(&device)); + mGpuData[device] = gpuData; + } + + /// @brief Constructor for externally managed host and multiple device buffers + /// @param size byte size of the two external buffers + /// @param cpuData host buffer, assumed to NOT be NULL + /// @param list list of device IDs and external device buffers, all assumed to not be NULL + DeviceBuffer(uint64_t size, void* cpuData, std::initializer_list> list) + : mSize(size) + , mCpuData(cpuData) + , mManaged(0) { + NANOVDB_ASSERT(cpuData); + cudaCheck(cudaGetDeviceCount(&mDeviceCount)); + mGpuData = new void*[mDeviceCount]();// NULL initialization + for (auto &p : list) { + NANOVDB_ASSERT(p.first>=0 && p.firstclear(); }; + + /// @brief Static factory method that return an instance of this buffer + /// @param size byte size of buffer to be initialized + /// @param dummy this argument is currently ignored but required to match the API of the HostBuffer + /// @param host If true buffer is initialized only on the host/CPU, else only on the device/GPU + /// @param stream optional stream argument (defaults to stream NULL) + /// @return An instance of this class using move semantics + static DeviceBuffer create(uint64_t size, const DeviceBuffer* dummy, bool host, void* stream){return DeviceBuffer(size, host, stream);} + + static DeviceBuffer create(uint64_t size, const DeviceBuffer* dummy = nullptr, int device = cudaCpuDeviceId, cudaStream_t stream = 0){return DeviceBuffer(size, device, stream);} + + /// @brief Static factory method that returns an instance of this buffer that wraps externally managed memory + /// @param size byte size of buffer specified by external memory + /// @param cpuData pointer to externally managed host memory + /// @param gpuData pointer to externally managed device memory + /// @return An instance of this class using move semantics + static DeviceBuffer create(uint64_t size, void* cpuData, void* gpuData) {return DeviceBuffer(size, cpuData, gpuData);} + + /// @brief Static factory method that returns an instance of this buffer that wraps externally managed host and device memory + /// @param size byte size of buffer to be initialized + /// @param cpuData pointer to externally managed host memory + /// @param list list of device IDs and device memory pointers + static DeviceBuffer create(uint64_t size, void* cpuData, std::initializer_list> list) {return DeviceBuffer(size, cpuData, list);} + + /////////////////////////////////////////////////////////////////////// + + //@{ + /// @brief Factory methods that create a shared pointer to an DeviceBuffer instance + static PtrT createPtr(uint64_t size, const DeviceBuffer* = nullptr, int device = cudaCpuDeviceId, cudaStream_t stream = 0) {return std::make_shared(size, device, stream);} + static PtrT createPtr(uint64_t size, void* cpuData, void* gpuData) {return std::make_shared(size, cpuData, gpuData);} + static PtrT createPtr(uint64_t size, void* cpuData, std::initializer_list> list) {return std::make_shared(size, cpuData, list);} 
+ ///@} + + /////////////////////////////////////////////////////////////////////// + /// @brief Disallow copy assignment operation DeviceBuffer& operator=(const DeviceBuffer&) = delete; /// @brief Move assignment operation DeviceBuffer& operator=(DeviceBuffer&& other) noexcept { - this->clear(); - mSize = other.mSize; + mSize = other.mSize; mCpuData = other.mCpuData; + delete [] mGpuData; mGpuData = other.mGpuData; + mDeviceCount = other.mDeviceCount; mManaged = other.mManaged; - other.mSize = 0; - other.mCpuData = nullptr; - other.mGpuData = nullptr; - other.mManaged = false; + other.mCpuData = other.mGpuData = nullptr; + other.mSize = other.mDeviceCount = other.mManaged = 0; return *this; } - /// @brief Destructor frees memory on both the host and device - ~DeviceBuffer() { this->clear(); }; + /////////////////////////////////////////////////////////////////////// - /// @brief Initialize buffer - /// @param size byte size of buffer to be initialized - /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU - /// @note All existing buffers are first cleared - /// @warning size is expected to be non-zero. Use clear() clear buffer! - void init(uint64_t size, bool host = true, void* stream = nullptr); - - /// @brief Retuns a raw pointer to the host/CPU buffer managed by this allocator. + /// @brief Returns a raw void pointer to the host/CPU buffer managed by this allocator. /// @warning Note that the pointer can be NULL! void* data() const { return mCpuData; } - /// @brief Retuns a raw pointer to the device/GPU buffer managed by this allocator. + /// @brief Returns an offset pointer of a specific type from the allocated host memory + /// @tparam T Type of the pointer returned + /// @param count Number of elements of type @c T to skip + /// @warning assumes that this instance is not empty! + template<typename T> + T* data(ptrdiff_t count = 0, int device = cudaCpuDeviceId) const + { + NANOVDB_ASSERT(device >= cudaCpuDeviceId && device < mDeviceCount); + void *ptr = device == cudaCpuDeviceId ? mCpuData : mGpuData[device]; + return ptr ? reinterpret_cast<T*>(ptr) + count : nullptr; + } + + /// @brief Returns a byte offset void pointer from the allocated host memory + /// @param byteOffset offset of return pointer in units of bytes + /// @warning assumes that this instance is not empty! + void* data(ptrdiff_t byteOffset, int device = cudaCpuDeviceId) const + { + NANOVDB_ASSERT(device >= cudaCpuDeviceId && device < mDeviceCount); + void *ptr = device == cudaCpuDeviceId ? mCpuData : mGpuData[device]; + return ptr ? reinterpret_cast<uint8_t*>(ptr) + byteOffset : nullptr; + } + + /////////////////////////////////////////////////////////////////////// + + /// @brief Returns a raw pointer to the specified device/GPU buffer managed by this allocator.
+ /// @warning Note that the pointer can be NULL! + void* deviceData() const { + int device = cudaCpuDeviceId; + cudaCheck(cudaGetDevice(&device)); + return this->deviceData(device); + } - /// @brief Upload this buffer from the device to the host, i.e. GPU -> CPU. - /// @param stream optional CUDA stream (defaults to CUDA stream 0) + /////////////////////////////////////////////////////////////////////// + + /// @brief Uploads buffer on the host to a specific device. If it doesn't exist it's created first. + /// @param device Device ID that the data is copied to + /// @param stream cuda stream + /// @param sync if false the memory copy is asynchronous. + /// @warning Assumes that the host buffer already exists! + /// @note the current device can be determined with cudaGetDevice + void deviceUpload(int device = 0, cudaStream_t stream = 0, bool sync = true); + void deviceUpload(int device, void* stream, bool sync){this->deviceUpload(device, cudaStream_t(stream), sync);} + + /// @brief Upload buffer from the host to ALL the existing devices, i.e. CPU -> GPU. + /// If no device buffers exist one is created for the current device (typically 0) + /// and subsequently populated with the host data. + /// @param stream CUDA stream. + /// @param sync if false the memory copy is asynchronous. + /// @warning Assumes that the host buffer already exists! + void deviceUpload(cudaStream_t stream, bool sync); + void deviceUpload(void* stream, bool sync) {this->deviceUpload(cudaStream_t(stream), sync);} + + /////////////////////////////////////////////////////////////////////// + + /// @brief Download data from a specified device to the host. If the host buffer does not exist it will first be allocated + /// @param device device ID to download source data from + /// @param stream cuda stream + /// @param sync if false the memory copy is asynchronous. + /// @warning Assumes that the specified device buffer already exists! + void deviceDownload(int device = 0, cudaStream_t stream = 0, bool sync = true); + void deviceDownload(int device, void* stream, bool sync) {this->deviceDownload(device, cudaStream_t(stream), sync);} + + /// @brief Download the buffer from the current device to the host, i.e. GPU -> CPU. + /// If the host buffer does not exist it will first be allocated + /// @param stream CUDA stream /// @param sync if false the memory copy is asynchronous /// @note If the host/CPU buffer does not exist it is first allocated /// @warning Assumes that the device/GPU buffer already exists - void deviceDownload(void* stream = nullptr, bool sync = true) const; + void deviceDownload(void* stream, bool sync); + + /////////////////////////////////////////////////////////////////////// /// @brief Returns the size in bytes of the raw memory buffer managed by this allocator. uint64_t size() const { return mSize; } + uint64_t capacity() const {return this->size();} + + /// @brief Returns the number of buffers that are not NULL + int bufferCount() const { + int count = mCpuData ?
1 : 0; + for (int i=0; iempty(); } //@} /// @brief De-allocate all memory managed by this allocator and set all pointers to NULL - void clear(void* stream = nullptr); + void clear(cudaStream_t stream = 0); + void clear(void* stream){this->clear(cudaStream_t(stream));} }; // DeviceBuffer class // --------------------------> Implementations below <------------------------------------ -inline DeviceBuffer DeviceBuffer::create(uint64_t size, const DeviceBuffer*, bool host, void* stream) +inline void DeviceBuffer::init(uint64_t size, int device, cudaStream_t stream) { - return DeviceBuffer(size, host, stream); -} - -inline DeviceBuffer DeviceBuffer::create(uint64_t size, void* cpuData, void* gpuData) -{ - return DeviceBuffer(size, cpuData, gpuData); -} - -inline void DeviceBuffer::init(uint64_t size, bool host, void* stream) -{ - if (mSize>0) this->clear(stream); - NANOVDB_ASSERT(size > 0); - if (host) { + if (size==0) return; + cudaCheck(cudaGetDeviceCount(&mDeviceCount)); + mGpuData = new void*[mDeviceCount]();// NULL initialization + NANOVDB_ASSERT(device >= cudaCpuDeviceId && device < mDeviceCount); + if (device == cudaCpuDeviceId) { cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned checkPtr(mCpuData, "cuda::DeviceBuffer::init: failed to allocate host buffer"); } else { - cudaCheck(util::cuda::mallocAsync((void**)&mGpuData, size, reinterpret_cast(stream))); // un-managed memory on the device, always 32B aligned! - checkPtr(mGpuData, "cuda::DeviceBuffer::init: failed to allocate device buffer"); + cudaCheck(cudaMallocAsync(mGpuData+device, size, stream)); // un-managed memory on the device, always 32B aligned! + checkPtr(mGpuData[device], "cuda::DeviceBuffer::init: failed to allocate device buffer"); } mSize = size; - mManaged = true; + mManaged = 1;// i.e. this instance is responsible for allocating and delete memory } // DeviceBuffer::init -inline void DeviceBuffer::deviceUpload(void* stream, bool sync) const +inline void DeviceBuffer::deviceUpload(int device, cudaStream_t stream, bool sync) { - if (!mManaged) throw std::runtime_error("DeviceBuffer::deviceUpload called on externally managed memory. Replace deviceUpload call with the appropriate external copy operation."); - - checkPtr(mCpuData, "uninitialized cpu data"); - if (mGpuData == nullptr) { - cudaCheck(util::cuda::mallocAsync((void**)&mGpuData, mSize, reinterpret_cast(stream))); // un-managed memory on the device, always 32B aligned! + NANOVDB_ASSERT(device >= 0 && device < mDeviceCount);// should be device and not the host + checkPtr(mCpuData, "uninitialized cpu source data"); + if (mGpuData[device] == nullptr) { + if (mManaged==0) throw std::runtime_error("DeviceBuffer::deviceUpload called on externally managed memory that wasn\'t allocated."); + cudaCheck(cudaMallocAsync(mGpuData+device, mSize, stream)); // un-managed memory on the device, always 32B aligned! 
} - checkPtr(mGpuData, "uninitialized gpu data"); - cudaCheck(cudaMemcpyAsync(mGpuData, mCpuData, mSize, cudaMemcpyHostToDevice, reinterpret_cast(stream))); - if (sync) cudaCheck(cudaStreamSynchronize(reinterpret_cast(stream))); -} // DeviceBuffer::gpuUpload + checkPtr(mGpuData[device], "uninitialized gpu destination data"); + cudaCheck(cudaMemcpyAsync(mGpuData[device], mCpuData, mSize, cudaMemcpyHostToDevice, stream)); + if (sync) cudaCheck(cudaStreamSynchronize(stream)); +} // DeviceBuffer::deviceUpload -inline void DeviceBuffer::deviceDownload(void* stream, bool sync) const +inline void DeviceBuffer::deviceUpload(cudaStream_t stream, bool sync) { - if (!mManaged) throw std::runtime_error("DeviceBuffer::deviceDownload called on externally managed memory. Replace deviceDownload call with the appropriate external copy operation."); + int device = 0; + cudaGetDevice(&device); + this->deviceUpload(device, stream, sync); +} // DeviceBuffer::deviceUpload - checkPtr(mGpuData, "uninitialized gpu data"); +inline void DeviceBuffer::deviceDownload(int device, cudaStream_t stream, bool sync) +{ + NANOVDB_ASSERT(device >= 0 && device < mDeviceCount); + checkPtr(mGpuData[device], "uninitialized gpu source data");// no source data on the specified device if (mCpuData == nullptr) { + if (mManaged==0) throw std::runtime_error("DeviceBuffer::deviceDownload called on uninitialized cpu destination memory that is externally managed."); cudaCheck(cudaMallocHost((void**)&mCpuData, mSize)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned } - checkPtr(mCpuData, "uninitialized cpu data"); - cudaCheck(cudaMemcpyAsync(mCpuData, mGpuData, mSize, cudaMemcpyDeviceToHost, reinterpret_cast(stream))); - if (sync) cudaCheck(cudaStreamSynchronize(reinterpret_cast(stream))); -} // DeviceBuffer::gpuDownload + checkPtr(mCpuData, "uninitialized cpu destination data"); + cudaCheck(cudaMemcpyAsync(mCpuData, mGpuData[device], mSize, cudaMemcpyDeviceToHost, stream)); + if (sync) cudaCheck(cudaStreamSynchronize(stream)); +} // DeviceBuffer::deviceDownload + +inline void DeviceBuffer::deviceDownload(void* stream, bool sync) +{ + int device = 0; + cudaCheck(cudaGetDevice(&device)); + this->deviceDownload(device, cudaStream_t(stream), sync); +} // DeviceBuffer::deviceDownload -inline void DeviceBuffer::clear(void *stream) +inline void DeviceBuffer::clear(cudaStream_t stream) { - if (mManaged && mGpuData) cudaCheck(util::cuda::freeAsync(mGpuData, reinterpret_cast(stream))); - if (mManaged && mCpuData) cudaCheck(cudaFreeHost(mCpuData)); + if (mManaged!=0) {// free all the managed data buffers + cudaCheck(cudaFreeHost(mCpuData)); + for (int i=0; i +#include +#include +#include // for cudaCheck, deviceCount etc + +namespace nanovdb {// ================================================================ + +namespace cuda {// =================================================================== + +/// @brief map from a device ID to an associated cuda stream. Useful for multi-GPU applications. +class DeviceStreamMap : public std::map +{ + using FuncT = CUresult(size_t*, const CUmemAllocationProp*, CUmemAllocationGranularity_flags); + FuncT *mFunctPtr; + +public: + + enum DeviceType { Any = 0, PeerToPeer = 1, Unified = 3 }; + + /// @brief Initiates a map between CUDA device IDs and corresponding streams that satisfy certain constraints. + /// All devices should be able to access memory on all the other devices! + /// @param t Type of device to include in map. 
Any means all available devices, PeerToPeer means only devices that + /// can access all other devices are included, and Unified means all devices support unified memory, concurrent access, + /// and can be accessed by all other devices. + /// @param exclude optional list of device IDs to exclude from the map + /// @param verbose 0 means quiet, 1 means print if a device is ignored, and 2 means print if a device is included + DeviceStreamMap(DeviceType t = DeviceType::Unified, std::vector<int> exclude = {}, int verbose = 0); + + /// @brief Destructor + ~DeviceStreamMap(); + + /// @brief Returns the minimum page size of all the devices in this map + size_t getMinPageSize() const; + + /// @brief Print information about all the devices included in this map + void printDevInfo(std::FILE* file = stdout) const {for (auto &p : *this) util::cuda::printDevInfo(p.first, nullptr, file);} + + /// @brief Returns the number of devices associated with this map + int deviceCount() const {return (int)this->size();} + +};// DeviceStreamMap + +DeviceStreamMap::DeviceStreamMap(DeviceType t, std::vector<int> exclude, int verbose) +{ + std::initializer_list<cudaDeviceAttr> filter = {cudaDevAttrUnifiedAddressing, cudaDevAttrConcurrentManagedAccess}; + const int devCount = util::cuda::deviceCount(), current = util::cuda::currentDevice(); + for (int dev = 0; dev < devCount; ++dev) { + int check = 1; + for (auto it=exclude.begin(); check && it!=exclude.end(); ++it) if (dev == *it) check = 0; + for (auto it=filter.begin(); (t&2) && check && it!=filter.end(); ++it) cudaCheck(cudaDeviceGetAttribute( &check, *it, dev)); + for (auto it= this->begin(); (t&1) && check && it!= this->end(); ++it) cudaCheck(cudaDeviceCanAccessPeer(&check, dev, it->first)); + if (check) { + cudaCheck(cudaSetDevice(dev)); + cudaStream_t stream; + cudaCheck(cudaStreamCreate(&stream)); + if (verbose>1) util::cuda::printDevInfo(dev, "Using"); + (*this)[dev] = stream; + } else if (verbose) util::cuda::printDevInfo(dev, "Ignoring"); + } + cudaCheck(cudaSetDevice(current));// reset to the previous device + + void* entryPoint = nullptr; +#if CUDART_VERSION >= 12000// queryResult argument was added in CUDA 12 + cudaDriverEntryPointQueryResult queryResult; + cudaCheck(cudaGetDriverEntryPoint("cuMemGetAllocationGranularity", &entryPoint, cudaEnableDefault, &queryResult)); + NANOVDB_ASSERT(queryResult == cudaDriverEntryPointSuccess); +#else + cudaCheck(cudaGetDriverEntryPoint("cuMemGetAllocationGranularity", &entryPoint, cudaEnableDefault)); +#endif + mFunctPtr = reinterpret_cast<FuncT*>(entryPoint); +}// DeviceStreamMap::DeviceStreamMap + +DeviceStreamMap::~DeviceStreamMap() +{ + const int current = util::cuda::currentDevice(); + for (auto& [device, stream] : *this) { + cudaCheck(cudaSetDevice(device)); + cudaCheck(cudaStreamDestroy(stream)); + } + cudaCheck(cudaSetDevice(current));// reset to the previous device +} + +inline size_t DeviceStreamMap::getMinPageSize() const +{ + NANOVDB_ASSERT(mFunctPtr); + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + size_t minGranularity = 0; + for (auto it = this->begin(); it!=this->end(); ++it) { + prop.location.id = it->first; + size_t granularity = 0; + (*mFunctPtr)(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); + if (minGranularity < granularity) minGranularity = granularity; + } + return minGranularity; +}// DeviceStreamMap::getMinPageSize + +}// namespace cuda + +}// namespace nanovdb + +#endif // end of NANOVDB_CUDA_DEVICESTREAMMAP_H_HAS_BEEN_INCLUDED diff --git
a/nanovdb/nanovdb/cuda/GridHandle.cuh b/nanovdb/nanovdb/cuda/GridHandle.cuh index a0fc96cb16..81446be193 100644 --- a/nanovdb/nanovdb/cuda/GridHandle.cuh +++ b/nanovdb/nanovdb/cuda/GridHandle.cuh @@ -51,8 +51,10 @@ splitGridHandles(const GridHandle &handle, const BufferT* other = nullp VectorT> handles(handle.gridCount()); bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed cudaCheck(util::cuda::mallocAsync((void**)&d_dirty, sizeof(bool), stream)); + int device = 0; + cudaCheck(cudaGetDevice(&device)); for (uint32_t n=0; n(buffer.deviceData()); const GridData *src = reinterpret_cast(ptr); cudaCheck(cudaMemcpyAsync(dst, src, handle.gridSize(n), cudaMemcpyDeviceToDevice, stream)); @@ -77,7 +79,9 @@ mergeGridHandles(const VectorT> &handles, const BufferT* oth gridCount += h.gridCount(); for (uint32_t n=0; n *d_grid, const BufferT& pool = BufferT(), cudaStream_t stream = 0) { - auto buffer = BufferT::create(sizeof(NodeManagerData), &pool, false, stream); + int device = 0; + cudaCheck(cudaGetDevice(&device)); + auto buffer = BufferT::create(sizeof(NodeManagerData), &pool, device, stream); auto *d_data = (NodeManagerData*)buffer.deviceData(); size_t size = 0u, *d_size; cudaCheck(util::cuda::mallocAsync((void**)&d_size, sizeof(size_t), stream)); @@ -62,7 +64,7 @@ createNodeManager(const NanoGrid *d_grid, cudaCheck(cudaMemcpyAsync(&size, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream)); cudaCheck(util::cuda::freeAsync(d_size, stream)); if (size > sizeof(NodeManagerData)) { - auto tmp = BufferT::create(size, &pool, false, stream);// only allocate buffer on the device + auto tmp = BufferT::create(size, &pool, device, stream);// only allocate buffer on the device cudaCheck(cudaMemcpyAsync(tmp.deviceData(), buffer.deviceData(), sizeof(NodeManagerData), cudaMemcpyDeviceToDevice, stream)); buffer = std::move(tmp); d_data = reinterpret_cast(buffer.deviceData()); diff --git a/nanovdb/nanovdb/cuda/UnifiedBuffer.h b/nanovdb/nanovdb/cuda/UnifiedBuffer.h new file mode 100644 index 0000000000..0bb330e7ce --- /dev/null +++ b/nanovdb/nanovdb/cuda/UnifiedBuffer.h @@ -0,0 +1,327 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +/*! + \file UnifiedBuffer.h + + \author Ken Museth + + \date October 15, 2024 + + \brief nanovdb::cuda::UnifiedBuffer that uses unified memory management + + \note This file has no device-only kernel functions, + which explains why it's a .h and not .cuh file. +*/ + +#ifndef NANOVDB_CUDA_UNIFIEDBUFFER_H_HAS_BEEN_INCLUDED +#define NANOVDB_CUDA_UNIFIEDBUFFER_H_HAS_BEEN_INCLUDED + +#include +#include // for std::shared_ptr +#include // for BufferTraits +#include // for cudaCheck + +namespace nanovdb {// ================================================================ + +namespace cuda {// =================================================================== + +/// @brief buffer, used for instance by the GridHandle, to allocate unified memory that +/// can be resized and shared between multiple devices and the host. 
+class UnifiedBuffer +{ + void *mPtr; + size_t mSize, mCapacity; +public: + + using PtrT = std::shared_ptr<UnifiedBuffer>; + + /// @brief Default constructor of an empty buffer + UnifiedBuffer() : mPtr(nullptr), mSize(0), mCapacity(0){} + + /// @brief Constructor that specifies both the size and capacity + /// @param size size of the buffer in bytes, indicating what is actually used + /// @param capacity number of bytes in the virtual page table, i.e. max size for growing + /// @note Capacity can be over-estimated to allow for future growth. Memory is not allocated + /// with this constructor, only a page table. Allocation happens on use or when calling prefetch + UnifiedBuffer(size_t size, size_t capacity) : mPtr(nullptr), mSize(size), mCapacity(capacity) + { + assert(mSize <= mCapacity); + cudaCheck(cudaMallocManaged(&mPtr, mCapacity, cudaMemAttachGlobal)); + } + + /// @brief Similar to the constructor above except the size and capacity are equal, so no future growth is supported + UnifiedBuffer(size_t size) : UnifiedBuffer(size, size){} + + /// @brief Constructor that specifies the size, capacity, and device (for prefetching) + /// @param size size of the buffer in bytes + /// @param capacity number of bytes in the virtual page table, i.e. max size for growing + /// @param device device ID the buffer is prefetched to + /// @param stream cuda stream used for the prefetch + UnifiedBuffer(uint64_t size, uint64_t capacity, int device, cudaStream_t stream = 0) : mPtr(nullptr), mSize(size), mCapacity(capacity) + { + cudaCheck(cudaMallocManaged(&mPtr, mCapacity, cudaMemAttachGlobal)); + cudaCheck(cudaMemAdvise(mPtr, size, cudaMemAdviseSetPreferredLocation, device)); + cudaCheck(cudaMemPrefetchAsync(mPtr, size, device, stream)); + } + + /// @brief Constructor with a specified device + /// @param size size of the buffer in bytes + /// @param device device ID the buffer is prefetched to + /// @param stream cuda stream used for the prefetch + UnifiedBuffer(uint64_t size, int device, cudaStream_t stream = 0) : UnifiedBuffer(size, size, device, stream){} + + /// @brief Disallow copy-construction + UnifiedBuffer(const UnifiedBuffer&) = delete; + + /// @brief Move constructor + UnifiedBuffer(UnifiedBuffer&& other) noexcept + : mPtr(other.mPtr) + , mSize(other.mSize) + , mCapacity(other.mCapacity) + { + other.mPtr = nullptr; + other.mSize = other.mCapacity = 0; + } + + /// @brief Destructor + ~UnifiedBuffer(){cudaCheck(cudaFree(mPtr));} + + /////////////////////////////////////////////////////////////////////// + + //@{ + /// @brief Factory methods that create a UnifiedBuffer instance and return it with move semantics + static UnifiedBuffer create(size_t size, size_t capacity) {return UnifiedBuffer(size, capacity);} + static UnifiedBuffer create(size_t size) {return UnifiedBuffer(size);} + ///@} + + //@{ + /// @brief Factory methods that create a shared pointer to a UnifiedBuffer instance + static PtrT createPtr(size_t size, size_t capacity) {return std::make_shared<UnifiedBuffer>(size, capacity);} + static PtrT createPtr(size_t size) {return std::make_shared<UnifiedBuffer>(size);} + ///@} + + /// @brief Legacy factory method that mirrors DeviceBuffer. It creates a UnifiedBuffer from a size and a reference buffer. + /// If a reference buffer is provided and it's non-empty, it is used to define the capacity of the new buffer + /// @param size Size in bytes of the new buffer + /// @param reference reference buffer optionally used to define the capacity + /// @param device Device ID the new buffer is prefetched to + /// @param stream cuda stream + /// @return An instance of a new UnifiedBuffer using move semantics + static UnifiedBuffer create(size_t size, const UnifiedBuffer* reference, int device, cudaStream_t stream) + { + const size_t capacity = (reference && reference->capacity()) ?
reference->capacity() : size; + UnifiedBuffer buffer(size, capacity); + cudaCheck(cudaMemAdvise(buffer.mPtr, size, cudaMemAdviseSetPreferredLocation, device)); + cudaCheck(cudaMemPrefetchAsync(buffer.mPtr, size, device, stream)); + return buffer; + } + + /// @brief Factory method that creates a buffer on the host of the specified size. If the + /// reference buffer has a capacity it is used. Also the buffer is prefetched to the host + /// @param size byte size of buffer initiated on the host + /// @param reference optional reference buffer from which the capacity is derived + static UnifiedBuffer create(size_t size, const UnifiedBuffer* reference){return create(size, reference, cudaCpuDeviceId, (cudaStream_t)0);} + + /// @brief Factory method that creates a buffer on the host or device of the specified size. If the + /// reference buffer has a capacity it is used. Also the buffer is prefetched to the host or (current) device + /// @param size byte size of buffer initiated on the device or host + /// @param reference optional reference buffer from which the capacity is derived + /// @param host If true the buffer will be prefetched to the host, else to the current device + /// @param stream optional cuda stream + static UnifiedBuffer create(size_t size, const UnifiedBuffer* reference, bool host, void* stream = nullptr) + { + int device = cudaCpuDeviceId; + if (!host) cudaGetDevice(&device); + return create(size, reference, device, (cudaStream_t)stream); + } + + /// @brief Free all memory and reset this instance to empty + void clear() + { + cudaCheck(cudaFree(mPtr)); + mPtr = nullptr; + mSize = mCapacity = 0; + } + + /// @brief Disallow copy assignment operation + UnifiedBuffer& operator=(const UnifiedBuffer&) = delete; + + /// @brief Allow move assignment operation + UnifiedBuffer& operator=(UnifiedBuffer&& other) + { + cudaCheck(cudaFree(mPtr)); + mPtr = other.mPtr; + mSize = other.mSize; + mCapacity = other.mCapacity; + other.mPtr = nullptr; + other.mSize = other.mCapacity = 0; + return *this; + } + + /// @brief Re-initialize this buffer with the specified size and capacity + /// @param size size of memory block to be used in bytes + /// @param capacity size of page table in bytes + void init(size_t size, size_t capacity) + { + NANOVDB_ASSERT(size <= capacity); + cudaCheck(cudaFree(mPtr)); + mSize = size; + mCapacity = capacity; + cudaCheck(cudaMallocManaged(&mPtr, capacity, cudaMemAttachGlobal)); + } + + /// @brief Resize the memory block managed by this buffer. If the current capacity is larger than the new size this method + /// simply redefines size. Otherwise a new page-table is defined, with the specified advice, and the old block is copied to the new block.
+ /// @param size size of the new memory block + /// @param dev device ID used when applying the memory advices + /// @param list list of memory advices applied to the new memory block + void resize(size_t size, int dev = cudaCpuDeviceId, std::initializer_list<cudaMemoryAdvise> list = {cudaMemAdviseSetPreferredLocation}) + { + if (size <= mCapacity) { + mSize = size; + } else { + void *ptr = 0; + cudaCheck(cudaMallocManaged(&ptr, size, cudaMemAttachGlobal)); + if (dev > -2) for (auto a : list) cudaCheck(cudaMemAdvise(ptr, size, a, dev)); + if (mSize > 0) {// copy over data from the old memory block + cudaCheck(cudaMemcpy(ptr, mPtr, std::min(mSize, size), cudaMemcpyDefault)); + cudaCheck(cudaFree(mPtr)); + } + mPtr = ptr; + mSize = mCapacity = size; + } + } + + /// @brief Apply a single advise to a memory block + /// @param byteOffset offset in bytes marking the beginning of the memory block to be advised + /// @param size size in bytes of the memory block to be advised. + /// @param dev the device ID the advice applies to, cudaCpuDeviceId = -1, 0, 1, ... + /// @param adv cuda memory advise to apply + void advise(ptrdiff_t byteOffset, size_t size, int dev, cudaMemoryAdvise adv) + { + cudaCheck(cudaMemAdvise(util::PtrAdd(mPtr, byteOffset), size, adv, dev)); + } + + /// @brief Apply a list of advices to a memory block + /// @param byteOffset offset in bytes marking the beginning of the memory block to be advised + /// @param size size in bytes of the memory block to be advised. + /// @param dev the device ID the advices apply to, cudaCpuDeviceId = -1, 0, 1, ... + /// @param list list of cuda memory advises + void advise(ptrdiff_t byteOffset, size_t size, int dev, std::initializer_list<cudaMemoryAdvise> list) + { + void *ptr = util::PtrAdd(mPtr, byteOffset); + for (auto a : list) cudaCheck(cudaMemAdvise(ptr, size, a, dev)); + } + + /// @brief Prefetches data to the specified device, i.e. ensure the device has an up-to-date copy of the memory specified + /// @param byteOffset offset in bytes marking the beginning of the memory block to be prefetched + /// @param size size in bytes of the memory block to be prefetched. The default value of zero means copy all @c this->size() bytes. + /// @param dev the device ID to prefetch to, cudaCpuDeviceId = -1, 0, 1, ... + /// @param stream cuda stream + void prefetch(ptrdiff_t byteOffset = 0, size_t size = 0, int dev = cudaCpuDeviceId, cudaStream_t stream = cudaStreamPerThread) + { + cudaCheck(cudaMemPrefetchAsync(util::PtrAdd(mPtr, byteOffset), size ? size : mSize, dev, stream)); + } + + /////////////////////////////////////////////////////////////////////// + + /// @brief Prefetches all data to the specified device + /// @param device device ID, cudaCpuDeviceId = -1, 0, 1, ...
+ /// @param stream cuda stream + /// @param sync if false the memory copy is asynchronous + /// @note Legacy method included for compatibility with DeviceBuffer + void deviceUpload(int device = 0, cudaStream_t stream = cudaStreamPerThread, bool sync = false) const + { + cudaCheck(cudaMemPrefetchAsync(mPtr, mSize, device, stream)); + if (sync) cudaCheck(cudaStreamSynchronize(stream)); + } + void deviceUpload(int device, void* stream, bool sync) const{this->deviceUpload(device, cudaStream_t(stream), sync);} + + /// @brief Prefetches all data to the current device, as given by cudaGetDevice + /// @param stream cuda stream + /// @param sync if false the memory copy is asynchronous + /// @note Legacy method included for compatibility with DeviceBuffer + void deviceUpload(void* stream, bool sync) const{ + int device = 0; + cudaCheck(cudaGetDevice(&device)); + this->deviceUpload(device, cudaStream_t(stream), sync); + } + + /////////////////////////////////////////////////////////////////////// + + /// @brief Prefetches all data to the host + /// @param stream cuda stream + /// @param sync if false the memory copy is asynchronous + + void deviceDownload(cudaStream_t stream = 0, bool sync = false) const + { + cudaCheck(cudaMemPrefetchAsync(mPtr, mSize, cudaCpuDeviceId, stream)); + if (sync) cudaCheck(cudaStreamSynchronize(stream)); + } + + /// @brief Legacy method included for compatibility with DeviceBuffer + /// @param stream cuda stream + /// @param sync if false the prefetch is asynchronous + void deviceDownload(void* stream, bool sync) const{this->deviceDownload(cudaStream_t(stream), sync);} + + // used by GridHandle + void deviceDownload(int dummy, void* stream, bool sync) const{this->deviceDownload(cudaStream_t(stream), sync);} + + /////////////////////////////////////////////////////////////////////// + + /// @brief Returns a raw pointer to the unified memory managed by this instance. + /// @warning Note that the pointer can be NULL! + void* data() const {return mPtr;} + + /// @brief Returns an offset pointer of a specific type from the allocated unified memory + /// @tparam T Type of the pointer returned + /// @param count Number of elements of type @c T to skip (or offset) the returned pointer by + /// @warning assumes that this instance is not empty! + template<typename T> + T* data(ptrdiff_t count = 0) const { + NANOVDB_ASSERT(mPtr != nullptr || count == 0); + return reinterpret_cast<T*>(mPtr) + count; + } + + /// @brief Returns a byte offset void pointer from the unified memory + /// @param byteOffset Number of bytes to skip (or offset) the returned pointer by + /// @warning assumes that this instance is not empty! + void* data(ptrdiff_t byteOffset) const { + NANOVDB_ASSERT(mPtr != nullptr || byteOffset == 0); + return util::PtrAdd(mPtr, byteOffset); + } + + /// @brief Legacy methods included for compatibility with DeviceBuffer + /// @return raw pointer to the unified memory managed by this instance + void* deviceData() const {return mPtr;} + void* deviceData(int) const {return mPtr;} + + /// @brief Size of the allocated pages in this instance + /// @return number of bytes allocated by this instance + size_t size() const {return mSize;} + + /// @brief Capacity of this instance, i.e. room in page table + /// @return number of bytes reserved, but not necessarily allocated, by this instance + size_t capacity() const {return mCapacity;} + + //@{ + /// @brief Returns true if this allocator is empty, i.e.
has no allocated memory + inline bool empty() const { return mPtr == nullptr; } + inline bool isEmpty() const { return this->empty(); } + //@} + +};// UnifiedBuffer + +}// namespace cuda + +template<> +struct BufferTraits +{ + static constexpr bool hasDeviceDual = true; +}; + +}// namespace nanovdb + +#endif // end of NANOVDB_CUDA_UNIFIEDBUFFER_H_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/examples/CMakeLists.txt b/nanovdb/nanovdb/examples/CMakeLists.txt index d06371cd95..93c888710b 100644 --- a/nanovdb/nanovdb/examples/CMakeLists.txt +++ b/nanovdb/nanovdb/examples/CMakeLists.txt @@ -109,6 +109,11 @@ nanovdb_example(NAME "ex_collide_level_set") nanovdb_example(NAME "ex_raytrace_fog_volume") nanovdb_example(NAME "ex_raytrace_level_set") +if(CUDAToolkit_FOUND) + nanovdb_example(NAME "ex_make_mgpu_nanovdb") # requires cuRAND + target_link_libraries(ex_make_mgpu_nanovdb PRIVATE CUDA::curand) +endif() + if(NANOVDB_USE_MAGICAVOXEL) nanovdb_example(NAME "ex_vox_to_nanovdb") endif() diff --git a/nanovdb/nanovdb/examples/ex_collide_level_set/main.cc b/nanovdb/nanovdb/examples/ex_collide_level_set/main.cc index 7771737fca..a5028fa68b 100644 --- a/nanovdb/nanovdb/examples/ex_collide_level_set/main.cc +++ b/nanovdb/nanovdb/examples/ex_collide_level_set/main.cc @@ -37,10 +37,8 @@ int main(int ac, char** av) const int numPoints = 10000000; - BufferT positionBuffer; - positionBuffer.init(numPoints * sizeof(float) * 3); - BufferT velocityBuffer; - velocityBuffer.init(numPoints * sizeof(float) * 3); + BufferT positionBuffer(numPoints * sizeof(float) * 3); + BufferT velocityBuffer(numPoints * sizeof(float) * 3); runNanoVDB(handle, numIterations, numPoints, positionBuffer, velocityBuffer); #if defined(NANOVDB_USE_OPENVDB) diff --git a/nanovdb/nanovdb/examples/ex_make_custom_nanovdb_cuda/make_custom_nanovdb_cuda.cc b/nanovdb/nanovdb/examples/ex_make_custom_nanovdb_cuda/make_custom_nanovdb_cuda.cc index b28c35d35b..64f1b11c9b 100644 --- a/nanovdb/nanovdb/examples/ex_make_custom_nanovdb_cuda/make_custom_nanovdb_cuda.cc +++ b/nanovdb/nanovdb/examples/ex_make_custom_nanovdb_cuda/make_custom_nanovdb_cuda.cc @@ -34,6 +34,7 @@ int main() cudaStream_t stream; // Create a CUDA stream to allow for asynchronous copy of pinned CUDA memory. 
cudaStreamCreate(&stream); + handle.deviceUpload(stream, false); // Copy the NanoVDB grid to the GPU asynchronously auto* gpuGrid = handle.deviceGrid(); // get a (raw) pointer to a NanoVDB grid of value type float on the GPU diff --git a/nanovdb/nanovdb/examples/ex_make_mgpu_nanovdb/main.cu b/nanovdb/nanovdb/examples/ex_make_mgpu_nanovdb/main.cu new file mode 100644 index 0000000000..98ca45baa8 --- /dev/null +++ b/nanovdb/nanovdb/examples/ex_make_mgpu_nanovdb/main.cu @@ -0,0 +1,306 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace {// anonymous namespace + +template +inline __device__ +void to_TF32(frag_t& frag) { +#pragma unroll + for (int t = 0; t < frag.num_elements; t++) + frag.x[t] = nvcuda::wmma::__float_to_tf32(frag.x[t]); +} + +__global__ +__launch_bounds__(256) +void stencilConvolve_v7(nanovdb::NanoGrid *deviceGrid, int leafOffset, + float *inputBuffer, uint32_t *haloIndices, float *haloBuffer, + float *stencil, float *outputBuffer, float *denseOutputBuffer) +{ + static constexpr int Di = 64; + static constexpr int Do = 128; + static constexpr int M = 16; + static constexpr int N = 16; + static constexpr int K = 8; + + using InputBufferType = float (&)[][Di]; + using HaloBufferType = float (&)[4][4][6][Di]; + using StencilType = float (&)[3][3][3][Di][Do]; + using OutputBufferType = float (&)[][Do]; + using DenseOutputBufferType = float (&)[2][2][4][Do]; + using SpokeStencilType = float (&)[2][2][4][Di]; + + InputBufferType mInputBuffer = reinterpret_cast(*inputBuffer); + OutputBufferType mOutputBuffer = reinterpret_cast(*outputBuffer); + StencilType mStencil = reinterpret_cast(*stencil); + + const int Bk = (blockIdx.x & 1) * 4; + const int Bj = ((blockIdx.x >> 1) & 3) * 2; + const int Bi = ((blockIdx.x >> 3) & 3) * 2; + const int leafId = (blockIdx.x >> 5) + leafOffset; + const int tid = threadIdx.x; + + using LeafNodeType = nanovdb::NanoGrid::TreeType::LeafNodeType; + const auto& tree = deviceGrid->tree(); + auto acc = tree.getAccessor(); + const LeafNodeType& leaf = tree.template getFirstNode()[leafId]; + const nanovdb::Coord leafOrigin = leaf.origin(); + const auto& valueMask = leaf.valueMask(); + + uint64_t activeMask = valueMask.words()[Bi] | valueMask.words()[Bi + 1]; + activeMask &= (0xffffUL << (Bj << 3)); + activeMask &= (0xf0f0f0f0f0f0f0fUL << Bk); + if (!activeMask) return; + + int II = (tid >> 6) & 0x3; + int E = tid & 0x3f; + + __shared__ float sBufferRaw[6144]; // 4x4x6 array of elements of size Di=64 + HaloBufferType sHaloBuffer = reinterpret_cast(sBufferRaw[0]); + __shared__ float sSpokeStencil[2][2][4][Di]; + __shared__ float sOutputBuffer[2][2][4][Do]; + + auto origin = leafOrigin.offsetBy(Bi, Bj, Bk); + + // ----------------------------- + for (int jj = 0; jj < 4; jj++) + for (int kk = 0; kk < 6; kk++) { + const auto& offset = acc.getValue(origin.offsetBy(II - 1, jj - 1, kk - 1)); + sHaloBuffer[II][jj][kk][E] = mInputBuffer[offset][E]; + } + __syncthreads(); + +// ----------------------------- + + using a_frag_t = nvcuda::wmma::fragment; + nvcuda::wmma::fragment b_frag; + nvcuda::wmma::fragment c_frag; + __shared__ float sFragBuffer[8][32][4]; + const int warpID = tid >> 5; + const int laneID = tid & 0x1f; + nvcuda::wmma::fill_fragment(c_frag, 0.f); + + for (int di = 0; di <= 2; di++) + for (int dj = 0; dj <= 2; dj++) + for (int dk = 0; dk <= 2; 
dk++) { + // Create a contiguous copy of the 2x2x4 Spoke-Stencil + // No __syncthreads here, because we take care of that at the end of the loop + for (int b = 0; b < 2 * 2 * 4 * Di; b += blockDim.x) { + auto eid = tid + b; + int ii = ( eid >> 9 ) & 0x1; // 1-wide in X + int jj = ( eid >> 8 ) & 0x1; // 2-wide in Y + int kk = ( eid >> 6 ) & 0x3; // 4-wide in Z + int ee = ( eid >> 0 ) & 0x3f; // 64 entries + sSpokeStencil[ii][jj][kk][ee] = sHaloBuffer[ii + di][jj + dj][kk + dk][ee]; + } + __syncthreads(); + + { + int inBase = warpID << 3; + + a_frag_t& a_frag = *reinterpret_cast<a_frag_t*>(sFragBuffer[warpID][laneID]); + nvcuda::wmma::load_matrix_sync(a_frag, &sSpokeStencil[0][0][0][inBase], 64); + to_TF32(a_frag); + + __syncthreads(); + } + +#pragma unroll + for (int sweep = 0; sweep < 8; ++sweep) { + const int inBlock = sweep; + const int outBlock = warpID; + const int inBase = inBlock << 3; + const int outBase = outBlock << 4; + + a_frag_t& a_frag = *reinterpret_cast<a_frag_t*>(sFragBuffer[inBlock][laneID]); + + nvcuda::wmma::load_matrix_sync(b_frag, &mStencil[di][dj][dk][inBase][outBase], 128); + to_TF32(b_frag); + nvcuda::wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); + + } + } + nvcuda::wmma::store_matrix_sync(&sOutputBuffer[0][0][0][warpID << 4], c_frag, 128, nvcuda::wmma::mem_row_major); + + // Sparse commit phase + __syncthreads(); + const int warpI = (tid >> 8) & 0x1; + const int warpJ = (tid >> 7) & 0x1; + const int warpK = (tid >> 5) & 0x3; + const int elementID = laneID | ((tid & 0x200) >> 4); + + // NB: this is fixed for 256 threads/block + const int elementsPerSM = 32; + const int xSpanPerSM = 1; + const int ySpanPerSM = 2; + +#pragma unroll + for (int xOffset = 0; xOffset < 2; xOffset += xSpanPerSM) + for (int yOffset = 0; yOffset < 2; yOffset += ySpanPerSM) { + const auto coord = origin.offsetBy(warpI + xOffset, warpJ + yOffset, warpK); + const auto& offset = acc.getValue(coord); +#pragma unroll + for (int elementOffset = 0; elementOffset < Do; elementOffset += elementsPerSM) + mOutputBuffer[offset][elementID + elementOffset] = sOutputBuffer[warpI + xOffset][warpJ + yOffset][warpK][elementID + elementOffset]; + } +}// stencilConvolve_v7 + +}// anonymous namespace + +void testConvolution() +{ + static constexpr int Di = 64, Do = 128; + using InputBufferType = std::array<float, Di>; + using OutputBufferType = std::array<float, Do>; + using StencilType = float (&)[3][3][3][Di][Do]; + + float* stencilHostPtr = new float[Do * 27 * Di]; + + std::mt19937 gen(42u); + std::uniform_real_distribution uniform_dist(-1., 1.); + + for (int i = 0; i < Do * 27 * Di; i++) stencilHostPtr[i] = uniform_dist(gen); + + nanovdb::cuda::DeviceStreamMap devStreamMap( nanovdb::cuda::DeviceStreamMap::DeviceType::Unified, {}, 2); + const size_t deviceCount = devStreamMap.size(); + std::cout << "Number of devices that support unified memory: " << deviceCount << std::endl; + + // Calculate minimum page size which corresponds to minimum physical allocation granularity + size_t minGranularity = devStreamMap.getMinPageSize(); + + // Ensure that we don't split an input and/or output feature across page boundaries + minGranularity = std::lcm(std::lcm(minGranularity, sizeof(InputBufferType)), sizeof(OutputBufferType)); + size_t valueCountGranularity = minGranularity / min(sizeof(InputBufferType), sizeof(OutputBufferType)); + + // Initialize and replicate an IndexGrid on each device + auto floatHandle = nanovdb::tools::createLevelSetSphere(100, nanovdb::Vec3d(0), 1, 3, nanovdb::Vec3d(0), "test"); + nanovdb::FloatGrid* floatGrid = floatHandle.grid<float>(); +
using BufferT = nanovdb::cuda::DeviceBuffer; + auto indexHandle = nanovdb::tools::createNanoGrid(*floatGrid, 0u, false, false, 1); + for (auto& [device, stream] : devStreamMap) {// copy host buffer to all the device buffers + cudaSetDevice(device); + indexHandle.deviceUpload(device, stream, true); + } + auto* indexGrid = indexHandle.grid(); + + auto ceil = [](size_t x, size_t y)->size_t{return ((x + y - 1) / y) * y;}; + const size_t valueCount = ceil(indexGrid->valueCount(), deviceCount * valueCountGranularity); + + const size_t inputAllocationSize = valueCount * sizeof(InputBufferType); + const size_t outputAllocationSize = valueCount * sizeof(OutputBufferType); + + nanovdb::cuda::UnifiedBuffer inputBuffer(inputAllocationSize, 2*inputAllocationSize);// over-allocate + nanovdb::cuda::UnifiedBuffer outputBuffer(outputAllocationSize); + + const size_t deviceValueCount = valueCount / deviceCount; + + // Randomly initialize input features + std::vector<std::thread> threads; + auto streamIter = devStreamMap.begin(); + for (int i = 0; i < deviceCount; ++i, ++streamIter) { + threads.push_back(std::thread([&, i, streamIter]() { + cudaSetDevice(streamIter->first); + float* inputStripePtr = inputBuffer.data<float>(deviceValueCount * Di * i); + float* outputStripePtr = outputBuffer.data<float>(deviceValueCount * Do * i); + + unsigned long long seed = 42u; + curandGenerator_t rng; + curandStatus_t curandStatus = curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT); + curandStatus = curandSetPseudoRandomGeneratorSeed(rng, seed); + curandStatus = curandGenerateUniform(rng, inputStripePtr, deviceValueCount * Di); + curandStatus = curandGenerateUniform(rng, outputStripePtr, deviceValueCount * Do); + curandStatus = curandDestroyGenerator(rng); + + cudaDeviceSynchronize(); + })); + } + for (int i = 0; i < deviceCount; ++i) threads[i].join(); + threads.clear(); + + // Spawn a convolution kernel on each device that operates on a disjoint subset of the leaves. The leaves operated on by + // each device correspond approximately to the features in the virtual address range that are physically allocated on + // the same device. + auto **timers = new nanovdb::util::cuda::Timer*[deviceCount]; + const size_t leafNodeCount = indexGrid->tree().nodeCount(0); + streamIter = devStreamMap.begin(); + for (int i = 0; i < deviceCount; ++i, ++streamIter) { + threads.emplace_back([&, i, streamIter]() {// capture iterator by value and not reference + const int device = streamIter->first; + cudaStream_t stream = streamIter->second; + cudaSetDevice(device); + timers[i] = new nanovdb::util::cuda::Timer(stream); + + size_t deviceLeafNodeCount = (leafNodeCount + deviceCount - 1) / deviceCount; + const size_t deviceLeafNodeOffset = deviceLeafNodeCount * i; + deviceLeafNodeCount = std::min(deviceLeafNodeCount, leafNodeCount - deviceLeafNodeOffset); + + void* stencilDevicePtr; + cudaMallocAsync(&stencilDevicePtr, sizeof(float) * Do * 27 * Di, stream); + cudaMemcpyAsync(stencilDevicePtr, stencilHostPtr, sizeof(float) * Do * 27 * Di, cudaMemcpyHostToDevice, stream); + + // If we use managed memory, we need to advise about the usage of the memory range in order to obtain an + // equivalently optimal paging strategy. cudaMemAdviseSetReadMostly instructs the "paging policy" that data + // is far more likely to be read than written, cudaMemAdviseSetPreferredLocation defines the device where the data + // should preferably reside, and cudaMemAdviseSetAccessedBy is a hint about which devices are accessing the data.
+ const size_t inPageSize = deviceValueCount * Di * sizeof(float), inPageOffset = inPageSize * i;// in bytes + const size_t outPageSize = deviceValueCount * Do * sizeof(float), outPageOffset = outPageSize * i;// in bytes + inputBuffer.advise(inPageOffset, inPageSize, device, {cudaMemAdviseSetReadMostly, cudaMemAdviseSetPreferredLocation}); + inputBuffer.prefetch(inPageOffset, inPageSize, device, stream); + outputBuffer.advise(outPageOffset, outPageSize, device, cudaMemAdviseSetPreferredLocation); + for (auto other = devStreamMap.begin(); other != devStreamMap.end(); ++other) { + inputBuffer.advise( inPageOffset, inPageSize, other->first, cudaMemAdviseSetAccessedBy); + outputBuffer.advise(outPageOffset, outPageSize, other->first, cudaMemAdviseSetAccessedBy); + } + cudaStreamSynchronize(stream); + auto d_indexGrid = reinterpret_cast(indexHandle.deviceData(device)); + + // Run 10 warmup iterations + float* stencilPtr = (float*)stencilDevicePtr; + dim3 blockDim(256); + for (int k = 0; k < 10; ++k) { + stencilConvolve_v7<<>>( + d_indexGrid, deviceLeafNodeOffset, inputBuffer.data(), nullptr, nullptr, stencilPtr, outputBuffer.data(), nullptr); + } + cudaStreamSynchronize(stream); + + timers[i]->start(); + stencilConvolve_v7<<>>( + d_indexGrid, deviceLeafNodeOffset, inputBuffer.data(), nullptr, nullptr, stencilPtr, outputBuffer.data(), nullptr); + timers[i]->record(); + cudaFree(stencilDevicePtr); + }); + } + for (int i = 0; i < deviceCount; ++i) threads[i].join(); + delete [] stencilHostPtr; + + streamIter = devStreamMap.begin(); + for (int i = 0; i < deviceCount; ++i, ++streamIter) { + timers[i]->print("Device " + std::to_string(streamIter->first) + " GPU convolution ", std::cout); + delete timers[i]; + } + delete [] timers; + std::cout << "Multi-GPU sparse convolution test complete!" 
<< std::endl;
+}// testConvolution
+
+int main(int argc, char** argv)
+{
+    testConvolution();
+    return 0;
+}
diff --git a/nanovdb/nanovdb/examples/ex_modify_nanovdb_thrust/modify_nanovdb_thrust.cc b/nanovdb/nanovdb/examples/ex_modify_nanovdb_thrust/modify_nanovdb_thrust.cc
index 2ff54fdc75..f2bb4ecd3e 100644
--- a/nanovdb/nanovdb/examples/ex_modify_nanovdb_thrust/modify_nanovdb_thrust.cc
+++ b/nanovdb/nanovdb/examples/ex_modify_nanovdb_thrust/modify_nanovdb_thrust.cc
@@ -16,7 +16,7 @@ int main()
         auto handle = nanovdb::tools::createLevelSetSphere(100.0f);
 
         using GridT = nanovdb::FloatGrid;
-        handle.deviceUpload(0, false); // Copy the NanoVDB grid to the GPU asynchronously
+        handle.deviceUpload(nullptr, false); // Copy the NanoVDB grid to the GPU asynchronously
         const GridT* grid = handle.grid(); // get a (raw) const pointer to a NanoVDB grid of value type float on the CPU
         GridT* deviceGrid = handle.deviceGrid(); // get a (raw) pointer to a NanoVDB grid of value type float on the GPU
 
@@ -32,7 +32,7 @@ int main()
 
         scaleActiveVoxels(deviceGrid, grid->tree().nodeCount(0), 2.0f);
 
-        handle.deviceDownload(0, true); // Copy the NanoVDB grid to the CPU synchronously
+        handle.deviceDownload(nullptr, true); // Copy the NanoVDB grid to the CPU synchronously
 
         std::cout << "Value after scaling = " << grid->tree().getValue(nanovdb::Coord(101,0,0)) << std::endl;
     }
@@ -40,4 +40,4 @@ int main()
         std::cerr << "An exception occurred: \"" << e.what() << "\"" << std::endl;
     }
     return 0;
-}
\ No newline at end of file
+}
diff --git a/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/main.cc b/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/main.cc
index 697afcf857..a82708b136 100644
--- a/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/main.cc
+++ b/nanovdb/nanovdb/examples/ex_raytrace_fog_volume/main.cc
@@ -37,8 +37,7 @@ int main(int ac, char** av)
     const int width = 1024;
     const int height = 1024;
-    BufferT   imageBuffer;
-    imageBuffer.init(width * height * sizeof(float));
+    BufferT   imageBuffer(width * height * sizeof(float));
 
     runNanoVDB(handle, numIterations, width, height, imageBuffer);
 #if defined(NANOVDB_USE_OPENVDB)
diff --git a/nanovdb/nanovdb/examples/ex_raytrace_level_set/main.cc b/nanovdb/nanovdb/examples/ex_raytrace_level_set/main.cc
index 71cd8959af..c31780c8f4 100644
--- a/nanovdb/nanovdb/examples/ex_raytrace_level_set/main.cc
+++ b/nanovdb/nanovdb/examples/ex_raytrace_level_set/main.cc
@@ -37,8 +37,7 @@ int main(int ac, char** av)
     const int width = 1024;
     const int height = 1024;
-    BufferT   imageBuffer;
-    imageBuffer.init(width * height * sizeof(float));
+    BufferT   imageBuffer(width * height * sizeof(float));
 
     runNanoVDB(handle, numIterations, width, height, imageBuffer);
 #if defined(NANOVDB_USE_OPENVDB)
diff --git a/nanovdb/nanovdb/io/IO.h b/nanovdb/nanovdb/io/IO.h
index a7110846a9..5d5fc94141 100644
--- a/nanovdb/nanovdb/io/IO.h
+++ b/nanovdb/nanovdb/io/IO.h
@@ -14,13 +14,20 @@
 
     \note This file does NOT depend on OpenVDB, but optionally on ZIP and BLOSC
 
-    \details NanoVDB files take on of two formats:
+    \details NanoVDB files take on one of the two following formats:
     1) multiple segments each with multiple grids (segments have easy-to-access metadata about their grids)
    2) starting with version 32.6.0 nanovdb files also support a raw buffer with one or more grids (just a dump of
       a raw grid buffer, so no new metadata in headers as when using segments mentioned above).
-    // 1: Segment: FileHeader, MetaData0, gridName0...MetaDataN, gridNameN, compressed Grid0, ... compressed GridN
-    // 2: Raw: Grid0, ...
GridN + Example of case 1: + | <------------------------------------ segment 1 with N grids --------------------------------------> | <--- segment 2 ... + FileHeader, FileMetaData0, gridName0...FileMetaDataN, gridNameN, compressed Grid0, ... compressed GridN FileHeader ... + Example of case 2: + | <-- grid buffer ---> | + Grid0, Grid1, ... GridN + + Note that FileHeader and FileMetaData (both defined in NanoVDB.h) have fixed sizes of respectively 16B and 176B. + However, GridNameX and GridX have variable sizes! */ #ifndef NANOVDB_IO_H_HAS_BEEN_INCLUDED @@ -722,9 +729,13 @@ inline uint64_t stringHash(const char* c_str) } // namespace io ====================================================================== +} // namespace nanovdb =================================================================== + +// the following stream specializations should not be namespaced! + template inline std::ostream& -operator<<(std::ostream& os, const math::BBox>& b) +operator<<(std::ostream& os, const nanovdb::math::BBox>& b) { os << "(" << b[0][0] << "," << b[0][1] << "," << b[0][2] << ") -> " << "(" << b[1][0] << "," << b[1][1] << "," << b[1][2] << ")"; @@ -732,7 +743,7 @@ operator<<(std::ostream& os, const math::BBox>& b) } inline std::ostream& -operator<<(std::ostream& os, const CoordBBox& b) +operator<<(std::ostream& os, const nanovdb::CoordBBox& b) { os << "(" << b[0][0] << "," << b[0][1] << "," << b[0][2] << ") -> " << "(" << b[1][0] << "," << b[1][1] << "," << b[1][2] << ")"; @@ -740,7 +751,7 @@ operator<<(std::ostream& os, const CoordBBox& b) } inline std::ostream& -operator<<(std::ostream& os, const Coord& ijk) +operator<<(std::ostream& os, const nanovdb::Coord& ijk) { os << "(" << ijk[0] << "," << ijk[1] << "," << ijk[2] << ")"; return os; @@ -748,7 +759,7 @@ operator<<(std::ostream& os, const Coord& ijk) template inline std::ostream& -operator<<(std::ostream& os, const math::Vec3& v) +operator<<(std::ostream& os, const nanovdb::math::Vec3& v) { os << "(" << v[0] << "," << v[1] << "," << v[2] << ")"; return os; @@ -756,12 +767,10 @@ operator<<(std::ostream& os, const math::Vec3& v) template inline std::ostream& -operator<<(std::ostream& os, const math::Vec4& v) +operator<<(std::ostream& os, const nanovdb::math::Vec4& v) { os << "(" << v[0] << "," << v[1] << "," << v[2] << "," << v[3] << ")"; return os; } -} // namespace nanovdb =================================================================== - #endif // NANOVDB_IO_H_HAS_BEEN_INCLUDED diff --git a/nanovdb/nanovdb/math/Math.h b/nanovdb/nanovdb/math/Math.h index da3a616248..3c33944b02 100644 --- a/nanovdb/nanovdb/math/Math.h +++ b/nanovdb/nanovdb/math/Math.h @@ -1042,10 +1042,10 @@ struct BaseBBox return *this; } - //__hostdev__ BaseBBox expandBy(typename Vec3T::ValueType padding) const - //{ - // return BaseBBox(mCoord[0].offsetBy(-padding),mCoord[1].offsetBy(padding)); - //} +// __hostdev__ BaseBBox expandBy(typename Vec3T::ValueType padding) const +// { +// return BaseBBox(mCoord[0].offsetBy(-padding),mCoord[1].offsetBy(padding)); +// } __hostdev__ bool isInside(const Vec3T& xyz) { if (xyz[0] < mCoord[0][0] || xyz[1] < mCoord[0][1] || xyz[2] < mCoord[0][2]) diff --git a/nanovdb/nanovdb/python/PyGridHandle.h b/nanovdb/nanovdb/python/PyGridHandle.h index 190c6ab15f..b102899365 100644 --- a/nanovdb/nanovdb/python/PyGridHandle.h +++ b/nanovdb/nanovdb/python/PyGridHandle.h @@ -17,7 +17,7 @@ template nb::class_> defineGridHa return nb::class_>(m, name) .def(nb::init<>()) .def("reset", &nanovdb::GridHandle::reset) - .def("size", 
&nanovdb::GridHandle::size) + .def("size", &nanovdb::GridHandle::bufferSize) .def("isEmpty", &nanovdb::GridHandle::isEmpty) .def("empty", &nanovdb::GridHandle::empty) .def( diff --git a/nanovdb/nanovdb/tools/GridStats.h b/nanovdb/nanovdb/tools/GridStats.h index fac54b20f7..81383468e3 100644 --- a/nanovdb/nanovdb/tools/GridStats.h +++ b/nanovdb/nanovdb/tools/GridStats.h @@ -18,6 +18,7 @@ #include #ifdef NANOVDB_USE_TBB +#include #include #include #endif diff --git a/nanovdb/nanovdb/tools/cuda/IndexToGrid.cuh b/nanovdb/nanovdb/tools/cuda/IndexToGrid.cuh index d26b09a24f..9e8431260c 100644 --- a/nanovdb/nanovdb/tools/cuda/IndexToGrid.cuh +++ b/nanovdb/nanovdb/tools/cuda/IndexToGrid.cuh @@ -359,7 +359,9 @@ inline BufferT IndexToGrid::getBuffer(const BufferT &pool) mNodeAcc.meta = mNodeAcc.node[0] + NanoLeaf::DataType::memUsage()*mNodeAcc.nodeCount[0];// leaf nodes end and blind meta data begins mNodeAcc.blind = mNodeAcc.meta + 0*sizeof(GridBlindMetaData); // meta data ends and blind data begins mNodeAcc.size = mNodeAcc.blind;// end of buffer - auto buffer = BufferT::create(mNodeAcc.size, &pool, false, mStream); + int device = 0; + cudaCheck(cudaGetDevice(&device)); + auto buffer = BufferT::create(mNodeAcc.size, &pool, device, mStream); mNodeAcc.d_dstPtr = buffer.deviceData(); if (mNodeAcc.d_dstPtr == nullptr) throw std::runtime_error("Failed memory allocation on the device"); diff --git a/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh b/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh index bcf335efe1..07c3dab2ba 100644 --- a/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh +++ b/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -39,7 +40,6 @@ namespace tools::cuda {// ====================================================== /// mainly used as a means to build a BVH acceleration structure for points, e.g. for efficient rendering. /// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world space. Dereferencing should return Vec3f or Vec3d. /// @tparam BufferT Template type of buffer used for memory allocation on the device -/// @tparam AllocT Template type of optional device allocator for internal temporary memory /// @param dWorldPoints Raw or fancy pointer to list of point coordinates in world space on the device /// @param pointCount number of point in the list @c d_world /// @param voxelSize Size of a voxel in world units used for the output grid @@ -49,7 +49,7 @@ namespace tools::cuda {// ====================================================== /// @param stream optional CUDA stream (defaults to CUDA stream 0) /// @return Returns a handle with a grid of type NanoGrid where point information, e.g. coordinates, /// are represented as blind data defined by @c type. -template +template GridHandle pointsToGrid(const PtrT dWorldPoints, int pointCount, @@ -64,7 +64,6 @@ pointsToGrid(const PtrT dWorldPoints, /// mainly used as a means to build a BVH acceleration structure for points, e.g. for efficient rendering. /// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world space. Dereferencing should return Vec3f or Vec3d. 
/// @tparam BufferT Template type of buffer used for memory allocation on the device -/// @tparam AllocT Template type of optional device allocator for internal temporary memory /// @param dWorldPoints Raw or fancy pointer to list of point coordinates in world space on the device /// @param pointCount total number of point in the list @c d_world /// @param maxPointsPerVoxel Max density of points per voxel, i.e. maximum number of points in any voxel @@ -78,7 +77,7 @@ pointsToGrid(const PtrT dWorldPoints, /// @param stream optional CUDA stream (defaults to CUDA stream 0) /// @return Returns a handle with a grid of type NanoGrid where point information, e.g. coordinates, /// are represented as blind data defined by @c type. -template +template GridHandle pointsToGrid(const PtrT dWorldPoints, int pointCount, @@ -91,7 +90,7 @@ pointsToGrid(const PtrT dWorldPoints, //----------------------------------------------------------------------------------------------------- -template +template GridHandle pointsToGrid(std::vector> pointSet, const BufferT &buffer = BufferT(), @@ -105,13 +104,12 @@ pointsToGrid(std::vector> pointSe /// @tparam BuildT Template type of the return grid /// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world space. Dereferencing should return Vec3f or Vec3d. /// @tparam BufferT Template type of buffer used for memory allocation on the device -/// @tparam AllocT Template type of optional device allocator for internal temporary memory /// @param dGridVoxels Raw or fancy pointer to list of voxel coordinates in grid (or index) space on the device /// @param pointCount number of voxel in the list @c dGridVoxels /// @param voxelSize Size of a voxel in world units used for the output grid /// @param buffer Instance of the device buffer used for memory allocation /// @return Returns a handle with the grid of type NanoGrid -template +template GridHandle voxelsToGrid(const PtrT dGridVoxels, size_t voxelCount, @@ -121,7 +119,7 @@ voxelsToGrid(const PtrT dGridVoxels, //------------------------------------------------------------------------------------------------------- -template +template GridHandle voxelsToGrid(std::vector> pointSet, const BufferT &buffer = BufferT(), @@ -131,7 +129,7 @@ voxelsToGrid(std::vector> pointSet, /// @brief Example class of a fancy pointer that can optionally be used as a template for writing /// a custom fancy pointer that allows for particle coordinates to be arrange non-linearly -/// in memory. For instance with coordinates are interlaced with other dats, i.e. an array +/// in memory. For instance, when coordinates are interlaced with other data, e.g. an array /// of structs, a custom implementation of fancy_ptr::operator[](size_t i) can account for /// strides that skip other interlaces data. /// @tparam T Template type that specifies the type use for the coordinates of the points @@ -266,7 +264,7 @@ __hostdev__ inline static Vec3T voxelToWorld(const Vec3f &voxel, const Coord &ij namespace tools::cuda { -template +template class PointsToGrid { public: @@ -296,6 +294,7 @@ public: /// @param stream optional CUDA stream (defaults to CUDA stream 0) PointsToGrid(const Map &map, cudaStream_t stream = 0) : mStream(stream) + , mTimer(stream) , mPointType(util::is_same::value ? 
PointType::Default : PointType::Disable) { mData.map = map; @@ -392,34 +391,30 @@ private: int mTolerance{1}, mMaxIterations{1}; CheckMode mChecksum{CheckMode::Disable}; - // wrapper of AllocT, defaulting to cub::CachingDeviceAllocator, which offers a shared scratch space struct Allocator { - AllocT mAllocator; void* d_scratch; size_t scratchSize, actualScratchSize; Allocator() : d_scratch(nullptr), scratchSize(0), actualScratchSize(0) {} ~Allocator() { - if (scratchSize > 0) this->free(d_scratch);// a bug in cub makes this necessary - mAllocator.FreeAllCached(); + if (scratchSize > 0) cudaFree(d_scratch); } + template T* alloc(size_t count, cudaStream_t stream) { T* d_ptr = nullptr; - cudaCheck(mAllocator.DeviceAllocate((void**)&d_ptr, sizeof(T)*count, stream)); + cudaCheck(cudaMallocAsync((void**)&d_ptr, sizeof(T)*count, stream)); return d_ptr; } + template T* alloc(cudaStream_t stream) {return this->template alloc(1, stream);} - void free(void *d_ptr) {if (d_ptr) cudaCheck(mAllocator.DeviceFree(d_ptr));} - template - void free(void *d_ptr, T... other) { - if (d_ptr) cudaCheck(mAllocator.DeviceFree(d_ptr)); - this->free(other...); - } - void adjustScratch(cudaStream_t stream){ + + void free(void *d_ptr, cudaStream_t stream) {cudaCheck(cudaFreeAsync(d_ptr, stream));} + + void adjustScratch(cudaStream_t stream) { if (scratchSize > actualScratchSize) { - if (actualScratchSize>0) cudaCheck(mAllocator.DeviceFree(d_scratch)); - cudaCheck(mAllocator.DeviceAllocate((void**)&d_scratch, scratchSize, stream)); + if (actualScratchSize>0) cudaCheck(cudaFreeAsync(d_scratch, stream)); + cudaCheck(cudaMallocAsync((void**)&d_scratch, scratchSize, stream)); actualScratchSize = scratchSize; } } @@ -437,8 +432,8 @@ namespace kernels { /// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' /// or 'else' block of a constexpr if statement. /// function in a lambda through lambdaKernel wrapper defined in CudaUtils.h. -template -__global__ void fillValueIndexKernel(const size_t numItems, uint64_t* devValueIndex, typename PointsToGrid::Data* d_data) { +template +__global__ void fillValueIndexKernel(const size_t numItems, uint64_t* devValueIndex, typename PointsToGrid::Data* d_data) { const int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid >= numItems) return; devValueIndex[tid] = static_cast(d_data->getLeaf(tid).mValueMask.countOn()); @@ -450,8 +445,8 @@ __global__ void fillValueIndexKernel(const size_t numItems, uint64_t* devValueIn /// to fix the following on Windows platform: /// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' /// or 'else' block of a constexpr if statement. -template -__global__ void leafPrefixSumKernel(const size_t numItems, uint64_t* devValueIndexPrefix, typename PointsToGrid::Data* d_data) { +template +__global__ void leafPrefixSumKernel(const size_t numItems, uint64_t* devValueIndexPrefix, typename PointsToGrid::Data* d_data) { const int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid >= numItems) return; @@ -477,8 +472,8 @@ __global__ void leafPrefixSumKernel(const size_t numItems, uint64_t* devValueInd /// to fix the following on Windows platform: /// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' /// or 'else' block of a constexpr if statement. 
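/// A minimal sketch of the disallowed pattern (hypothetical, for illustration only):
///     if constexpr(BuildTraits::is_indexmask) {
///         util::cuda::lambdaKernel<<<numBlocks, threadsPerBlock>>>(numItems,
///             [=] __device__ (size_t tid) { /* ... */ });// extended lambda inside constexpr-if
///     }
/// Hoisting the lambda bodies into the named __global__ kernels defined here sidesteps the limitation.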
-template -__global__ void setMaskEqValMaskKernel(const size_t numItems, typename PointsToGrid::Data* d_data) { +template +__global__ void setMaskEqValMaskKernel(const size_t numItems, typename PointsToGrid::Data* d_data) { const int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid >= numItems) return; auto &leaf = d_data->getLeaf(tid); @@ -505,12 +500,12 @@ __global__ void setMaskEqValMaskKernel(const size_t numItems, typename PointsToG //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template template inline GridHandle -PointsToGrid::getHandle(const PtrT points, - size_t pointCount, - const BufferT &pool) +PointsToGrid::getHandle(const PtrT points, + size_t pointCount, + const BufferT &pool) { if (mVerbose==1) mTimer.start("\nCounting nodes"); this->countNodes(points, pointCount); @@ -538,9 +533,11 @@ PointsToGrid::getHandle(const PtrT points, if (mVerbose==1) mTimer.stop(); if (mVerbose==1) mTimer.restart("Computation of checksum"); - tools::cuda::updateChecksum((GridData*)buffer.deviceData(), mChecksum); + tools::cuda::updateChecksum((GridData*)buffer.deviceData(), mChecksum, mStream); if (mVerbose==1) mTimer.stop(); + cudaStreamSynchronize(mStream); + return GridHandle(std::move(buffer)); }// PointsToGrid::getHandle @@ -562,9 +559,9 @@ struct ShiftRightIterator : public cub::TransformInputIterator +template template -void PointsToGrid::countNodes(const PtrT points, size_t pointCount) +void PointsToGrid::countNodes(const PtrT points, size_t pointCount) { using Vec3T = typename util::remove_const::element_type>::type; if constexpr(util::is_same::value) { @@ -616,7 +613,7 @@ jump:// this marks the beginning of the actual algorithm }, mDeviceData, points); cudaCheckError(); if (mVerbose==2) mTimer.restart("DeviceRadixSort of "+std::to_string(pointCount)+" tile keys"); - CALL_CUBS(DeviceRadixSort::SortPairs, d_keys, mData.d_keys, d_indx, mData.d_indx, pointCount, 0, 62);// 21 bits per coord + CALL_CUBS(DeviceRadixSort::SortPairs, d_keys, mData.d_keys, d_indx, mData.d_indx, pointCount, 0, 63);// 21 bits per coord std::swap(d_indx, mData.d_indx);// sorted indices are now in d_indx if (mVerbose==2) mTimer.restart("Allocate runs"); @@ -626,13 +623,14 @@ jump:// this marks the beginning of the actual algorithm if (mVerbose==2) mTimer.restart("DeviceRunLengthEncode tile keys"); CALL_CUBS(DeviceRunLengthEncode::Encode, mData.d_keys, d_keys, d_points_per_tile, d_node_count+2, pointCount); cudaCheck(cudaMemcpyAsync(mData.nodeCount+2, d_node_count+2, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + cudaCheck(cudaStreamSynchronize(mStream)); mData.d_tile_keys = mMemPool.template alloc(mData.nodeCount[2], mStream); cudaCheck(cudaMemcpyAsync(mData.d_tile_keys, d_keys, mData.nodeCount[2]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); if (mVerbose) mTimer.restart("DeviceRadixSort of " + std::to_string(pointCount) + " voxel keys in " + std::to_string(mData.nodeCount[2]) + " tiles"); uint32_t *points_per_tile = new uint32_t[mData.nodeCount[2]]; cudaCheck(cudaMemcpyAsync(points_per_tile, d_points_per_tile, mData.nodeCount[2]*sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); - mMemPool.free(d_points_per_tile); + mMemPool.free(d_points_per_tile, mStream); for (uint32_t id = 0, offset = 0; id < mData.nodeCount[2]; ++id) { const uint32_t count = points_per_tile[id]; @@ -651,24 +649,30 @@ jump:// this marks the beginning of the actual algorithm 
CALL_CUBS(DeviceRadixSort::SortPairs, d_keys + offset, mData.d_keys + offset, d_indx + offset, mData.d_indx + offset, count, 0, 36);// 9+12+15=36
         offset += count;
     }
-    mMemPool.free(d_indx);
+    mMemPool.free(d_indx, mStream);
     delete [] points_per_tile;
 
     if (mVerbose==2) mTimer.restart("Count points per voxel");
+    cudaEvent_t copyEvent;
+    cudaCheck(cudaEventCreate(&copyEvent));
     mData.pointsPerVoxel = mMemPool.template alloc(pointCount, mStream);
     uint32_t *d_voxel_count = mMemPool.template alloc(mStream);
     CALL_CUBS(DeviceRunLengthEncode::Encode, mData.d_keys, d_keys, mData.pointsPerVoxel, d_voxel_count, pointCount);
     cudaCheck(cudaMemcpyAsync(&mData.voxelCount, d_voxel_count, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream));
-    mMemPool.free(d_voxel_count);
+    cudaCheck(cudaEventRecord(copyEvent, mStream));
+    mMemPool.free(d_voxel_count, mStream);
 
     if (util::is_same::value) {
         if (mVerbose==2) mTimer.restart("Count max points per voxel");
         uint32_t *d_maxPointsPerVoxel = mMemPool.template alloc(mStream), maxPointsPerVoxel;
+        cudaCheck(cudaEventSynchronize(copyEvent));
         CALL_CUBS(DeviceReduce::Max, mData.pointsPerVoxel, d_maxPointsPerVoxel, mData.voxelCount);
         cudaCheck(cudaMemcpyAsync(&maxPointsPerVoxel, d_maxPointsPerVoxel, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream));
-        mMemPool.free(d_maxPointsPerVoxel);
+        cudaCheck(cudaEventRecord(copyEvent, mStream));
+        mMemPool.free(d_maxPointsPerVoxel, mStream);
         double dx = mData.map.getVoxelSize()[0];
+        cudaCheck(cudaEventSynchronize(copyEvent));
         if (++iterCounter >= mMaxIterations || pointCount == 1u || math::Abs((int)maxPointsPerVoxel - (int)mMaxPointsPerVoxel) <= mTolerance) {
             mMaxPointsPerVoxel = maxPointsPerVoxel;
         } else {
@@ -687,53 +691,66 @@ jump:// this marks the beginning of the actual algorithm
             }
             if (mVerbose==2) printf("\ntarget density = %u, current density = %u current dx = %f, next dx = %f\n", mMaxPointsPerVoxel, maxPointsPerVoxel, tmp.dx, dx);
             mData.map = Map(dx);
-            mMemPool.free(mData.d_keys, mData.d_indx, d_keys, mData.d_tile_keys, d_node_count, mData.pointsPerVoxel);
+            mMemPool.free(mData.d_keys, mStream);
+            mMemPool.free(mData.d_indx, mStream);
+            mMemPool.free(d_keys, mStream);
+            mMemPool.free(mData.d_tile_keys, mStream);
+            mMemPool.free(d_node_count, mStream);
+            mMemPool.free(mData.pointsPerVoxel, mStream);
             goto jump;
         }
     }
     if (iterCounter>1 && mVerbose) std::cerr << "Used " << iterCounter << " attempts to determine dx that produces the target point density\n\n";
 
     if (mVerbose==2) mTimer.restart("Compute prefix sum of points per voxel");
+    cudaCheck(cudaEventSynchronize(copyEvent));
     mData.pointsPerVoxelPrefix = mMemPool.template alloc(mData.voxelCount, mStream);
     CALL_CUBS(DeviceScan::ExclusiveSum, mData.pointsPerVoxel, mData.pointsPerVoxelPrefix, mData.voxelCount);
 
     mData.pointsPerLeaf = mMemPool.template alloc(pointCount, mStream);
     CALL_CUBS(DeviceRunLengthEncode::Encode, ShiftRightIterator<9>(mData.d_keys), d_keys, mData.pointsPerLeaf, d_node_count, pointCount);
     cudaCheck(cudaMemcpyAsync(mData.nodeCount, d_node_count, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream));
+    cudaCheck(cudaEventRecord(copyEvent, mStream));
 
     if constexpr(util::is_same::value) {
         uint32_t *d_maxPointsPerLeaf = mMemPool.template alloc(mStream);
+        cudaCheck(cudaEventSynchronize(copyEvent));
         CALL_CUBS(DeviceReduce::Max, mData.pointsPerLeaf, d_maxPointsPerLeaf, mData.nodeCount[0]);
         cudaCheck(cudaMemcpyAsync(&mMaxPointsPerLeaf, d_maxPointsPerLeaf, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream));
         //printf("\n Leaf count = %u, max points per leaf = %u\n",
mData.nodeCount[0], mMaxPointsPerLeaf); if (mMaxPointsPerLeaf > std::numeric_limits::max()) { throw std::runtime_error("Too many points per leaf: "+std::to_string(mMaxPointsPerLeaf)); } - mMemPool.free(d_maxPointsPerLeaf); + mMemPool.free(d_maxPointsPerLeaf, mStream); } + cudaCheck(cudaEventSynchronize(copyEvent)); mData.pointsPerLeafPrefix = mMemPool.template alloc(mData.nodeCount[0], mStream); CALL_CUBS(DeviceScan::ExclusiveSum, mData.pointsPerLeaf, mData.pointsPerLeafPrefix, mData.nodeCount[0]); + cudaCheck(cudaStreamSynchronize(mStream)); mData.d_leaf_keys = mMemPool.template alloc(mData.nodeCount[0], mStream); cudaCheck(cudaMemcpyAsync(mData.d_leaf_keys, d_keys, mData.nodeCount[0]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); CALL_CUBS(DeviceSelect::Unique, ShiftRightIterator<12>(mData.d_leaf_keys), d_keys, d_node_count+1, mData.nodeCount[0]);// count lower nodes cudaCheck(cudaMemcpyAsync(mData.nodeCount+1, d_node_count+1, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + cudaCheck(cudaStreamSynchronize(mStream)); mData.d_lower_keys = mMemPool.template alloc(mData.nodeCount[1], mStream); cudaCheck(cudaMemcpyAsync(mData.d_lower_keys, d_keys, mData.nodeCount[1]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); - mMemPool.free(d_keys, d_node_count); + mMemPool.free(d_keys, mStream); + mMemPool.free(d_node_count, mStream); if (mVerbose==2) mTimer.stop(); + cudaCheck(cudaEventDestroy(copyEvent)); //printf("Leaf count = %u, lower count = %u, upper count = %u\n", mData.nodeCount[0], mData.nodeCount[1], mData.nodeCount[2]); }// PointsToGrid::countNodes //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template template -inline BufferT PointsToGrid::getBuffer(const PtrT, size_t pointCount, const BufferT &pool) +inline BufferT PointsToGrid::getBuffer(const PtrT, size_t pointCount, const BufferT &pool) { auto sizeofPoint = [&]()->size_t{ switch (mPointType){ @@ -760,7 +777,10 @@ inline BufferT PointsToGrid::getBuffer(const PtrT, size_t pointC mData.blind = mData.meta + sizeof(GridBlindMetaData)*int( mPointType!=PointType::Disable ); // meta data ends and blind data begins mData.size = mData.blind + pointCount*sizeofPoint();// end of buffer - auto buffer = BufferT::create(mData.size, &pool, false);// only allocate buffer on the device + int device = 0; + cudaGetDevice(&device); + auto buffer = BufferT::create(mData.size, &pool, device, mStream);// only allocate buffer on the device + mData.d_bufferPtr = buffer.deviceData(); if (mData.d_bufferPtr == nullptr) throw std::runtime_error("Failed to allocate grid buffer on the device"); cudaCheck(cudaMemcpyAsync(mDeviceData, &mData, sizeof(Data), cudaMemcpyHostToDevice, mStream));// copy Data CPU -> GPU @@ -769,9 +789,9 @@ inline BufferT PointsToGrid::getBuffer(const PtrT, size_t pointC //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template template -inline void PointsToGrid::processGridTreeRoot(const PtrT points, size_t pointCount) +inline void PointsToGrid::processGridTreeRoot(const PtrT points, size_t pointCount) { using Vec3T = typename util::remove_const::element_type>::type; util::cuda::lambdaKernel<<<1, 1, 0, mStream>>>(1, [=] __device__(size_t, Data *d_data, PointType pointType) { @@ -906,8 +926,8 @@ inline void 
PointsToGrid::processGridTreeRoot(const PtrT points, //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template -inline void PointsToGrid::processUpperNodes() +template +inline void PointsToGrid::processUpperNodes() { util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { auto &root = d_data->getRoot(); @@ -934,7 +954,7 @@ inline void PointsToGrid::processUpperNodes() }, mDeviceData); cudaCheckError(); - mMemPool.free(mData.d_tile_keys); + mMemPool.free(mData.d_tile_keys, mStream); const uint64_t valueCount = mData.nodeCount[2] << 15; util::cuda::lambdaKernel<<>>(valueCount, [=] __device__(size_t tid, Data *d_data) { @@ -946,8 +966,8 @@ inline void PointsToGrid::processUpperNodes() //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template -inline void PointsToGrid::processLowerNodes() +template +inline void PointsToGrid::processLowerNodes() { util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { auto &root = d_data->getRoot(); @@ -976,9 +996,9 @@ inline void PointsToGrid::processLowerNodes() //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template template -inline void PointsToGrid::processLeafNodes(const PtrT points) +inline void PointsToGrid::processLeafNodes(const PtrT points) { const uint8_t flags = static_cast(mData.flags.data());// mIncludeStats ? 16u : 0u;// 4th bit indicates stats @@ -1026,7 +1046,11 @@ inline void PointsToGrid::processLeafNodes(const PtrT points) } }, mDeviceData); cudaCheckError(); - mMemPool.free(mData.d_keys, mData.pointsPerVoxel, mData.pointsPerVoxelPrefix, mData.pointsPerLeafPrefix, mData.pointsPerLeaf); + mMemPool.free(mData.d_keys, mStream); + mMemPool.free(mData.pointsPerVoxel, mStream); + mMemPool.free(mData.pointsPerVoxelPrefix, mStream); + mMemPool.free(mData.pointsPerLeafPrefix, mStream); + mMemPool.free(mData.pointsPerLeaf, mStream); if (mVerbose==2) mTimer.restart("set inactive voxel values"); const uint64_t denseVoxelCount = mData.nodeCount[0] << 9; @@ -1046,18 +1070,18 @@ inline void PointsToGrid::processLeafNodes(const PtrT points) if (mVerbose==2) mTimer.restart("prefix-sum for index grid"); uint64_t *devValueIndex = mMemPool.template alloc(mData.nodeCount[0], mStream); auto devValueIndexPrefix = mMemPool.template alloc(mData.nodeCount[0], mStream); - kernels::fillValueIndexKernel<<>>(mData.nodeCount[0], devValueIndex, mDeviceData); + kernels::fillValueIndexKernel<<>>(mData.nodeCount[0], devValueIndex, mDeviceData); cudaCheckError(); CALL_CUBS(DeviceScan::InclusiveSum, devValueIndex, devValueIndexPrefix, mData.nodeCount[0]); - mMemPool.free(devValueIndex); - kernels::leafPrefixSumKernel<<>>(mData.nodeCount[0], devValueIndexPrefix, mDeviceData); + mMemPool.free(devValueIndex, mStream); + kernels::leafPrefixSumKernel<<>>(mData.nodeCount[0], devValueIndexPrefix, mDeviceData); cudaCheckError(); - mMemPool.free(devValueIndexPrefix); + mMemPool.free(devValueIndexPrefix, mStream); } if constexpr(BuildTraits::is_indexmask) { if (mVerbose==2) mTimer.restart("leaf.mMask = leaf.mValueMask"); - kernels::setMaskEqValMaskKernel<<>>(mData.nodeCount[0], mDeviceData); + 
kernels::setMaskEqValMaskKernel<<>>(mData.nodeCount[0], mDeviceData); cudaCheckError(); } if (mVerbose==2) mTimer.stop(); @@ -1065,11 +1089,11 @@ inline void PointsToGrid::processLeafNodes(const PtrT points) //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template template -inline void PointsToGrid::processPoints(const PtrT, size_t) +inline void PointsToGrid::processPoints(const PtrT, size_t) { - mMemPool.free(mData.d_indx); + mMemPool.free(mData.d_indx, mStream); } //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- @@ -1130,16 +1154,17 @@ inline void PointsToGrid::processPoints(const PtrT points, size_t pointCo default: printf("Internal error in PointsToGrid::processPoints\n"); } - mMemPool.free(mData.d_indx); + mMemPool.free(mData.d_indx, mStream); }// PointsToGrid::processPoints //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template -inline void PointsToGrid::processBBox() +template +inline void PointsToGrid::processBBox() { if (mData.flags.isMaskOff(GridFlags::HasBBox)) { - mMemPool.free(mData.d_leaf_keys, mData.d_lower_keys); + mMemPool.free(mData.d_leaf_keys, mStream); + mMemPool.free(mData.d_lower_keys, mStream); return; } @@ -1158,7 +1183,7 @@ inline void PointsToGrid::processBBox() leaf.updateBBox(); lower.mBBox.expandAtomic(leaf.bbox()); }, mDeviceData); - mMemPool.free(mData.d_leaf_keys); + mMemPool.free(mData.d_leaf_keys, mStream); cudaCheckError(); // reset bbox in upper nodes @@ -1174,7 +1199,7 @@ inline void PointsToGrid::processBBox() auto &lower = d_data->getLower(tid); upper.mBBox.expandAtomic(lower.bbox()); }, mDeviceData); - mMemPool.free(mData.d_lower_keys); + mMemPool.free(mData.d_lower_keys, mStream); cudaCheckError() // propagate bbox from upper -> root/parent node @@ -1192,44 +1217,44 @@ inline void PointsToGrid::processBBox() //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template GridHandle// Grid voxelsToGrid(const PtrT d_ijk, size_t voxelCount, double voxelSize, const BufferT &buffer, cudaStream_t stream) { - PointsToGrid converter(voxelSize, Vec3d(0.0), stream); + PointsToGrid converter(voxelSize, Vec3d(0.0), stream); return converter.getHandle(d_ijk, voxelCount, buffer); } //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template GridHandle// Grid with PointType coordinates as blind data pointsToGrid(const PtrT d_xyz, int pointCount, int maxPointsPerVoxel, int tolerance, int maxIterations, PointType type, const BufferT &buffer, cudaStream_t stream) { - PointsToGrid converter(maxPointsPerVoxel, tolerance, maxIterations, Vec3d(0.0), stream); + PointsToGrid converter(maxPointsPerVoxel, tolerance, maxIterations, stream); converter.setPointType(type); return converter.getHandle(d_xyz, pointCount, buffer); } 
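// Example usage (a sketch, not part of the header; assumes d_points is a device array of
// world-space point positions and that the default template arguments apply):
//     auto handle = nanovdb::tools::cuda::pointsToGrid(d_points, pointCount,
//         /*maxPointsPerVoxel=*/8, /*tolerance=*/1, /*maxIterations=*/1, PointType::World32);
//     auto *d_grid = handle.deviceGrid();// grid with point coordinates stored as blind data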
//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template GridHandle pointsToGrid(std::vector> vec, const BufferT &buffer, cudaStream_t stream) { std::vector> handles; - for (auto &p : vec) handles.push_back(pointsToGrid(std::get<0>(p), std::get<1>(p), std::get<2>(p), std::get<3>(p), buffer, stream)); + for (auto &p : vec) handles.push_back(pointsToGrid(std::get<0>(p), std::get<1>(p), std::get<2>(p), std::get<3>(p), buffer, stream)); return mergeDeviceGrids(handles, stream); } //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template GridHandle voxelsToGrid(std::vector> vec, const BufferT &buffer, cudaStream_t stream) { std::vector> handles; - for (auto &p : vec) handles.push_back(voxelsToGrid(std::get<0>(p), std::get<1>(p), std::get<2>(p), buffer, stream)); + for (auto &p : vec) handles.push_back(voxelsToGrid(std::get<0>(p), std::get<1>(p), std::get<2>(p), buffer, stream)); return mergeDeviceGrids(handles, stream); } @@ -1237,7 +1262,7 @@ voxelsToGrid(std::vector> vec, const Buffer //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template [[deprecated("Use cuda::pointsToGrid instead")]] GridHandle cudaPointsToGrid(const PtrT dWorldPoints, @@ -1247,24 +1272,24 @@ cudaPointsToGrid(const PtrT dWorldPoints, const BufferT &buffer = BufferT(), cudaStream_t stream = 0) { - return tools::cuda::pointsToGrid(dWorldPoints, pointCount, voxelSize, type, buffer, stream); + return tools::cuda::pointsToGrid(dWorldPoints, pointCount, voxelSize, type, buffer, stream); } //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template [[deprecated("Use cuda::pointsToGrid instead")]] GridHandle cudaPointsToGrid(std::vector> pointSet, const BufferT &buffer = BufferT(), cudaStream_t stream = 0) { - return tools::cuda::pointsToGrid(pointSet, buffer, stream); + return tools::cuda::pointsToGrid(pointSet, buffer, stream); } //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template [[deprecated("Use cuda::voxelsToGrid instead")]] GridHandle cudaVoxelsToGrid(const PtrT dGridVoxels, @@ -1273,19 +1298,19 @@ cudaVoxelsToGrid(const PtrT dGridVoxels, const BufferT &buffer = BufferT(), cudaStream_t stream = 0) { - return tools::cuda::voxelsToGrid(dGridVoxels, voxelCount, voxelSize, buffer, stream); + return tools::cuda::voxelsToGrid(dGridVoxels, voxelCount, voxelSize, buffer, stream); } //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template +template [[deprecated("Use cuda::voxelsToGrid instead")]] GridHandle cudaVoxelsToGrid(std::vector> pointSet, const BufferT &buffer = BufferT(), cudaStream_t stream = 0) { - return tools::cuda::voxelsToGrid(pointSet, buffer, stream); + return tools::cuda::voxelsToGrid(pointSet, buffer, stream); } }// namespace nanovdb diff --git 
a/nanovdb/nanovdb/unittest/TestNanoVDB.cc b/nanovdb/nanovdb/unittest/TestNanoVDB.cc index 33e120c063..5e1aabe35e 100644 --- a/nanovdb/nanovdb/unittest/TestNanoVDB.cc +++ b/nanovdb/nanovdb/unittest/TestNanoVDB.cc @@ -8037,7 +8037,7 @@ TEST_F(TestNanoVDB, mergeSplitGrids) handles1.emplace_back(nanovdb::tools::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3, nanovdb::Vec3d(0), gridNames.back())); EXPECT_FALSE(handles1.back().isPadded()); - size1 += handles1.back().size(); + size1 += handles1.back().bufferSize(); } EXPECT_EQ(5u, gridNames.size()); EXPECT_EQ(5u, handles1.size()); @@ -8046,26 +8046,26 @@ TEST_F(TestNanoVDB, mergeSplitGrids) gridNames.emplace_back("sphere_" + std::to_string(radius)); handles2.emplace_back(nanovdb::tools::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3, nanovdb::Vec3d(0), gridNames.back())); - size2 += handles2.back().size(); + size2 += handles2.back().bufferSize(); } EXPECT_EQ(10u, gridNames.size()); EXPECT_EQ( 5u, handles2.size()); //timer.restart("merging 5 host grids"); auto mergedHandle = nanovdb::mergeGrids(handles2);// merge last 5 grid handles - EXPECT_EQ(size2, mergedHandle.size()); + EXPECT_EQ(size2, mergedHandle.bufferSize()); EXPECT_FALSE(mergedHandle.isPadded()); EXPECT_TRUE(mergedHandle.data()); auto *gridData = mergedHandle.gridData();// first grid EXPECT_TRUE(gridData); EXPECT_EQ(5u, gridData->mGridCount); EXPECT_EQ(0u, gridData->mGridIndex); - EXPECT_EQ(handles2[0].size(), gridData->mGridSize); + EXPECT_EQ(handles2[0].bufferSize(), gridData->mGridSize); //timer.restart("unit-test host grids"); for (int i=0; i<5; ++i){ gridData = mergedHandle.gridData(i); EXPECT_TRUE(gridData); EXPECT_EQ(i, gridData->mGridIndex); - EXPECT_EQ(handles2[i].size(), gridData->mGridSize); + EXPECT_EQ(handles2[i].bufferSize(), gridData->mGridSize); EXPECT_EQ(strcmp(gridNames[i+5].c_str(), gridData->mGridName),0); } @@ -8091,27 +8091,27 @@ TEST_F(TestNanoVDB, mergeSplitGrids) //timer.restart("merging 10 host grids"); mergedHandle = nanovdb::mergeGrids(handles1); - EXPECT_EQ(size1 + size2, mergedHandle.size()); + EXPECT_EQ(size1 + size2, mergedHandle.bufferSize()); EXPECT_TRUE(mergedHandle.data()); gridData = mergedHandle.gridData();// first grid EXPECT_TRUE(gridData); EXPECT_EQ(10u, gridData->mGridCount); EXPECT_EQ( 0u, gridData->mGridIndex); - EXPECT_EQ(handles1[0].size(), gridData->mGridSize); + EXPECT_EQ(handles1[0].bufferSize(), gridData->mGridSize); //timer.restart("splitting host grids"); auto splitHandles = nanovdb::splitGrids(mergedHandle); //timer.restart("unit-test split grids"); EXPECT_EQ(10u, splitHandles.size()); for (int i=0; i<5; ++i){ - EXPECT_EQ(handles1[i].size(), splitHandles[i].size()); + EXPECT_EQ(handles1[i].bufferSize(), splitHandles[i].bufferSize()); gridData = splitHandles[i].gridData(); EXPECT_EQ(0u, gridData->mGridIndex); EXPECT_EQ(1u, gridData->mGridCount); EXPECT_EQ(strcmp(gridNames[i].c_str(), gridData->mGridName),0); } for (int i=5; i<10; ++i){ - EXPECT_EQ(handles2[i-5].size(), splitHandles[i].size()); + EXPECT_EQ(handles2[i-5].bufferSize(), splitHandles[i].bufferSize()); gridData = splitHandles[i].gridData(); EXPECT_EQ(0u, gridData->mGridIndex); EXPECT_EQ(1u, gridData->mGridCount); diff --git a/nanovdb/nanovdb/unittest/TestNanoVDB.cu b/nanovdb/nanovdb/unittest/TestNanoVDB.cu index 0d5f797299..6896c6ca09 100644 --- a/nanovdb/nanovdb/unittest/TestNanoVDB.cu +++ b/nanovdb/nanovdb/unittest/TestNanoVDB.cu @@ -16,10 +16,15 @@ #include #include #include +//#include #include #include #include +#include +#include +#include +#include #include 
#include // for std::sort #include // for std::setw, std::setfill @@ -130,8 +135,24 @@ void cudaStr() TEST(TestNanoVDBCUDA, CudaDeviceBuffer) { - nanovdb::test::device2host(1000); - nanovdb::test::host2device2host(1000); + { + nanovdb::cuda::DeviceBuffer buffer; + EXPECT_EQ(0, buffer.deviceCount()); + EXPECT_EQ(0, buffer.bufferCount()); + EXPECT_EQ(0, buffer.size()); + EXPECT_TRUE(buffer.empty()); + } + { + nanovdb::cuda::DeviceBuffer buffer(1024); + int count = 0; + cudaGetDeviceCount(&count); + EXPECT_EQ(count, buffer.deviceCount()); + EXPECT_EQ(1, buffer.bufferCount()); + EXPECT_EQ(1024, buffer.size()); + EXPECT_FALSE(buffer.empty()); + } + nanovdb::test::device2host(1000); + nanovdb::test::host2device2host(1000); } TEST(TestNanoVDBCUDA, CudaStr) @@ -139,6 +160,74 @@ TEST(TestNanoVDBCUDA, CudaStr) nanovdb::test::cudaStr(); } +__global__ void testKernel(int device) +{ + int dev; + cudaError_t err = cudaGetDevice(&dev); + if (err != cudaSuccess) printf("kernel cuda error: %d\n", (int)err); + if (dev != device) printf("Error: expected device ID = %i but was called with %i\n", dev, device); +} + +TEST(TestNanoVDBCUDA, DeviceStreamMap) +{ + using DevMap = nanovdb::cuda::DeviceStreamMap; + int count = 0, verbose = 0, current = 0; + { + cudaCheck(cudaGetDeviceCount(&count)); + cudaCheck(cudaGetDevice(¤t)); + EXPECT_EQ(count, nanovdb::util::cuda::deviceCount()); + EXPECT_EQ(current, nanovdb::util::cuda::currentDevice()); + float *ptr = new float; + EXPECT_EQ(cudaInvalidDeviceId, nanovdb::util::cuda::ptrToDevice(ptr)); + delete ptr; + cudaCheck(cudaMalloc((void**)&ptr, sizeof(float))); + EXPECT_EQ(current, nanovdb::util::cuda::ptrToDevice(ptr)); + cudaCheck(cudaFree(ptr)); + } + //std::cout << "Total device count = " << count << std::endl; + { + //std::cout << "Any:\n"; + DevMap map(DevMap::Any, {}, verbose); + EXPECT_EQ(count, map.size()); + for (const auto& [device, stream] : map) { + cudaSetDevice(device); + testKernel<<<1, 1, 0, stream>>>(device); + cudaStreamSynchronize(stream); + } + } + { + //std::cout << "Any excluding {current}:\n"; + DevMap map(DevMap::Any, {current}, verbose); + EXPECT_EQ(count-1, map.size()); + for (const auto& [device, stream] : map) { + cudaSetDevice(device); + testKernel<<<1, 1, 0, stream>>>(device); + cudaStreamSynchronize(stream); + } + } + { + //std::cout << "PeerToPeer:\n"; + DevMap map(DevMap::PeerToPeer, {}, verbose); + EXPECT_GE(count, map.size()); + for (const auto& [device, stream] : map) { + cudaSetDevice(device); + testKernel<<<1, 1, 0, stream>>>(device); + cudaStreamSynchronize(stream); + } + } + { + //std::cout << "Unified:\n"; + DevMap map(DevMap::Unified, {}, verbose); + EXPECT_GE(count, map.size()); + for (const auto& [device, stream] : map) { + cudaSetDevice(device); + testKernel<<<1, 1, 0, stream>>>(device); + cudaStreamSynchronize(stream); + } + } + cudaSetDevice(current); // restore device so subsequent tests don't fail +} + TEST(TestNanoVDBCUDA, Basic_CudaPointsToGrid_float) { using BuildT = float; @@ -508,10 +597,10 @@ TEST(TestNanoVDBCUDA, Basic_CudaPointsToGrid_ValueOnIndexMask) } }// Basic_CudaPointsToGrid_ValueOnIndexMask -TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_old) +TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_DeviceBuffer) { using BuildT = nanovdb::ValueOnIndex; - //nanovdb::util::Timer timer; + nanovdb::util::Timer timer; const size_t voxelCount = 1 << 20;// 1048576 std::vector voxels; {//generate random voxels @@ -545,9 +634,9 @@ TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_old) cudaCheck(cudaMemcpy(d_coords, voxels.data(), 
voxelSize, cudaMemcpyHostToDevice));
     //timer.stop();
 
-    //timer.start("Building grid on GPU from "+std::to_string(voxels.size())+" points");
+    timer.start("voxelsToGrid("+std::to_string(voxels.size())+" points)");
     auto handle = nanovdb::tools::cuda::voxelsToGrid(d_coords, voxelCount, 1.0);
-    //timer.stop();
+    timer.stop();
 
     EXPECT_TRUE(handle.deviceData());// grid only exists on the GPU
     EXPECT_TRUE(handle.deviceGrid());
@@ -587,6 +676,85 @@ TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_old)
     //timer.stop();
 }// Large_CudaPointsToGrid_old
 
+TEST(TestNanoVDBCUDA, Large_CudaPointsToGrid_UnifiedBuffer)
+{
+    using BuildT = nanovdb::ValueOnIndex;
+    nanovdb::util::Timer timer;
+    const size_t voxelCount = 1 << 20;// 1048576
+    std::vector voxels;
+    {//generate random voxels
+        voxels.reserve(voxelCount);
+        std::srand(98765);
+        const int max = 512, min = -max;
+        auto op = [&](){return rand() % (max - min) + min;};
+        //timer.start("Creating "+std::to_string(voxelCount)+" random voxels on the CPU");
+        while (voxels.size() < voxelCount) voxels.push_back(nanovdb::Coord(op(), op(), op()));
+        //timer.stop();
+        EXPECT_EQ(voxelCount, voxels.size());
+    }
+#if 0
+    {// Build grid on CPU
+        nanovdb::tools::build::Grid buildGrid(0.0f);
+        //timer.start("Building grid on CPU from "+std::to_string(voxels.size())+" points");
+        nanovdb::util::forEach(0, voxelCount, voxelCount >> 6, [&](const nanovdb::util::Range1D &r){
+            auto acc = buildGrid.getWriteAccessor();
+            for (size_t i=r.begin(); i!=r.end(); ++i) acc.setValueOn(voxels[i]);
+        });
+        //timer.restart("Converting CPU build::Grid to nanovdb");
+        auto handle = nanovdb::tools::createNanoGrid(buildGrid);
+        //timer.stop();
+    }
+#endif
+    nanovdb::Coord* d_coords;
+    const size_t voxelSize = voxels.size() * sizeof(nanovdb::Coord);
+    //timer.start("Allocating "+std::to_string(voxelSize >> 20)+" MB on the GPU");
+    cudaCheck(cudaMalloc(&d_coords, voxelSize));
+    //timer.restart("Copying voxels from CPU to GPU");
+    cudaCheck(cudaMemcpy(d_coords, voxels.data(), voxelSize, cudaMemcpyHostToDevice));
+    //timer.stop();
+
+    timer.start("voxelsToGrid("+std::to_string(voxels.size())+" points)");
+    auto handle = nanovdb::tools::cuda::voxelsToGrid(d_coords, voxelCount, 1.0);
+    timer.stop();
+
+    EXPECT_TRUE(handle.deviceData());// grid exists on the GPU
+    EXPECT_TRUE(handle.deviceGrid());
+    EXPECT_FALSE(handle.deviceGrid(0));
+    EXPECT_TRUE(handle.deviceGrid(0));
+    EXPECT_FALSE(handle.deviceGrid(1));
+    EXPECT_TRUE(handle.data());// grid also exists on the CPU
+
+    //timer.start("Allocating and copying grid from GPU to CPU");
+    auto *grid = handle.grid();// grid also exists on the CPU
+    EXPECT_TRUE(grid);
+    handle.deviceDownload();// creates a copy on the CPU
+    EXPECT_TRUE(handle.deviceData());
+    EXPECT_TRUE(handle.data());
+    auto *data = handle.gridData();
+    EXPECT_TRUE(data);
+    grid = handle.grid();
+    EXPECT_TRUE(grid);
+    EXPECT_TRUE(grid->valueCount()>0);
+    EXPECT_EQ(nanovdb::Vec3d(1.0), grid->voxelSize());
+
+    //timer.restart("Parallel unit-testing on CPU");
+    nanovdb::util::forEach(voxels,[&](const nanovdb::util::Range1D &r){
+        auto acc = grid->getAccessor();
+        for (size_t i=r.begin(); i!=r.end(); ++i) {
+            const nanovdb::Coord &ijk = voxels[i];
+            EXPECT_TRUE(acc.probeLeaf(ijk)!=nullptr);
+            EXPECT_TRUE(acc.isActive(ijk));
+            EXPECT_TRUE(acc.getValue(ijk) > 0u);
+            const auto *leaf = acc.get>(ijk);
+            EXPECT_TRUE(leaf);
+            const auto offset = leaf->CoordToOffset(ijk);
+            EXPECT_EQ(ijk, leaf->offsetToGlobalCoord(offset));
+        }
+    });
+
+    //timer.stop();
+}// Large_CudaPointsToGrid_UnifiedBuffer
+
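+// Note: relative to the DeviceBuffer variant above, only the buffer template argument differs;
+// with a UnifiedBuffer the same allocation backs both host and device views, which is why
+// handle.data() is valid even before deviceDownload() is called, e.g. (a sketch):
+//     auto handle = nanovdb::tools::cuda::voxelsToGrid(d_coords, voxelCount, 1.0);
+//     handle.grid();// host pointer usable without an explicit download
+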
TEST(TestNanoVDBCUDA, mergeSplitGrids) { size_t size1 = 0, size2 = 0; @@ -598,7 +766,7 @@ TEST(TestNanoVDBCUDA, mergeSplitGrids) handles1.emplace_back(nanovdb::tools::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3, nanovdb::Vec3d(0), gridNames.back())); EXPECT_FALSE(handles1.back().isPadded()); - size1 += handles1.back().size(); + size1 += handles1.back().bufferSize(); } EXPECT_EQ(5u, gridNames.size()); EXPECT_EQ(5u, handles1.size()); @@ -607,7 +775,7 @@ TEST(TestNanoVDBCUDA, mergeSplitGrids) gridNames.emplace_back("sphere_" + std::to_string(radius)); handles2.emplace_back(nanovdb::tools::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3, nanovdb::Vec3d(0), gridNames.back())); - size2 += handles2.back().size(); + size2 += handles2.back().bufferSize(); } EXPECT_EQ(10u, gridNames.size()); EXPECT_EQ( 5u, handles2.size()); @@ -678,7 +846,7 @@ TEST(TestNanoVDBCUDA, mergeSplitDeviceGrids) handles.emplace_back(nanovdb::tools::createLevelSetSphere(radius,nanovdb::Vec3d(0),1,3, nanovdb::Vec3d(0), gridNames.back())); EXPECT_FALSE(handles.back().isPadded()); - size += handles.back().size(); + size += handles.back().bufferSize(); } //timer.restart("copy grids to device"); for (auto &h : handles) h.deviceUpload(); @@ -763,9 +931,17 @@ TEST(TestNanoVDBCUDA, CudaIndexGridToGrid_basic) using BufferT = nanovdb::cuda::DeviceBuffer; auto idxHdl = nanovdb::tools::createNanoGrid(*floatGrid, 0u, false, false, 1); //timer.restart("Copy IndexGrid from CPU to GPU"); + EXPECT_EQ(1u, idxHdl.buffer().bufferCount()); + EXPECT_TRUE(idxHdl.buffer().data(0, cudaCpuDeviceId)); + EXPECT_FALSE(idxHdl.buffer().data(0, 0)); + EXPECT_FALSE(idxHdl.buffer().data(0, 1)); EXPECT_FALSE(idxHdl.deviceGrid()); idxHdl.deviceUpload(); EXPECT_TRUE(idxHdl.deviceGrid()); + EXPECT_EQ(2u, idxHdl.buffer().bufferCount()); + EXPECT_TRUE(idxHdl.buffer().data(0, cudaCpuDeviceId)); + EXPECT_TRUE(idxHdl.buffer().data(0, 0)); + EXPECT_FALSE(idxHdl.buffer().data(0, 1)); auto *idxGrid = idxHdl.grid(); EXPECT_TRUE(idxGrid); //timer.restart("Create value list on CPU"); @@ -2713,10 +2889,13 @@ TEST(TestNanoVDBCUDA, NodeManager) } }// NodeManager -TEST(TestNanoVDBCUDA, GridStats) + +TEST(TestNanoVDBCUDA, GridStats_UnifiedBuffer) { + + using BufferT = nanovdb::cuda::UnifiedBuffer; using GridT = nanovdb::NanoGrid; - auto handle = nanovdb::tools::createLevelSetSphere(100, + auto handle = nanovdb::tools::createLevelSetSphere(100, nanovdb::Vec3d(0), 1.0, 3.0, @@ -2819,20 +2998,133 @@ TEST(TestNanoVDBCUDA, GridStats) EXPECT_EQ(grid->tree().root().average(), data->mAverage); EXPECT_EQ(grid->tree().root().stdDeviation(), data->mStdDevi); } -}// GridStats +}// GridStats_UnifiedBuffer -TEST(TestNanoVDBCUDA, cudaIsValid) +TEST(TestNanoVDBCUDA, GridStats_DeviceBuffer) { - const auto mode = nanovdb::CheckMode::Full; + using BufferT = nanovdb::cuda::DeviceBuffer; using GridT = nanovdb::NanoGrid; - auto handle = nanovdb::tools::createLevelSetSphere(100, + auto handle = nanovdb::tools::createLevelSetSphere(100, nanovdb::Vec3d(0), 1.0, 3.0, nanovdb::Vec3d(0), "test", - nanovdb::tools::StatsMode::Disable, - mode); + nanovdb::tools::StatsMode::Disable); + EXPECT_TRUE(handle.data()); + GridT *grid = handle.grid(); + EXPECT_TRUE(grid); + handle.deviceUpload(); + GridT *d_grid = handle.deviceGrid(); + EXPECT_TRUE(d_grid); + + {// check min/max using const iterators + float min = std::numeric_limits::max(), max = -min; + int n2=0, n1=0, n0=0;// check that nodes are arranged breath-first in memory + for (auto it2 = grid->tree().root().cbeginChild(); it2; ++it2) { + 
            EXPECT_EQ(grid->tree().getFirstUpper() + n2++, &(*it2));
+            for (auto it1 = it2->cbeginChild(); it1; ++it1) {
+                EXPECT_EQ(grid->tree().getFirstLower() + n1++, &(*it1));
+                for (auto it0 = it1->cbeginChild(); it0; ++it0) {
+                    EXPECT_EQ(grid->tree().getFirstLeaf() + n0++, &(*it0));
+                    for (auto it = it0->cbeginValueOn(); it; ++it) {
+                        if (*it < min) min = *it;
+                        if (*it > max) max = *it;
+                    }
+                }// loop over child nodes of the lower internal node
+            }// loop over child nodes of the upper internal node
+        }// loop over child nodes of the root node
+        EXPECT_NE(min, grid->tree().root().minimum());
+        EXPECT_NE(max, grid->tree().root().maximum());
+        EXPECT_EQ(n2, grid->tree().nodeCount(2));
+        EXPECT_EQ(n1, grid->tree().nodeCount(1));
+        EXPECT_EQ(n0, grid->tree().nodeCount(0));
+    }
+    {
+        //nanovdb::util::Timer cpuTimer("CPU gridStats: Default = Full");
+        nanovdb::tools::updateGridStats(grid);
+        //cpuTimer.stop();
+    }
+    {// check min/max using const iterators
+        float min = std::numeric_limits::max(), max = -min;
+        int n2=0, n1=0, n0=0;// check that nodes are arranged breadth-first in memory
+        for (auto it2 = grid->tree().root().cbeginChild(); it2; ++it2) {
+            EXPECT_EQ(grid->tree().getFirstUpper() + n2++, &(*it2));
+            for (auto it1 = it2->cbeginChild(); it1; ++it1) {
+                EXPECT_EQ(grid->tree().getFirstLower() + n1++, &(*it1));
+                for (auto it0 = it1->cbeginChild(); it0; ++it0) {
+                    EXPECT_EQ(grid->tree().getFirstLeaf() + n0++, &(*it0));
+                    for (auto it = it0->cbeginValueOn(); it; ++it) {
+                        if (*it < min) min = *it;
+                        if (*it > max) max = *it;
+                    }
+                }// loop over child nodes of the lower internal node
+            }// loop over child nodes of the upper internal node
+        }// loop over child nodes of the root node
+        EXPECT_EQ(min, grid->tree().root().minimum());
+        EXPECT_EQ(max, grid->tree().root().maximum());
+        EXPECT_EQ(n2, grid->tree().nodeCount(2));
+        EXPECT_EQ(n1, grid->tree().nodeCount(1));
+        EXPECT_EQ(n0, grid->tree().nodeCount(0));
+    }
+    {// check min/max using non-const iterators
+        float min = std::numeric_limits::max(), max = -min;
+        int n2=0, n1=0, n0=0;// check that nodes are arranged breadth-first in memory
+        for (auto it2 = grid->tree().root().beginChild(); it2; ++it2) {
+            EXPECT_EQ(grid->tree().getFirstUpper() + n2++, &(*it2));
+            for (auto it1 = it2->beginChild(); it1; ++it1) {
+                EXPECT_EQ(grid->tree().getFirstLower() + n1++, &(*it1));
+                for (auto it0 = it1->beginChild(); it0; ++it0) {
+                    EXPECT_EQ(grid->tree().getFirstLeaf() + n0++, &(*it0));
+                    for (auto it = it0->beginValueOn(); it; ++it) {
+                        if (*it < min) min = *it;
+                        if (*it > max) max = *it;
+                    }
+                }// loop over child nodes of the lower internal node
+            }// loop over child nodes of the upper internal node
+        }// loop over child nodes of the root node
+        EXPECT_EQ(min, grid->tree().root().minimum());
+        EXPECT_EQ(max, grid->tree().root().maximum());
+        EXPECT_EQ(n2, grid->tree().nodeCount(2));
+        EXPECT_EQ(n1, grid->tree().nodeCount(1));
+        EXPECT_EQ(n0, grid->tree().nodeCount(0));
+    }
+
+    {
+        //nanovdb::util::cuda::Timer gpuTimer("GPU gridStats: Default = Full");
+        nanovdb::tools::cuda::updateGridStats(d_grid);
+        //gpuTimer.stop();
+    }
+    {// check bbox and stats of device grid
+        using DataT = nanovdb::NanoRoot::DataType;
+        std::unique_ptr buffer(new char[sizeof(DataT)]);
+        cudaMemcpy(buffer.get(), (char*)d_grid + sizeof(nanovdb::GridData) + sizeof(nanovdb::TreeData), sizeof(DataT), cudaMemcpyDeviceToHost);
+        auto *data = (const DataT*)buffer.get();
+        EXPECT_EQ(grid->indexBBox(), data->mBBox);
+        EXPECT_EQ(grid->tree().root().background(), data->mBackground);
+
EXPECT_EQ(grid->tree().root().minimum(), data->mMinimum); + EXPECT_EQ(grid->tree().root().maximum(), data->mMaximum); + EXPECT_EQ(grid->tree().root().average(), data->mAverage); + EXPECT_EQ(grid->tree().root().stdDeviation(), data->mStdDevi); + } +}// GridStats_DeviceBuffer + +// make -j && ./unittest/testNanoVDB --gtest_filter="*cudaIsValid*" --gtest_repeat=10 +TEST(TestNanoVDBCUDA, cudaIsValid_DeviceBuffer) +{ + using BufferT = nanovdb::cuda::DeviceBuffer; + const auto mode = nanovdb::CheckMode::Full; + using GridT = nanovdb::NanoGrid; + nanovdb::util::Timer timer("createLevelSetSphere(500)"); + auto handle = nanovdb::tools::createLevelSetSphere(500, + nanovdb::Vec3d(0), + 1.0, + 3.0, + nanovdb::Vec3d(0), + "test", + nanovdb::tools::StatsMode::Disable, + mode); + timer.stop(); EXPECT_TRUE(handle.data()); GridT *grid = handle.grid(); EXPECT_TRUE(grid); @@ -2848,4 +3140,212 @@ TEST(TestNanoVDBCUDA, cudaIsValid) EXPECT_FALSE(nanovdb::isValid(grid, mode, verbose)); handle.deviceUpload(); EXPECT_FALSE(nanovdb::tools::cuda::isValid(d_grid, mode, verbose)); -}// cudaIsValid +}// cudaIsValid_DeviceBuffer + +TEST(TestNanoVDBCUDA, cudaIsValid_UnifiedBuffer) +{ + using BufferT = nanovdb::cuda::UnifiedBuffer; + const auto mode = nanovdb::CheckMode::Full; + using GridT = nanovdb::NanoGrid; + nanovdb::util::Timer timer("createLevelSetSphere(500)"); + auto handle = nanovdb::tools::createLevelSetSphere(500, + nanovdb::Vec3d(0), + 1.0, + 3.0, + nanovdb::Vec3d(0), + "test", + nanovdb::tools::StatsMode::Disable, + mode); + timer.stop(); + EXPECT_TRUE(handle.data()); + GridT *grid = handle.grid(); + EXPECT_TRUE(grid); + handle.deviceUpload(); + GridT *d_grid = handle.deviceGrid(); + EXPECT_TRUE(d_grid); + const bool verbose = false; + + EXPECT_TRUE(nanovdb::isValid(grid, mode, verbose)); + EXPECT_TRUE(nanovdb::tools::cuda::isValid(d_grid, mode, verbose)); + + grid->mGridType = nanovdb::GridType::Vec3f; + EXPECT_FALSE(nanovdb::isValid(grid, mode, verbose)); + handle.deviceUpload(); + EXPECT_FALSE(nanovdb::tools::cuda::isValid(d_grid, mode, verbose)); +}// cudaIsValid_UnifiedBuffer + +TEST(TestNanoVDBCUDA, cudaIsValid_HostBuffer) +{ + using BufferT = nanovdb::HostBuffer; + const auto mode = nanovdb::CheckMode::Full; + using GridT = nanovdb::NanoGrid; + nanovdb::util::Timer timer("createLevelSetSphere(500)"); + auto handle = nanovdb::tools::createLevelSetSphere(500, + nanovdb::Vec3d(0), + 1.0, + 3.0, + nanovdb::Vec3d(0), + "test", + nanovdb::tools::StatsMode::Disable, + mode); + timer.stop(); + EXPECT_TRUE(handle.data()); + GridT *grid = handle.grid(); + EXPECT_TRUE(grid); +}// cudaIsValid_HostBuffer + +TEST(TestNanoVDBCUDA, overSizedDeviceBuffer) +{ + // create a grid in a host buffer that exactly fits the grid + auto handle1 = nanovdb::tools::createLevelSetSphere(); + const size_t gridSize = handle1.gridSize(); + EXPECT_EQ(gridSize, handle1.bufferSize()); + EXPECT_EQ(gridSize, handle1.totalGridSize()); + EXPECT_EQ(0 , handle1.freeSize()); + EXPECT_EQ(1 , handle1.gridCount()); + EXPECT_TRUE(handle1.isFull()); + + // copy grid to an oversized device buffer + using BufferT = nanovdb::cuda::DeviceBuffer; + auto overSizedBuffer = BufferT(2*gridSize, false);// only allocate the device buffer + cudaMemcpy(overSizedBuffer.deviceData(), handle1.data(), gridSize, cudaMemcpyHostToDevice); + + // construct handle from over-sized device buffer and test it + nanovdb::GridHandle handle2(std::move(overSizedBuffer)); + EXPECT_EQ(1 , handle2.gridCount()); + EXPECT_EQ( gridSize, handle2.gridSize()); + EXPECT_EQ( gridSize, 
handle2.totalGridSize()); + EXPECT_EQ(2*gridSize, handle2.bufferSize()); + EXPECT_EQ( gridSize, handle2.freeSize()); + EXPECT_FALSE(handle2.isFull()); +}// overSizedDeviceBuffer + +__global__ void initKernel(int N, int *x) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= N) return; + x[tid] = 2; +} + +// make -j && ./unittest/testNanoVDB --gtest_filter="*UnifiedBuffer*" --gtest_repeat=2 +TEST(TestNanoVDBCUDA, UnifiedBuffer_basic) +{ + int device = 0; + cudaSetDevice(device);// set the default device to GPU #0 + size_t free, total; + cudaMemGetInfo( &free, &total ); + std::cout << "GPU #" << device << " memory: free = " << (free>>30) << " GB, total = " << (total>>30) << " GB\n"; + + cudaStream_t stream; + cudaCheck(cudaStreamCreate(&stream)); + const size_t N = 1<<20, size = N*sizeof(int), capacity = 20*size;// over-allocation + + nanovdb::cuda::UnifiedBuffer buffer(size, capacity); + EXPECT_EQ(size, buffer.size()); + EXPECT_EQ(capacity, buffer.capacity()); + {// set N values on the host + //buffer.deviceDownload(stream); + buffer.prefetch(0, size, cudaCpuDeviceId, stream); + nanovdb::util::Timer timer("Setting values on CPU with unified memory"); + for (int i = 0, *x = buffer.data(); i < N; i++) *x++ = 1; + timer.stop(); + } + {// resize unified buffers + nanovdb::util::Timer timer("resize unified buffer"); + buffer.resize(2*size); + timer.stop(); + EXPECT_EQ(2*size, buffer.size()); + EXPECT_EQ(capacity, buffer.capacity()); + } + { + //buffer.deviceUpload(0, stream); + buffer.prefetch(size, size, 0, stream); + nanovdb::util::cuda::Timer timer("Setting values on GPU with unified memory", stream); + static const int blockSize = 256, numBlocks = (N + blockSize - 1) / blockSize; + initKernel<<>>(N, buffer.data(N)); + timer.stop(); + } + + EXPECT_EQ(CUDA_SUCCESS, cudaStreamSynchronize(stream)); + int *x = buffer.data(); + for (int i = 0; i < N; ++i) EXPECT_EQ(1, *x++); + for (int i = 0; i < N; ++i) EXPECT_EQ(2, *x++); +}// UnifiedBuffer_basic + +TEST(TestNanoVDBCUDA, UnifiedBuffer_IO) +{ + //cudaSetDevice(0);// loads runtime context + using BufferT = nanovdb::cuda::UnifiedBuffer; + const size_t size = 8*(1ULL << 30);// 8 GB + cudaStream_t stream; + cudaCheck(cudaStreamCreate(&stream)); + { + auto buffer = BufferT::create(size, nullptr, true, (void*)stream); + EXPECT_EQ(CUDA_SUCCESS, cudaStreamSynchronize(stream)); + EXPECT_EQ(size, buffer.size()); + EXPECT_EQ(size, buffer.capacity()); + } + {// size = capacity + auto handle = nanovdb::io::readGrid("data/3_spheres.nvdb", 0); + auto nanoGrid = handle.grid(); + EXPECT_EQ(handle.size(), handle.buffer().capacity()); + EXPECT_EQ(handle.buffer().size(), handle.buffer().capacity()); + EXPECT_TRUE(nanoGrid); + EXPECT_EQ(1u, handle.gridCount()); + auto *grid = handle.grid(); + EXPECT_TRUE(grid); + EXPECT_EQ(0u, grid->gridIndex()); + EXPECT_EQ(1u, grid->gridCount()); + EXPECT_TRUE(nanovdb::tools::validateChecksum(grid)); + EXPECT_TRUE(nanovdb::tools::validateChecksum(grid, nanovdb::CheckMode::Full)); + } + {// size < capacity + BufferT reference(0, size); + auto handle = nanovdb::io::readGrid("data/3_spheres.nvdb", 0, 0, reference); + auto nanoGrid = handle.grid(); + EXPECT_LT(handle.buffer().size(), handle.buffer().capacity()); + EXPECT_EQ(size, handle.buffer().capacity()); + EXPECT_TRUE(nanoGrid); + EXPECT_EQ(1u, handle.gridCount()); + auto *grid = handle.grid(); + EXPECT_TRUE(grid); + EXPECT_EQ(0u, grid->gridIndex()); + EXPECT_EQ(1u, grid->gridCount()); + EXPECT_TRUE(nanovdb::tools::validateChecksum(grid)); + 
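+        // Re-validate at the strictest setting: CheckMode::Full hashes the entire
+        // grid buffer rather than only the cheaper partial checksum.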
EXPECT_TRUE(nanovdb::tools::validateChecksum(grid, nanovdb::CheckMode::Full)); + } +}// UnifiedBuffer_IO + +TEST(TestNanoVDBCUDA, UnifiedBuffer_createLevelSetSphere) +{ + using BufferT = nanovdb::cuda::UnifiedBuffer; + const int radius = 100, center = 50, width = 3, voxelSize = 1; + const std::string gridName("sphere_" + std::to_string(radius)); + auto handle = nanovdb::tools::createLevelSetSphere(radius, nanovdb::Vec3d(center), + voxelSize, width, nanovdb::Vec3d(0), gridName); + + EXPECT_TRUE(handle); + EXPECT_EQ(1u, handle.gridCount()); + EXPECT_EQ(handle.size(), handle.buffer().capacity()); + auto* meta = handle.gridMetaData(); + EXPECT_TRUE(meta); + EXPECT_EQ(gridName, std::string(meta->shortGridName())); + EXPECT_EQ(nanovdb::GridType::Float, meta->gridType()); + EXPECT_EQ(nanovdb::GridClass::LevelSet, meta->gridClass()); + auto* dstGrid = handle.grid(); + EXPECT_TRUE(dstGrid); + EXPECT_EQ(dstGrid->gridSize(), handle.buffer().capacity()); + EXPECT_EQ(gridName, std::string(dstGrid->gridName())); + + EXPECT_TRUE(dstGrid->hasBBox()); + EXPECT_TRUE(dstGrid->hasMinMax()); + EXPECT_TRUE(dstGrid->hasAverage()); + EXPECT_TRUE(dstGrid->hasStdDeviation()); + + EXPECT_NEAR( -3.0f, dstGrid->tree().root().minimum(), 0.04f); + EXPECT_NEAR( 3.0f, dstGrid->tree().root().maximum(), 0.04f); + EXPECT_NEAR( 0.0f, dstGrid->tree().root().average(), 0.30f); + + EXPECT_EQ(nanovdb::Coord(center - radius - 2), dstGrid->indexBBox()[0]); + EXPECT_EQ(nanovdb::Coord(center + radius + 2), dstGrid->indexBBox()[1]); +} // UnifiedBuffer_createLevelSetSphere diff --git a/nanovdb/nanovdb/unittest/TestOpenVDB.cc b/nanovdb/nanovdb/unittest/TestOpenVDB.cc index 238bdc8026..a2d44a01d2 100644 --- a/nanovdb/nanovdb/unittest/TestOpenVDB.cc +++ b/nanovdb/nanovdb/unittest/TestOpenVDB.cc @@ -1667,7 +1667,7 @@ TEST_F(TestOpenVDB, File) EXPECT_TRUE(dstGrid); EXPECT_TRUE(handles[0].data()); - EXPECT_TRUE(handles[0].size() > 0); + EXPECT_TRUE(handles[0].bufferSize() > 0); auto kernel = [&](const openvdb::CoordBBox& bbox) { using CoordT = const nanovdb::Coord; @@ -2784,6 +2784,29 @@ TEST_F(TestOpenVDB, Benchmark_OpenVDB_PointDataGrid) }// Benchmark_OpenVDB_PointDataGrid #endif +// make testOpenVDB && ./unittest/testOpenVDB --gtest_filter="*BBox" +TEST_F(TestOpenVDB, BBox) +{ + const double voxelSize = 5.0; + auto nanoHandle = nanovdb::tools::createFogVolumeBox(40, 60, 100, nanovdb::Vec3d(0.0), voxelSize); + auto nanoGrid = nanoHandle.grid(); + EXPECT_TRUE(nanoGrid); + auto openGrid = nanovdb::tools::nanoToOpenVDB(*nanoGrid); + auto indexBBox = openGrid->evalActiveVoxelBoundingBox(); + //std::cerr << "OpenVDB index bbox:\t" << indexBBox << std::endl; + //std::cerr << "NanoVDB index bbox:\t" << nanoGrid->indexBBox() << std::endl; + auto worldBBoxMin = openGrid->transform().indexToWorld(indexBBox.min()); + auto worldBBoxMax = openGrid->transform().indexToWorld(indexBBox.max().offsetBy(1));// <----- !!!!!!!!! 
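+    // NOTE: CoordBBox::max() is inclusive (it indexes the last voxel), so the far
+    // world-space corner is indexToWorld(max() + 1); that is the offsetBy(1) on the
+    // line above, and it matches NanoVDB's worldBBox() convention.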
+ //std::cout << "OpenVDB world bbox:\t" << worldBBoxMin << " -> " << worldBBoxMax << std::endl; + //std::cerr << "NanoVDB world bbox:\t" << nanoGrid->worldBBox() << std::endl; + for (int i=0; i<3; ++i) { + EXPECT_EQ( nanoGrid->indexBBox()[0][i], indexBBox.min()[i]); + EXPECT_EQ( nanoGrid->indexBBox()[1][i], indexBBox.max()[i]); + EXPECT_NEAR(nanoGrid->worldBBox()[0][i], worldBBoxMin[i], 1e-6); + EXPECT_NEAR(nanoGrid->worldBBox()[1][i], worldBBoxMax[i], 1e-6); + } +}// BBox + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/nanovdb/nanovdb/util/Timer.h b/nanovdb/nanovdb/util/Timer.h index b2d8dc9e9c..b340851cb0 100644 --- a/nanovdb/nanovdb/util/Timer.h +++ b/nanovdb/nanovdb/util/Timer.h @@ -19,7 +19,7 @@ namespace util { class Timer { - std::chrono::high_resolution_clock::time_point mStart; + std::chrono::high_resolution_clock::time_point mStart, mStop; public: /// @brief Default constructor Timer() {} @@ -38,22 +38,34 @@ class Timer mStart = std::chrono::high_resolution_clock::now(); } - /// @brief elapsed time (since start) in miliseconds + /// @brief Record the stop time so the elapsed time since start can be computed + void record() + { + mStop = std::chrono::high_resolution_clock::now(); + } + + /// @brief Returns the time in milliseconds since record was called + float milliseconds() const + { + return std::chrono::duration_cast(mStop - mStart).count(); + } + + /// @brief call record and return the elapsed time (since start) in miliseconds template auto elapsed() { - auto end = std::chrono::high_resolution_clock::now(); - return std::chrono::duration_cast(end - mStart).count(); + this->record(); + return std::chrono::duration_cast(mStop - mStart).count(); } - /// @brief stop the timer + /// @brief stop the timer and print elapsed time to a stream /// @tparam AccuracyT Template parameter defining the accuracy of the reported times /// @param os output stream for the message above template void stop(std::ostream& os = std::cerr) { - auto end = std::chrono::high_resolution_clock::now(); - auto diff = std::chrono::duration_cast(end - mStart).count(); + mStop = std::chrono::high_resolution_clock::now(); + auto diff = std::chrono::duration_cast(mStop - mStart).count(); os << "completed in " << diff; if (std::is_same::value) {// resolved at compile-time os << " microseconds" << std::endl; @@ -66,7 +78,7 @@ class Timer } } - /// @brief stop and start the timer + /// @brief stop and start the timer again /// @tparam AccuracyT Template parameter defining the accuracy of the reported times /// @param msg string message to be printed when timer is started /// @param os output stream for the message above diff --git a/nanovdb/nanovdb/util/cuda/Timer.h b/nanovdb/nanovdb/util/cuda/Timer.h index 07c9366a0d..b9865193f3 100644 --- a/nanovdb/nanovdb/util/cuda/Timer.h +++ b/nanovdb/nanovdb/util/cuda/Timer.h @@ -21,12 +21,17 @@ namespace util::cuda { class Timer { cudaStream_t mStream{0}; - cudaEvent_t mStart, mStop; + cudaEvent_t mStart, mStop; public: /// @brief Default constructor /// @param stream CUDA stream to be timed (defaults to stream 0) /// @note Starts the timer + /// @warning @c cudaEventCreate creates the event for the current device + /// and @c cudaEventRecord requires that the event and stream are + /// associated with the same device. So it's important to call + /// @c cudaSetDevice(device) so @c device matches the one used + /// when @c stream was created. 
    Timer(cudaStream_t stream = 0) : mStream(stream)
    {
        cudaEventCreate(&mStart);
@@ -38,6 +43,11 @@ class Timer
     /// @param msg string message to be printed when timer is started
     /// @param stream CUDA stream to be timed (defaults to stream 0)
     /// @param os output stream for the message above
+    /// @warning @c cudaEventCreate creates the event for the current device
+    ///          and @c cudaEventRecord requires that the event and stream are
+    ///          associated with the same device. So it's important to call
+    ///          @c cudaSetDevice(device) so @c device matches the one used
+    ///          when @c stream was created.
     Timer(const std::string &msg, cudaStream_t stream = 0, std::ostream& os = std::cerr)
         : mStream(stream)
     {
@@ -57,6 +67,10 @@ class Timer
     /// @brief Start the timer
     /// @param stream CUDA stream to be timed (defaults to stream 0)
     /// @param os output stream for the message above
+    /// @warning @c cudaEventRecord requires that the event and stream are
+    ///          associated with the same device. So it's important to call
+    ///          @c cudaSetDevice(device) so @c device matches the one used
+    ///          when @c mStream was created.
     void start() {cudaEventRecord(mStart, mStream);}

     /// @brief Start the timer
@@ -78,31 +92,66 @@ class Timer
         this->start();
     }

-    /// @brief elapsed time (since start) in miliseconds
-    /// @return elapsed time (since start) in miliseconds
-    float elapsed()
+    /// @warning @c cudaEventRecord requires that the event and stream are
+    ///          associated with the same device. So it's important to call
+    ///          @c cudaSetDevice(device) so it matches the device used when
+    ///          @c mStream was created.
+    inline void record()
     {
         cudaEventRecord(mStop, mStream);
         cudaEventSynchronize(mStop);
-        float diff = 0.0f;
-        cudaEventElapsedTime(&diff, mStart, mStop);
-        return diff;
     }

+    /// @brief Return the time in milliseconds since record was called
+    inline float milliseconds() const
+    {
+        float msec = 0.0f;
+        cudaEventElapsedTime(&msec, mStart, mStop);
+        return msec;
+    }
+
+    /// @brief elapsed time (since start) in milliseconds
+    /// @return elapsed time (since start) in milliseconds
+    inline float elapsed()
+    {
+        this->record();
+        return this->milliseconds();
+    }
+
+    /// @brief Prints the elapsed time in milliseconds to a stream
+    /// @param os output stream to print to
+    inline void print(std::ostream& os = std::cerr)
+    {
+        const float msec = this->milliseconds();
+        os << "completed in " << msec << " milliseconds" << std::endl;
+    }
+
+    /// @brief Prints a message followed by the elapsed time in milliseconds to a stream
+    /// @param msg message to print before the time
+    /// @param os stream to print to
+    inline void print(const char* msg, std::ostream& os = std::cerr)
+    {
+        os << msg;
+        this->print(os);
+    }
+
+    /// @brief Like the above method but with a std::string argument
+    inline void print(const std::string &msg, std::ostream& os = std::cerr){this->print(msg.c_str(), os);}
+
     /// @brief stop the timer
     /// @param os output stream for the message above
-    void stop(std::ostream& os = std::cerr)
+    inline void stop(std::ostream& os = std::cerr)
     {
-        float diff = this->elapsed();
-        os << "completed in " << diff << " milliseconds" << std::endl;
+        this->record();
+        this->print(os);
     }

     /// @brief stop and start the timer
     /// @param msg string message to be printed when timer is started
     /// @warning Remember to call start before restart
-    void restart(const std::string &msg, std::ostream& os = std::cerr)
+    inline void restart(const std::string &msg, std::ostream& os = std::cerr)
     {
-        this->stop();
+        this->stop(os);
         this->start(msg, os);
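+        // note: stop(os) synchronizes on the stop event and prints the previous
+        // interval, and start(msg, os) then re-arms mStart for the next one.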
    }
};// Timer
diff --git a/nanovdb/nanovdb/util/cuda/Util.h b/nanovdb/nanovdb/util/cuda/Util.h
index 8d1711b3aa..bf952832cb 100644
--- a/nanovdb/nanovdb/util/cuda/Util.h
+++ b/nanovdb/nanovdb/util/cuda/Util.h
@@ -109,6 +109,43 @@ inline cudaError_t freeAsync(void* d_ptr, cudaStream_t stream){return cudaFreeAs
 #endif

+/// @brief Returns the device ID associated with the specified pointer
+inline int ptrToDevice(void *ptr)
+{
+    cudaPointerAttributes ptrAtt;
+    cudaPointerGetAttributes(&ptrAtt, ptr);
+    return ptrAtt.device;
+}
+
+/// @brief Returns the ID of the current device
+inline int currentDevice()
+{
+    int current = 0;
+    cudaCheck(cudaGetDevice(&current));
+    return current;
+}
+
+/// @brief Returns the number of devices with compute capability greater or equal to 1.0 that are available for execution
+inline int deviceCount()
+{
+    int deviceCount = 0;
+    cudaCheck(cudaGetDeviceCount(&deviceCount));
+    return deviceCount;
+}
+
+/// @brief Print information about a specific device
+/// @param device device ID for which information will be printed
+/// @param preMsg optional message printed before the device information
+/// @param file Optional file stream to print to, e.g. stderr or stdout
+inline void printDevInfo(int device, const char *preMsg = nullptr, std::FILE* file = stderr)
+{
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, device);
+    if (preMsg) fprintf(file, "%s ", preMsg);
+    fprintf(file,"GPU #%d, named \"%s\", compute capability %d.%d, %lu GB of VRAM\n",
+            device, prop.name, prop.major, prop.minor, prop.totalGlobalMem >> 30);
+}
+
 /// @brief Simple (naive) implementation of a unique device pointer
 ///        using stream ordered memory allocation and deallocation.
 /// @tparam T Type of the device pointer
@@ -190,4 +227,4 @@ __global__ void cudaLambdaKernel(const size_t numItems, Func func, Args...
args) } #endif// __CUDACC__ -#endif// NANOVDB_UTIL_CUDA_UTIL_H_HAS_BEEN_INCLUDED \ No newline at end of file +#endif// NANOVDB_UTIL_CUDA_UTIL_H_HAS_BEEN_INCLUDED diff --git a/pendingchanges/nanovdb.txt b/pendingchanges/nanovdb.txt new file mode 100644 index 0000000000..83df0ab718 --- /dev/null +++ b/pendingchanges/nanovdb.txt @@ -0,0 +1,8 @@ +Added support for multiple GPUs to DeviceBuffer +Added a UnifiedBuffer class that wraps CUDA unified memory +Added example for multiGPU sparse convolution +Added CUDA utility functions for device queries +Added functions to independently stop and compute the elapsed time for timer classes + +Fixed ostream specializations being hidden within the nanovdb namespace +Replaced CUB's CachingDeviceAllocator with the default asynchronous stream ordered allocator in PointsToGrid for improved performance From 2d4f3e1bfecf3844ec150407f4ffe2bb9bff4217 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Sun, 22 Dec 2024 16:30:45 +1300 Subject: [PATCH 19/59] Migrating last PR changes from private repo over to this repo including: Changes to .clang-format for better include sorting Gsplat rasterization code cleanup and add unused arguments for pixel mask Signed-off-by: Jonathan Swartz --- fvdb/.clang-format | 20 +- fvdb/.github/workflows/tests.yml | 4 +- fvdb/examples/3dgs/train_segmentation.py | 30 +- fvdb/fvdb/_Cpp.pyi | 2 + fvdb/fvdb/nn/gaussian_splatting.py | 28 +- fvdb/fvdb/optim/gaussian_splat_optimizer.py | 64 +- fvdb/src/FVDB.cpp | 6 +- fvdb/src/GaussianSplatting.cpp | 153 ++-- fvdb/src/GaussianSplatting.h | 12 +- fvdb/src/GridBatch.cpp | 10 +- fvdb/src/GridBatch.h | 5 +- fvdb/src/JaggedTensor.cpp | 9 +- fvdb/src/JaggedTensor.h | 2 +- fvdb/src/SparseConvPackInfo.cpp | 6 +- fvdb/src/Types.h | 2 +- fvdb/src/detail/GridBatchImpl.h | 1 - fvdb/src/detail/TorchDeviceBuffer.cpp | 1 + fvdb/src/detail/autograd/AvgPoolGrid.h | 2 +- fvdb/src/detail/autograd/GaussianRender.cpp | 62 +- fvdb/src/detail/autograd/GaussianRender.h | 7 +- fvdb/src/detail/autograd/JaggedReduce.h | 2 +- fvdb/src/detail/autograd/ReadIntoDense.h | 7 +- fvdb/src/detail/autograd/SampleGrid.h | 4 +- fvdb/src/detail/autograd/SplatIntoGrid.h | 4 +- fvdb/src/detail/autograd/TransformPoints.h | 4 +- fvdb/src/detail/autograd/UpsampleGrid.h | 4 +- fvdb/src/detail/build/CoarseFromFine.cpp | 1 - fvdb/src/detail/build/ConvGrid.cpp | 1 - fvdb/src/detail/build/DenseGrid.cpp | 1 - fvdb/src/detail/build/EmptyGrid.cpp | 1 - fvdb/src/detail/build/FineFromCoarse.cpp | 1 - fvdb/src/detail/build/FromMesh.cpp | 1 - .../build/NearestNeighborGridFromPoints.cpp | 1 - .../src/detail/build/PaddedGridFromCoords.cpp | 1 - fvdb/src/detail/build/PaddedGridFromGrid.cpp | 1 - .../src/detail/build/PaddedGridFromPoints.cpp | 1 - fvdb/src/detail/io/IO.h | 1 + fvdb/src/detail/io/LoadNanovdb.cpp | 1 - fvdb/src/detail/io/SaveNanoVDB.cpp | 4 +- fvdb/src/detail/ops/BuildDeviceGrid.cu | 5 +- fvdb/src/detail/ops/JCat0.cu | 2 +- fvdb/src/detail/ops/JOffsetsFromJIdx.cu | 2 +- fvdb/src/detail/ops/JaggedTensorIndex.cu | 1 + fvdb/src/detail/ops/PaddedIJKForMesh.cu | 3 +- fvdb/src/detail/ops/VolumeRender.cu | 8 +- .../backend/SparseConvolutionCutlass.cu | 11 +- .../backend/SparseConvolutionHalo.cu | 2 +- .../backend/SparseConvolutionHaloGrad.cu | 2 +- .../backend/SparseConvolutionImplicitGEMM.cu | 4 +- .../SparseConvolutionImplicitGEMMGrad.cu | 4 +- ...SparseConvolutionImplicitGEMMGradSorted.cu | 4 +- .../SparseConvolutionImplicitGEMMSorted.cu | 4 +- .../backend/SparseConvolutionKernelMap.cu | 1 - .../backend/SparseConvolutionLggs.cu | 7 +- 
.../convolution/pack_info/BrickHaloBuffer.cu | 2 +- .../pack_info/ConvolutionKernelMap.cu | 5 +- .../pack_info/IGEMMBitOperations.cu | 1 - .../gsplat/GaussianFullyFusedProjection.cu | 2 +- .../GaussianFullyFusedProjectionJagged.cu | 2 +- .../ops/gsplat/GaussianRasterizeBackward.cu | 548 ++++++++++++++ .../ops/gsplat/GaussianRasterizeForward.cu | 403 ++++++++++ .../ops/gsplat/GaussianTileIntersection.cu | 5 +- fvdb/src/detail/ops/gsplat/GsplatTypes.cuh | 7 +- .../detail/ops/gsplat/SphericalHarmonics.cu | 704 ++++++++++-------- fvdb/src/detail/ops/gsplat/VectorTypes.cuh | 6 +- fvdb/src/detail/ops/jagged/JaggedReduce.cu | 1 - fvdb/src/detail/ops/jagged/JaggedSort.cu | 1 - .../utils/nanovdb/TorchNanoConversions.h | 1 + fvdb/src/python/Bindings.cpp | 6 +- fvdb/src/python/GridBatchBinding.cpp | 1 - fvdb/src/python/JaggedTensorBinding.cpp | 1 - fvdb/src/python/TypeCasters.h | 3 +- fvdb/tests/unit/test_gsplat.py | 6 +- 73 files changed, 1682 insertions(+), 550 deletions(-) create mode 100644 fvdb/src/detail/ops/gsplat/GaussianRasterizeBackward.cu create mode 100644 fvdb/src/detail/ops/gsplat/GaussianRasterizeForward.cu diff --git a/fvdb/.clang-format b/fvdb/.clang-format index 0e90463ff5..fcd55348c8 100644 --- a/fvdb/.clang-format +++ b/fvdb/.clang-format @@ -74,13 +74,23 @@ BreakConstructorInitializersBeforeComma: false BreakInheritanceList: BeforeColon BreakStringLiterals: false Cpp11BracedListStyle: false -IncludeBlocks: Preserve +IncludeBlocks: Regroup IncludeIsMainRegex: "$" IncludeCategories: - - Regex: '^<.*\.(h|cuh)>' - Priority: 1 - - Regex: ".*" - Priority: 2 + - Regex: '^"' # quoted includes + Priority: 1 + - Regex: '^<(detail/|Types.h|JaggedTensor.h|GridBatch.h|SparseConvPackInfo.h|Config.h|FVDB.h|GaussianSplatting.h)' # fvdb includes + Priority: 1 + - Regex: '^ Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor | Any]]: ... def gaussian_render_depth( means: JaggedTensorOrTensor, @@ -559,6 +560,7 @@ def gaussian_render_depth( eps2d: float = 0.3, antialias: bool = False, return_debug_info=False, + pixels_to_render: Optional[JaggedTensorOrTensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor | Any]]: ... def precompute_gaussian_render_state( means: torch.Tensor, diff --git a/fvdb/fvdb/nn/gaussian_splatting.py b/fvdb/fvdb/nn/gaussian_splatting.py index c33ae722c4..e39862d602 100644 --- a/fvdb/fvdb/nn/gaussian_splatting.py +++ b/fvdb/fvdb/nn/gaussian_splatting.py @@ -41,16 +41,20 @@ def __init__( opacities = torch.logit(torch.full((num_means,), initial_opacity)) # [N,] # TODO (Francis): Don't hardcode number of channels to 3 here - _sh_and_colors = torch.zeros((num_means, (sh_degree + 1) ** 2, 3)) # [N, K, 3] - _sh_and_colors[:, 0, :] = self._rgb_to_sh(rgbs) + _sh_and_colors = torch.zeros(((sh_degree + 1) ** 2, num_means, 3)) # [N, K, 3] + _sh_and_colors[0, :, :] = self._rgb_to_sh(rgbs) # sh = nn.Parameter(_sh_and_colors) # [N, K, 3] means = torch.nn.Parameter(means) # [N, 3] scales = torch.nn.Parameter(scales) # [N, 3] quats = torch.nn.Parameter(quats) # [N, 4] opacities = torch.nn.Parameter(opacities) # [N,] - sh_0 = torch.nn.Parameter(_sh_and_colors[:, :1, :]) # [N, 1, 3] - sh_n = torch.nn.Parameter(_sh_and_colors[:, 1:, :]) # [N, K, 3] + + # FIXME (Francis): I don't like splitting these but during training we need + # a seperate learning rate for each of them. 
I wonder if we can + # just create a view for the optimizer but keep them in the same tensor + sh_0 = torch.nn.Parameter(_sh_and_colors[:1, :, :]) # [1, N, 3] + sh_n = torch.nn.Parameter(_sh_and_colors[1:, :, :]) # [K, N, 3] self._params = torch.nn.ParameterDict( { @@ -131,6 +135,13 @@ def from_random( sh_degree=sh_degree, ) + def set_spherical_harmonic_coeffs(self, new_sh_coeffs: torch.Tensor): + self._params["sh0"].data = new_sh_coeffs[:1, :, :] + self._params["shN"].data = new_sh_coeffs[1:, :, :] + k = new_sh_coeffs.shape[1] + self.sh_degree = int(math.sqrt(k) - 1) + self.clear_cache() + @property def num_gaussians(self) -> int: return self._params["means"].shape[0] @@ -230,7 +241,6 @@ def forward( # if camera_model not in ["pinhole", "ortho", "fisheye"]: # raise ValueError(f"Invalid camera_model {camera_model}") - sh_degree = self.sh_degree if sh_degree < 0 else sh_degree if sh_degree > self.sh_degree: raise ValueError(f"sh_degree {sh_degree} is larger than the maximum {self.sh_degree}") @@ -239,7 +249,13 @@ def forward( quats = self._params["quats"] # [N, 4] scales = torch.exp(self._params["scales"]) # [N, 3] opacities = torch.sigmoid(self._params["opacities"]) # [N,] - sh = torch.cat([self._params["sh0"], self._params["shN"]], 1) # [N, K, 3] + # FIXME (Francis): It sucks that we need to concatenate here + # but we do this so we can optimize + # these parameters seperately. I wonder if we can + # fix this + sh0 = self._params["sh0"] # [1, N, 3] + shN = self._params["shN"] # [K, N, 3] + sh = sh0 if shN.numel() == 0 or sh_degree == 0 else torch.cat([sh0, shN], 0) # [N, K, 3] image_crop = (0, 0, image_w, image_h) if image_crop is None else image_crop if cache_info: diff --git a/fvdb/fvdb/optim/gaussian_splat_optimizer.py b/fvdb/fvdb/optim/gaussian_splat_optimizer.py index a8c75d8a3f..30535abb1d 100644 --- a/fvdb/fvdb/optim/gaussian_splat_optimizer.py +++ b/fvdb/fvdb/optim/gaussian_splat_optimizer.py @@ -445,23 +445,39 @@ def _normalized_quat_to_rotmat(quat_: torch.Tensor) -> torch.Tensor: def param_fn(name: str, p: torch.Tensor) -> torch.Tensor: repeats = [split_factor] + [1] * (p.dim() - 1) + cat_dim = 0 if name == "means": p_split = (p[sel] + samples).reshape(-1, 3) # [S*N, 3] + p_rest = p[rest] elif name == "scales": # TODO: Adjust scale factor for splitting p_split = torch.log(scales / 1.6).repeat(split_factor, 1) # [2N, 3] + p_rest = p[rest] elif name == "opacities" and revised_opacity: new_opacities = 1.0 - torch.sqrt(1.0 - torch.sigmoid(p[sel])) p_split = torch.logit(new_opacities).repeat(repeats) # [2N] + p_rest = p[rest] + elif name == "sh0" or name == "shN": + repeats = [1] + [split_factor] + [1] * (p.dim() - 2) + p_split = p[:, sel, ...].repeat(repeats) # [K, 2N, D] + p_rest = p[:, rest, ...] + cat_dim = 1 else: p_split = p[sel].repeat(repeats) - p_new = torch.cat([p[rest], p_split]) - p_new = torch.nn.Parameter(p_new) + p_rest = p[rest] + p_new = torch.nn.Parameter(torch.cat([p_rest, p_split], dim=cat_dim)) return p_new - def optimizer_fn(_: str, v: torch.Tensor) -> torch.Tensor: - v_split = torch.zeros((split_factor * len(sel), *v.shape[1:]), device=device) - return torch.cat([v[rest], v_split]) + def optimizer_fn(name: str, key: str, v: torch.Tensor) -> torch.Tensor: + if name == "sh0" or name == "shN": + v_split = torch.zeros((v.shape[0], split_factor * len(sel), *v.shape[2:]), device=device) + v_rest = v[:, rest, ...] 
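+                # sh0/shN live in [K, N, D] layout (SH basis first), so both the split
+                # selection and the zero-padded optimizer state (e.g. Adam's exp_avg,
+                # which mirrors the parameter's shape) are rebuilt along dim 1, the
+                # Gaussian axis, instead of dim 0.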
+ cat_dim = 1 + else: + v_split = torch.zeros((split_factor * len(sel), *v.shape[1:]), device=device) + v_rest = v[rest] + cat_dim = 0 + return torch.cat([v_rest, v_split], dim=cat_dim) # update the parameters and the state in the optimizers self._update_param_with_optimizer(param_fn, optimizer_fn) @@ -483,12 +499,25 @@ def _duplicate_params(self, mask: torch.Tensor, dup_factor: int = 1): device = mask.device sel = torch.where(mask)[0] - def param_fn(_: str, p: torch.Tensor) -> torch.Tensor: - return torch.nn.Parameter(torch.cat([p] + [p[sel]] * dup_factor)) + def param_fn(name: str, p: torch.Tensor) -> torch.Tensor: + cat_dim = 0 + repeats = [dup_factor] + [1] * (p.dim() - 1) + if name == "sh0" or name == "shN": + repeats = [1, dup_factor, 1] + cat_dim = 1 + p_sel = p[:, sel, ...] + else: + p_sel = p[sel] + + return torch.nn.Parameter(torch.cat([p, p_sel.repeat(repeats)], dim=cat_dim)) - def optimizer_fn(_: str, v: torch.Tensor) -> torch.Tensor: - zpad = torch.zeros((len(sel), *v.shape[1:]), device=device) - return torch.cat([v] + [zpad] * dup_factor) + def optimizer_fn(name: str, key: str, v: torch.Tensor) -> torch.Tensor: + if name == "sh0" or name == "shN": + zpad = torch.zeros(v.shape[0], len(sel) * dup_factor, *v.shape[2:], device=v.device, dtype=v.dtype) + return torch.cat([v, zpad], dim=1) + else: + zpad = torch.zeros((len(sel) * dup_factor, *v.shape[1:]), device=device) + return torch.cat([v, zpad]) # update the parameters and the state in the optimizers self._update_param_with_optimizer(param_fn, optimizer_fn) @@ -508,10 +537,15 @@ def _remove_params(self, mask: torch.Tensor): sel = torch.where(~mask)[0] def param_fn(name: str, p: torch.Tensor) -> torch.Tensor: + if name == "sh0" or name == "shN": + return torch.nn.Parameter(p[:, sel, ...]) return torch.nn.Parameter(p[sel]) - def optimizer_fn(key: str, v: torch.Tensor) -> torch.Tensor: - return v[sel] + def optimizer_fn(name: str, key: str, v: torch.Tensor) -> torch.Tensor: + if name == "sh0" or name == "shN": + return v[:, sel, ...] + else: + return v[sel] # update the parameters and the state in the optimizers self._update_param_with_optimizer(param_fn, optimizer_fn) @@ -537,7 +571,7 @@ def param_fn(name: str, p: torch.Tensor) -> torch.Tensor: else: raise ValueError(f"Unexpected parameter name: {name}") - def optimizer_fn(key: str, v: torch.Tensor) -> torch.Tensor: + def optimizer_fn(name: str, key: str, v: torch.Tensor) -> torch.Tensor: return torch.zeros_like(v) # update the parameters and the state in the optimizers @@ -547,7 +581,7 @@ def optimizer_fn(key: str, v: torch.Tensor) -> torch.Tensor: def _update_param_with_optimizer( self, param_fn: Callable[[str, torch.Tensor], torch.Tensor], - optimizer_fn: Callable[[str, torch.Tensor], torch.Tensor], + optimizer_fn: Callable[[str, str, torch.Tensor], torch.Tensor], names: Union[List[str], None] = None, ): """Update the parameters and the state in the optimizers with defined functions. 
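To make the new three-argument optimizer_fn contract concrete, here is a minimal,
self-contained sketch (illustrative only, not code from this patch; the parameter
name "means", the state key "exp_avg", and the duplication count are hypothetical).
It pads Adam-style state with zeros when Gaussians are appended, mirroring the
_duplicate_params logic above:

    import torch

    def optimizer_fn(name: str, key: str, v: torch.Tensor) -> torch.Tensor:
        num_dup = 4  # hypothetical number of duplicated Gaussians
        # "means"-like state pads along dim 0; sh0/shN state would pad along dim 1
        zpad = torch.zeros((num_dup, *v.shape[1:]), dtype=v.dtype, device=v.device)
        return torch.cat([v, zpad], dim=0)

    exp_avg = torch.zeros(10, 3)  # Adam state mirroring a [10, 3] "means" parameter
    assert optimizer_fn("means", "exp_avg", exp_avg).shape == (14, 3)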
@@ -577,7 +611,7 @@ def _update_param_with_optimizer( for key in p_state.keys(): if key != "step": v = p_state[key] - p_state[key] = optimizer_fn(key, v) + p_state[key] = optimizer_fn(name, key, v) p_new = param_fn(name, p) optimizer.param_groups[i]["params"] = [p_new] optimizer.state[p_new] = p_state diff --git a/fvdb/src/FVDB.cpp b/fvdb/src/FVDB.cpp index b23fc27f1d..d14f60b897 100644 --- a/fvdb/src/FVDB.cpp +++ b/fvdb/src/FVDB.cpp @@ -3,9 +3,9 @@ // #include "FVDB.h" -#include "detail/autograd/Autograd.h" -#include "detail/io/IO.h" -#include "detail/ops/jagged/JaggedOps.h" +#include +#include +#include #include diff --git a/fvdb/src/GaussianSplatting.cpp b/fvdb/src/GaussianSplatting.cpp index b21c4a6d9e..ccca44d112 100644 --- a/fvdb/src/GaussianSplatting.cpp +++ b/fvdb/src/GaussianSplatting.cpp @@ -2,31 +2,23 @@ // SPDX-License-Identifier: Apache-2.0 // #include "GaussianSplatting.h" -#include "detail/autograd/Autograd.h" -#include "detail/ops/Ops.h" + +#include +#include namespace fvdb { namespace { torch::Tensor -evaluateSphericalHarmonics(const torch::Tensor &directions, const torch::Tensor &sh_coeffs, - const torch::Tensor radii = torch::Tensor(), - const int sh_degree_to_use = -1) { - const int K = sh_coeffs.size(-2); // number of SH bases +evaluateSphericalHarmonics(const torch::optional directions, + const torch::Tensor &sh_coeffs, + const torch::Tensor radii = torch::Tensor(), + const int sh_degree_to_use = -1) { + const int K = sh_coeffs.size(0); // number of SH bases const int actual_sh_degree = sh_degree_to_use < 0 ? (std::sqrt(K) - 1) : sh_degree_to_use; - TORCH_CHECK(K >= (actual_sh_degree + 1) * (actual_sh_degree + 1), "K must be at least (sh_degree_to_use + 1)^2"); - // TORCH_CHECK(sh_coeffs_or_diffuse_data.is_contiguous(), "sh_coeffs must be - // contiguous"); - if (K == 1) { - return sh_coeffs.squeeze(-2) * 0.2820947917738781; - } - if (actual_sh_degree == 0) { - return sh_coeffs.index({ torch::indexing::Ellipsis, 0, torch::indexing::Slice() }) * - 0.2820947917738781; - } auto sh_results = detail::autograd::SphericalHarmonics::apply(actual_sh_degree, directions, sh_coeffs, radii); @@ -43,10 +35,10 @@ computeGaussianRenderStateUnbatched(const torch::Tensor &means, const torch::Ten const int tile_size, const float radius_clip, const float eps2d, bool antialias, bool render_depth_channel, bool render_depth_only) { - const int C = viewmats.size(0); // number of cameras - const int N = means.size(0); // number of gaussians - const int K = sh_coeffs.size(-2); // number of SH bases - const int D = sh_coeffs.size(-1); // Dimension of output + const int C = viewmats.size(0); // number of cameras + const int N = means.size(0); // number of gaussians + const int K = render_depth_only ? 1 : sh_coeffs.size(0); // number of SH bases + const int D = render_depth_only ? 
1 : sh_coeffs.size(-1); // Dimension of output TORCH_CHECK(means.sizes() == torch::IntArrayRef({ N, 3 }), "means must have shape (N, 3)"); TORCH_CHECK(quats.sizes() == torch::IntArrayRef({ N, 4 }), "quats must have shape (N, 4)"); @@ -55,8 +47,8 @@ computeGaussianRenderStateUnbatched(const torch::Tensor &means, const torch::Ten TORCH_CHECK(viewmats.sizes() == torch::IntArrayRef({ C, 4, 4 }), "viewmats must have shape (C, 4, 4)"); TORCH_CHECK(Ks.sizes() == torch::IntArrayRef({ C, 3, 3 }), "Ks must have shape (C, 3, 3)"); - TORCH_CHECK(render_depth_only || sh_coeffs.sizes() == torch::IntArrayRef({ N, K, D }), - "sh_coeffs must have shape (N, K, 3)"); + TORCH_CHECK(render_depth_only || sh_coeffs.sizes() == torch::IntArrayRef({ K, N, D }), + "sh_coeffs must have shape (K, N, D)"); TORCH_CHECK(means.is_contiguous(), "means must be contiguous"); TORCH_CHECK(quats.is_contiguous(), "quats must be contiguous"); @@ -85,17 +77,30 @@ computeGaussianRenderStateUnbatched(const torch::Tensor &means, const torch::Ten if (render_depth_only) { colors = depths.unsqueeze(-1); // [C, N, 1] } else { - // You're using directional colors, evaluate them from spherical harmonics - // [differentiable] - const torch::Tensor camtoworlds = torch::inverse(viewmats); - const torch::Tensor dirs = - means.index( - { torch::indexing::None, torch::indexing::Slice(), torch::indexing::Slice() }) - - camtoworlds.index({ torch::indexing::Slice(), torch::indexing::None, - torch::indexing::Slice(0, 3), 3 }); - colors = evaluateSphericalHarmonics(dirs, sh_coeffs.unsqueeze(0).expand({ C, -1, -1, -1 }), - radii, sh_degree_to_use); - colors = torch::clamp_min(colors, colors + 0.5f); // [C, N, 3] + if (K == 1 || sh_degree_to_use == 0) { + // Handle the case where we only have degree zero spherical harmonics, which just + // represent diffuse colors. This means that each Gaussian receives the same color + // regardless of which camera sees it, and we can just expand the colors to the correct + // shape (without reallocating memory). i.e. the color tensor has shape [C, N, D] but + // only allocates NxD floats in memory. + // This is useful for rendering e.g. high dimensional diffuse features. + colors = evaluateSphericalHarmonics(torch::nullopt, sh_coeffs.unsqueeze(1), radii, + sh_degree_to_use); + colors = colors.expand({ C, -1, -1 }); + + } else { + // FIXME (Francis): Do this in the kernel instead of materializing a large + // tensor here. 
It's a bit annoying because we'll have to update + // the current backward pass + const torch::Tensor camtoworlds = torch::inverse(viewmats); + const torch::Tensor dirs = + means.index( + { torch::indexing::None, torch::indexing::Slice(), torch::indexing::Slice() }) - + camtoworlds.index({ torch::indexing::Slice(), torch::indexing::None, + torch::indexing::Slice(0, 3), 3 }); // [1, N, 3] - [C, 1, 3] + colors = evaluateSphericalHarmonics( + dirs, sh_coeffs.unsqueeze(1).expand({ -1, C, -1, -1 }), radii, sh_degree_to_use); + } if (render_depth_channel) { colors = torch::cat({ colors, depths.unsqueeze(-1) }, -1); // [C, N, D + 1] @@ -112,9 +117,8 @@ computeGaussianRenderStateUnbatched(const torch::Tensor &means, const torch::Ten means2d, radii, depths, at::nullopt, num_cameras, tile_size, num_tiles_h, num_tiles_w); }); - const torch::Tensor tile_offsets = std::get<0>(tile_intersections); - const torch::Tensor tile_gaussian_ids = std::get<1>(tile_intersections); - + const torch::Tensor tile_offsets = std::get<0>(tile_intersections); // [C, TH, TW] + const torch::Tensor tile_gaussian_ids = std::get<1>(tile_intersections); // [M] return { means2d, conics, opacities_compensated, radii, colors, tile_offsets, tile_gaussian_ids }; @@ -122,17 +126,19 @@ computeGaussianRenderStateUnbatched(const torch::Tensor &means, const torch::Ten // Gaussian render for a single torch Tensor std::tuple> -gaussianRenderUnbatchedInternal( - const torch::Tensor &means, const torch::Tensor &quats, const torch::Tensor &scales, - const torch::Tensor &opacities, const torch::Tensor &sh_coeffs_or_diffuse_data, - const torch::Tensor &viewmats, const torch::Tensor &Ks, const uint32_t image_width, - const uint32_t image_height, const float near_plane, const float far_plane, - const int sh_degree_to_use, const int tile_size, const float radius_clip, const float eps2d, - bool antialias, bool render_depth_channel, bool return_debug_info, bool render_depth_only) { +gaussianRenderUnbatchedInternal(const torch::Tensor &means, const torch::Tensor &quats, + const torch::Tensor &scales, const torch::Tensor &opacities, + const torch::Tensor &sh_coeffs, const torch::Tensor &viewmats, + const torch::Tensor &Ks, const uint32_t image_width, + const uint32_t image_height, const float near_plane, + const float far_plane, const int sh_degree_to_use, + const int tile_size, const float radius_clip, const float eps2d, + bool antialias, bool render_depth_channel, bool return_debug_info, + bool render_depth_only) { std::array renderState = computeGaussianRenderStateUnbatched( - means, quats, scales, opacities, sh_coeffs_or_diffuse_data, viewmats, Ks, image_width, - image_height, near_plane, far_plane, sh_degree_to_use, tile_size, radius_clip, eps2d, - antialias, render_depth_channel, render_depth_only); + means, quats, scales, opacities, sh_coeffs, viewmats, Ks, image_width, image_height, + near_plane, far_plane, sh_degree_to_use, tile_size, radius_clip, eps2d, antialias, + render_depth_channel, render_depth_only); torch::Tensor means2d = renderState[0]; torch::Tensor conics = renderState[1]; @@ -176,11 +182,11 @@ gaussianRenderInternal(const JaggedTensor &means, // [N1 + N2 + ..., 3] const int tile_size, const float radius_clip, const float eps2d, bool antialias, bool render_depth_channel, bool return_debug_info, bool render_depth_only) { - const int ccz = viewmats.rsize(0); // number of cameras - const int ggz = means.rsize(0); // number of gaussians - const int K = sh_coeffs.rsize(-2); // number of SH bases + const int ccz = viewmats.rsize(0); 
// number of cameras + const int ggz = means.rsize(0); // number of gaussians + const int D = render_depth_only ? 1 : sh_coeffs.rsize(-1); // Dimension of output - using namespace torch::indexing; // For the Slice operation + using namespace torch::indexing; // For the Slice operation TORCH_CHECK(means.rsizes() == torch::IntArrayRef({ ggz, 3 }), "means must have shape (ggz, 3)"); TORCH_CHECK(quats.rsizes() == torch::IntArrayRef({ ggz, 4 }), "quats must have shape (ggz, 4)"); @@ -191,8 +197,6 @@ gaussianRenderInternal(const JaggedTensor &means, // [N1 + N2 + ..., 3] TORCH_CHECK(viewmats.rsizes() == torch::IntArrayRef({ ccz, 4, 4 }), "viewmats must have shape (C, 4, 4)"); TORCH_CHECK(Ks.rsizes() == torch::IntArrayRef({ ccz, 3, 3 }), "Ks must have shape (ccz, 3, 3)"); - TORCH_CHECK(render_depth_only || sh_coeffs.rsizes() == torch::IntArrayRef({ ggz, K, 3 }), - "sh_coeffs must have shape (ggz, K, 3)"); TORCH_CHECK(means.is_contiguous(), "means must be contiguous"); TORCH_CHECK(quats.is_contiguous(), "quats must be contiguous"); @@ -209,6 +213,13 @@ gaussianRenderInternal(const JaggedTensor &means, // [N1 + N2 + ..., 3] return_debug_info, render_depth_only); } + // Check after we dispatch the unbatched version since the unbatched version accepts a + // [K, N, D] tensor for sh_coeffs while the batched version accepts a [ggz, K, D] tensor, + // which gets permuted later on. + const int K = render_depth_only ? 1 : sh_coeffs.rsize(-2); // number of SH bases + TORCH_CHECK(render_depth_only || sh_coeffs.rsizes() == torch::IntArrayRef({ ggz, K, D }), + "sh_coeffs must have shape (ggz, K, D)"); + // TODO: this part is very convoluted. But I don't have a better way of coding it without // customized CUDA kernels. The idea is that given Gaussians with shape [\sum(N_i), ...] 
and // cameras with shape [\sum(C_i), ...], we would calculate the intersection of each Gaussian @@ -271,12 +282,14 @@ gaussianRenderInternal(const JaggedTensor &means, // [N1 + N2 + ..., 3] } else { // Colors from SH coefficients [differentiable] const torch::Tensor sh_coeffs_batched = - sh_coeffs.jdata().index({ gaussian_ids, Slice(), Slice() }); // [nnz, K, 3] + sh_coeffs.jdata() + .permute({ 1, 0, 2 }) + .index({ Slice(), gaussian_ids, Slice() }); // [K, nnz, 3] + const torch::Tensor camtoworlds = torch::inverse(viewmats.jdata()); // [ccz, 4, 4] const torch::Tensor dirs = means.jdata().index({ gaussian_ids, Slice() }) - camtoworlds.index({ camera_ids, Slice(None, 3), 3 }); colors = evaluateSphericalHarmonics(dirs, sh_coeffs_batched, radii, sh_degree_to_use); - colors = torch::clamp_min(colors, colors + 0.5f); // [ggz, 3] if (render_depth_channel) { colors = torch::cat({ colors, depths.index({ gaussian_ids }).unsqueeze(-1) }, -1); @@ -323,21 +336,17 @@ projectGaussiansToImages(const torch::Tensor &means, const torch::Tensor &quats, // Gaussian render for a single torch Tensor std::unordered_map -precomputeGaussianRenderStateUnbatched(const torch::Tensor &means, const torch::Tensor &quats, - const torch::Tensor &scales, const torch::Tensor &opacities, - const torch::Tensor &sh_coeffs_or_diffuse_data, - const torch::Tensor &viewmats, const torch::Tensor &Ks, - const uint32_t image_width, const uint32_t image_height, - const float near_plane, const float far_plane, - const int sh_degree_to_use, const int tile_size, - const float radius_clip, const float eps2d, bool antialias, - bool render_depth_channel) { - const bool render_depth_only = false; - - std::array renderState = computeGaussianRenderStateUnbatched( - means, quats, scales, opacities, sh_coeffs_or_diffuse_data, viewmats, Ks, image_width, - image_height, near_plane, far_plane, sh_degree_to_use, tile_size, radius_clip, eps2d, - antialias, render_depth_channel, render_depth_only); +precomputeGaussianRenderStateUnbatched( + const torch::Tensor &means, const torch::Tensor &quats, const torch::Tensor &scales, + const torch::Tensor &opacities, const torch::Tensor &sh_coeffs, const torch::Tensor &viewmats, + const torch::Tensor &Ks, const uint32_t image_width, const uint32_t image_height, + const float near_plane, const float far_plane, const int sh_degree_to_use, const int tile_size, + const float radius_clip, const float eps2d, bool antialias, bool render_depth_channel) { + const bool render_depth_only = false; + std::array renderState = computeGaussianRenderStateUnbatched( + means, quats, scales, opacities, sh_coeffs, viewmats, Ks, image_width, image_height, + near_plane, far_plane, sh_degree_to_use, tile_size, radius_clip, eps2d, antialias, + render_depth_channel, render_depth_only); std::unordered_map info; info["means2d"] = renderState[0]; @@ -376,7 +385,8 @@ gaussianRender(const JaggedTensor &means, // [N1 + N2 + ..., 3] const uint32_t image_width, const uint32_t image_height, const float near_plane, const float far_plane, const int sh_degree_to_use, const int tile_size, const float radius_clip, const float eps2d, bool antialias, - bool render_depth_channel, bool return_debug_info) { + bool render_depth_channel, bool return_debug_info, + const torch::optional pixels_to_render) { return gaussianRenderInternal( means, quats, scales, opacities, sh_coeffs, viewmats, Ks, image_width, image_height, near_plane, far_plane, sh_degree_to_use, tile_size, radius_clip, eps2d, antialias, @@ -392,7 +402,8 @@ gaussianRenderDepth(const 
JaggedTensor &means, // [N1 + N2 + ..., 3] const JaggedTensor &Ks, // [C1 + C2 + ..., 3, 3] const uint32_t image_width, const uint32_t image_height, const float near_plane, const float far_plane, const int tile_size, const float radius_clip, - const float eps2d, bool antialias, bool return_debug_info) { + const float eps2d, bool antialias, bool return_debug_info, + const torch::optional pixels_to_render) { fvdb::JaggedTensor dummy_coeffs; return gaussianRenderInternal( means, quats, scales, opacities, dummy_coeffs, viewmats, Ks, image_width, image_height, diff --git a/fvdb/src/GaussianSplatting.h b/fvdb/src/GaussianSplatting.h index 8a087990b8..ab3ef233df 100644 --- a/fvdb/src/GaussianSplatting.h +++ b/fvdb/src/GaussianSplatting.h @@ -4,10 +4,11 @@ #ifndef FVDB_GAUSSIANSPLATTING_H #define FVDB_GAUSSIANSPLATTING_H +#include "JaggedTensor.h" + #include -#include -#include "JaggedTensor.h" +#include namespace fvdb { @@ -54,7 +55,8 @@ gaussianRender(const JaggedTensor &means, // [N1 + N2 + ..., 3] const float near_plane = 0.01, const float far_plane = 1e10, const int sh_degree_to_use = -1, const int tile_size = 16, const float radius_clip = 0.0, const float eps2d = 0.3, bool antialias = false, - bool render_depth_channel = false, bool return_debug_info = false); + bool render_depth_channel = false, bool return_debug_info = false, + const torch::optional pixels_to_render = torch::nullopt); std::tuple> gaussianRenderDepth(const JaggedTensor &means, // [N1 + N2 + ..., 3] @@ -66,8 +68,8 @@ gaussianRenderDepth(const JaggedTensor &means, // [N1 + N2 + ..., 3] const uint32_t image_width, const uint32_t image_height, const float near_plane = 0.01, const float far_plane = 1e10, const int tile_size = 16, const float radius_clip = 0.0, - const float eps2d = 0.3, bool antialias = false, - bool return_debug_info = false); + const float eps2d = 0.3, bool antialias = false, bool return_debug_info = false, + const torch::optional pixels_to_render = torch::nullopt); } // namespace fvdb diff --git a/fvdb/src/GridBatch.cpp b/fvdb/src/GridBatch.cpp index 2e1307ccf6..ac1d2ee72c 100644 --- a/fvdb/src/GridBatch.cpp +++ b/fvdb/src/GridBatch.cpp @@ -4,11 +4,11 @@ #include "GridBatch.h" #include "FVDB.h" -#include "detail/GridBatchImpl.h" -#include "detail/autograd/Autograd.h" -#include "detail/build/Build.h" -#include "detail/io/IO.h" -#include "detail/ops/Ops.h" +#include +#include +#include +#include +#include namespace fvdb { diff --git a/fvdb/src/GridBatch.h b/fvdb/src/GridBatch.h index 04bcc42d97..cf299d82da 100644 --- a/fvdb/src/GridBatch.h +++ b/fvdb/src/GridBatch.h @@ -4,11 +4,10 @@ #ifndef FVDB_GRIDBATCH_H #define FVDB_GRIDBATCH_H -#include "detail/GridBatchImpl.h" -#include "detail/utils/Utils.h" - #include "JaggedTensor.h" #include "Types.h" +#include +#include #include #include diff --git a/fvdb/src/JaggedTensor.cpp b/fvdb/src/JaggedTensor.cpp index c83ba25099..9f701a1eea 100644 --- a/fvdb/src/JaggedTensor.cpp +++ b/fvdb/src/JaggedTensor.cpp @@ -4,11 +4,10 @@ #include "JaggedTensor.h" #include "Config.h" - -#include "detail/autograd/JaggedReduce.h" -#include "detail/ops/Ops.h" -#include "detail/ops/jagged/JaggedOps.h" -#include "detail/utils/Utils.h" +#include +#include +#include +#include namespace fvdb { diff --git a/fvdb/src/JaggedTensor.h b/fvdb/src/JaggedTensor.h index e6e70c8965..2c0e910983 100644 --- a/fvdb/src/JaggedTensor.h +++ b/fvdb/src/JaggedTensor.h @@ -4,7 +4,7 @@ #ifndef FVDB_JAGGEDTENSOR_H #define FVDB_JAGGEDTENSOR_H -#include "detail/utils/Utils.h" +#include #include #include diff --git 
a/fvdb/src/SparseConvPackInfo.cpp b/fvdb/src/SparseConvPackInfo.cpp index c3e040cce0..f386c93632 100644 --- a/fvdb/src/SparseConvPackInfo.cpp +++ b/fvdb/src/SparseConvPackInfo.cpp @@ -3,9 +3,9 @@ // #include "SparseConvPackInfo.h" -#include "detail/autograd/Autograd.h" -#include "detail/ops/Ops.h" -#include "detail/ops/convolution/pack_info/PackInfoOps.h" +#include +#include +#include namespace fvdb { diff --git a/fvdb/src/Types.h b/fvdb/src/Types.h index cacdbe67f5..0a3d075549 100644 --- a/fvdb/src/Types.h +++ b/fvdb/src/Types.h @@ -4,7 +4,7 @@ #ifndef FVDB_TYPES_H #define FVDB_TYPES_H -#include "detail/TypesImpl.h" +#include #include diff --git a/fvdb/src/detail/GridBatchImpl.h b/fvdb/src/detail/GridBatchImpl.h index d11ea1c1ec..2f446eaad7 100644 --- a/fvdb/src/detail/GridBatchImpl.h +++ b/fvdb/src/detail/GridBatchImpl.h @@ -7,7 +7,6 @@ #include "TorchDeviceBuffer.h" #include "VoxelCoordTransform.h" #include "utils/Utils.h" - #include #include diff --git a/fvdb/src/detail/TorchDeviceBuffer.cpp b/fvdb/src/detail/TorchDeviceBuffer.cpp index 3b62782570..ba1df44da0 100644 --- a/fvdb/src/detail/TorchDeviceBuffer.cpp +++ b/fvdb/src/detail/TorchDeviceBuffer.cpp @@ -8,6 +8,7 @@ #include #include + #include // for cudaMalloc/cudaMallocManaged/cudaFree namespace nanovdb { diff --git a/fvdb/src/detail/autograd/AvgPoolGrid.h b/fvdb/src/detail/autograd/AvgPoolGrid.h index e90b481c2a..72dcde4829 100644 --- a/fvdb/src/detail/autograd/AvgPoolGrid.h +++ b/fvdb/src/detail/autograd/AvgPoolGrid.h @@ -4,7 +4,7 @@ #ifndef FVDB_DETAIL_AUTOGRAD_AVGPOOLGRID_H #define FVDB_DETAIL_AUTOGRAD_AVGPOOLGRID_H -#include "detail/GridBatchImpl.h" +#include #include diff --git a/fvdb/src/detail/autograd/GaussianRender.cpp b/fvdb/src/detail/autograd/GaussianRender.cpp index 5c1409a386..ecb44dcee3 100644 --- a/fvdb/src/detail/autograd/GaussianRender.cpp +++ b/fvdb/src/detail/autograd/GaussianRender.cpp @@ -3,8 +3,8 @@ // #include "GaussianRender.h" -#include "detail/ops/Ops.h" -#include "detail/utils/Utils.h" +#include +#include namespace fvdb { namespace detail { @@ -13,15 +13,20 @@ namespace autograd { SphericalHarmonics::variable_list SphericalHarmonics::forward( SphericalHarmonics::AutogradContext *ctx, const int sh_degree_to_use, - const SphericalHarmonics::Variable &dirs, // (C, N, 3) or (N, 3) - const SphericalHarmonics::Variable &sh_coeffs, // (C, M, K, D) or (N, K, D) - const SphericalHarmonics::Variable &radii // (C, N) or (N,) (optional) + const torch::optional maybe_dirs, // (C, N, 3) or (N, 3) + const SphericalHarmonics::Variable &sh_coeffs, // (C, M, K, D) or (N, K, D) + const SphericalHarmonics::Variable &radii // (C, N) or (N,) (optional) ) { - Variable colors = FVDB_DISPATCH_KERNEL_DEVICE(dirs.device(), [&]() { + torch::Tensor dirs; + if (maybe_dirs.has_value()) { + dirs = maybe_dirs.value(); + } else { + dirs = sh_coeffs.dim() == 3 ? 
torch::empty({ 0, 3 }) : torch::empty({ 0, 0, 3 }); + } + Variable colors = FVDB_DISPATCH_KERNEL_DEVICE(sh_coeffs.device(), [&]() { return ops::dispatchSphericalHarmonicsForward(sh_degree_to_use, dirs, sh_coeffs, radii); }); - ctx->save_for_backward({ dirs, sh_coeffs, radii }); ctx->saved_data["sh_degree_to_use"] = (int64_t)sh_degree_to_use; return { colors }; @@ -45,7 +50,7 @@ SphericalHarmonics::backward(SphericalHarmonics::AutogradContext *ctx, const int sh_degree_to_use = (int)ctx->saved_data["sh_degree_to_use"].toInt(); const bool compute_v_dirs = ctx->needs_input_grad(1); - auto variables = FVDB_DISPATCH_KERNEL_DEVICE(dirs.device(), [&]() { + auto variables = FVDB_DISPATCH_KERNEL_DEVICE(sh_coeffs.device(), [&]() { return ops::dispatchSphericalHarmonicsBackward(sh_degree_to_use, dirs, sh_coeffs, v_colors, radii, compute_v_dirs); }); @@ -167,17 +172,14 @@ GaussianFullyFusedProjection::backward(GaussianFullyFusedProjection::AutogradCon GaussianRasterizeToPixels::variable_list GaussianRasterizeToPixels::forward( GaussianRasterizeToPixels::AutogradContext *ctx, - // Gaussian parameters - const GaussianRasterizeToPixels::Variable &means2d, // [C, N, 2] - const GaussianRasterizeToPixels::Variable &conics, // [C, N, 3] - const GaussianRasterizeToPixels::Variable &colors, // [C, N, 3] - const GaussianRasterizeToPixels::Variable &opacities, // [N] - // image size + const GaussianRasterizeToPixels::Variable &means2d, // [C, N, 2] + const GaussianRasterizeToPixels::Variable &conics, // [C, N, 3] + const GaussianRasterizeToPixels::Variable &colors, // [C, N, 3] + const GaussianRasterizeToPixels::Variable &opacities, // [N] const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, const uint32_t image_origin_h, const uint32_t tile_size, - // intersections - const GaussianRasterizeToPixels::Variable &tile_offsets, // [C, tile_height, tile_width] - const GaussianRasterizeToPixels::Variable &flatten_ids, // [n_isects] + const GaussianRasterizeToPixels::Variable &tile_offsets, // [C, tile_height, tile_width] + const GaussianRasterizeToPixels::Variable &tile_gaussian_ids, // [n_isects] const bool absgrad) { // const int C = means2d.size(0); // const int N = means2d.size(1); @@ -185,14 +187,14 @@ GaussianRasterizeToPixels::forward( auto variables = FVDB_DISPATCH_KERNEL_DEVICE(means2d.device(), [&]() { return ops::dispatchGaussianRasterizeForward( means2d, conics, colors, opacities, image_width, image_height, image_origin_w, - image_origin_h, tile_size, tile_offsets, flatten_ids); + image_origin_h, tile_size, tile_offsets, tile_gaussian_ids); }); Variable render_colors = std::get<0>(variables); Variable render_alphas = std::get<1>(variables); Variable last_ids = std::get<2>(variables); - ctx->save_for_backward( - { means2d, conics, colors, opacities, tile_offsets, flatten_ids, render_alphas, last_ids }); + ctx->save_for_backward({ means2d, conics, colors, opacities, tile_offsets, tile_gaussian_ids, + render_alphas, last_ids }); ctx->saved_data["image_width"] = (int64_t)image_width; ctx->saved_data["image_height"] = (int64_t)image_height; ctx->saved_data["tile_size"] = (int64_t)tile_size; @@ -215,15 +217,15 @@ GaussianRasterizeToPixels::backward(GaussianRasterizeToPixels::AutogradContext * if (v_render_alphas.defined()) v_render_alphas = v_render_alphas.contiguous(); - variable_list saved = ctx->get_saved_variables(); - Variable means2d = saved.at(0); - Variable conics = saved.at(1); - Variable colors = saved.at(2); - Variable opacities = saved.at(3); - Variable tile_offsets 
= saved.at(4); - Variable flatten_ids = saved.at(5); - Variable render_alphas = saved.at(6); - Variable last_ids = saved.at(7); + variable_list saved = ctx->get_saved_variables(); + Variable means2d = saved.at(0); + Variable conics = saved.at(1); + Variable colors = saved.at(2); + Variable opacities = saved.at(3); + Variable tile_offsets = saved.at(4); + Variable tile_gaussian_ids = saved.at(5); + Variable render_alphas = saved.at(6); + Variable last_ids = saved.at(7); const int image_width = (int)ctx->saved_data["image_width"].toInt(); const int image_height = (int)ctx->saved_data["image_height"].toInt(); @@ -235,7 +237,7 @@ GaussianRasterizeToPixels::backward(GaussianRasterizeToPixels::AutogradContext * auto variables = FVDB_DISPATCH_KERNEL_DEVICE(means2d.device(), [&]() { return ops::dispatchGaussianRasterizeBackward( means2d, conics, colors, opacities, image_width, image_height, image_origin_w, - image_origin_h, tile_size, tile_offsets, flatten_ids, render_alphas, last_ids, + image_origin_h, tile_size, tile_offsets, tile_gaussian_ids, render_alphas, last_ids, v_render_colors, v_render_alphas, absgrad); }); Variable v_means2d_abs; diff --git a/fvdb/src/detail/autograd/GaussianRender.h b/fvdb/src/detail/autograd/GaussianRender.h index c18757e8cb..50297b60aa 100644 --- a/fvdb/src/detail/autograd/GaussianRender.h +++ b/fvdb/src/detail/autograd/GaussianRender.h @@ -4,6 +4,7 @@ #ifndef FVDB_DETAIL_AUTOGRAD_GAUSSIANRENDER_H #define FVDB_DETAIL_AUTOGRAD_GAUSSIANRENDER_H +#include #include namespace fvdb { @@ -16,9 +17,9 @@ struct SphericalHarmonics : public torch::autograd::Function using Variable = torch::autograd::Variable; static variable_list forward(AutogradContext *ctx, const int sh_degree_to_use, - const Variable &dirs, // (N, 3) - const Variable &sh_coeffs, // (N, K, 3) - const Variable &radii // (N,) + const torch::optional dirs, // [N, 3] or empty for deg 0 + const Variable &sh_coeffs, // [K, N, 3] + const Variable &radii // [N,] ); static variable_list backward(AutogradContext *ctx, variable_list grad_output); diff --git a/fvdb/src/detail/autograd/JaggedReduce.h b/fvdb/src/detail/autograd/JaggedReduce.h index b364c8e70d..e5dfc36338 100644 --- a/fvdb/src/detail/autograd/JaggedReduce.h +++ b/fvdb/src/detail/autograd/JaggedReduce.h @@ -4,7 +4,7 @@ #ifndef FVDB_DETAIL_AUTOGRAD_JAGGEDREDUCE_H #define FVDB_DETAIL_AUTOGRAD_JAGGEDREDUCE_H -#include "detail/GridBatchImpl.h" +#include #include diff --git a/fvdb/src/detail/autograd/ReadIntoDense.h b/fvdb/src/detail/autograd/ReadIntoDense.h index 5d8c0d6f78..fe620a7f33 100644 --- a/fvdb/src/detail/autograd/ReadIntoDense.h +++ b/fvdb/src/detail/autograd/ReadIntoDense.h @@ -4,11 +4,10 @@ #ifndef FVDB_DETAIL_AUTOGRAD_READINTODENSE_H #define FVDB_DETAIL_AUTOGRAD_READINTODENSE_H -#include - -#include "detail/GridBatchImpl.h" +#include +#include -#include "Types.h" +#include namespace fvdb { namespace detail { diff --git a/fvdb/src/detail/autograd/SampleGrid.h b/fvdb/src/detail/autograd/SampleGrid.h index 8eaf7ad558..6c9ce39507 100644 --- a/fvdb/src/detail/autograd/SampleGrid.h +++ b/fvdb/src/detail/autograd/SampleGrid.h @@ -4,9 +4,9 @@ #ifndef FVDB_DETAIL_AUTOGRAD_SAMPLEGRID_H #define FVDB_DETAIL_AUTOGRAD_SAMPLEGRID_H -#include +#include -#include "detail/GridBatchImpl.h" +#include namespace fvdb { namespace detail { diff --git a/fvdb/src/detail/autograd/SplatIntoGrid.h b/fvdb/src/detail/autograd/SplatIntoGrid.h index 50cac190d7..e066b73e43 100644 --- a/fvdb/src/detail/autograd/SplatIntoGrid.h +++ b/fvdb/src/detail/autograd/SplatIntoGrid.h @@ -4,9 
+4,9 @@ #ifndef FVDB_DETAIL_AUTOGRAD_SPLATINTOGRID_H #define FVDB_DETAIL_AUTOGRAD_SPLATINTOGRID_H -#include +#include -#include "detail/GridBatchImpl.h" +#include namespace fvdb { namespace detail { diff --git a/fvdb/src/detail/autograd/TransformPoints.h b/fvdb/src/detail/autograd/TransformPoints.h index 22d870809a..5a15450666 100644 --- a/fvdb/src/detail/autograd/TransformPoints.h +++ b/fvdb/src/detail/autograd/TransformPoints.h @@ -4,9 +4,9 @@ #ifndef FVDB_DETAIL_AUTOGRAD_TRANSFORMPOINTS_H #define FVDB_DETAIL_AUTOGRAD_TRANSFORMPOINTS_H -#include +#include -#include "detail/GridBatchImpl.h" +#include namespace fvdb { namespace detail { diff --git a/fvdb/src/detail/autograd/UpsampleGrid.h b/fvdb/src/detail/autograd/UpsampleGrid.h index a20f728f03..2325b02fb2 100644 --- a/fvdb/src/detail/autograd/UpsampleGrid.h +++ b/fvdb/src/detail/autograd/UpsampleGrid.h @@ -4,9 +4,9 @@ #ifndef FVDB_DETAIL_AUTOGRAD_UPSAMPLEGRID_H #define FVDB_DETAIL_AUTOGRAD_UPSAMPLEGRID_H -#include +#include -#include "detail/GridBatchImpl.h" +#include namespace fvdb { namespace detail { diff --git a/fvdb/src/detail/build/CoarseFromFine.cpp b/fvdb/src/detail/build/CoarseFromFine.cpp index 75768379eb..cf1b967253 100644 --- a/fvdb/src/detail/build/CoarseFromFine.cpp +++ b/fvdb/src/detail/build/CoarseFromFine.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Build.h" - #include #include diff --git a/fvdb/src/detail/build/ConvGrid.cpp b/fvdb/src/detail/build/ConvGrid.cpp index e44188270b..653bbbf075 100644 --- a/fvdb/src/detail/build/ConvGrid.cpp +++ b/fvdb/src/detail/build/ConvGrid.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Build.h" - #include #include diff --git a/fvdb/src/detail/build/DenseGrid.cpp b/fvdb/src/detail/build/DenseGrid.cpp index 192a7c53f7..ba536bbc30 100644 --- a/fvdb/src/detail/build/DenseGrid.cpp +++ b/fvdb/src/detail/build/DenseGrid.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Build.h" - #include #include diff --git a/fvdb/src/detail/build/EmptyGrid.cpp b/fvdb/src/detail/build/EmptyGrid.cpp index 8d0f82d7a2..13d3c6bfe9 100644 --- a/fvdb/src/detail/build/EmptyGrid.cpp +++ b/fvdb/src/detail/build/EmptyGrid.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Build.h" - #include #include diff --git a/fvdb/src/detail/build/FineFromCoarse.cpp b/fvdb/src/detail/build/FineFromCoarse.cpp index 6c62ae761d..5a40af71a2 100644 --- a/fvdb/src/detail/build/FineFromCoarse.cpp +++ b/fvdb/src/detail/build/FineFromCoarse.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Build.h" - #include #include diff --git a/fvdb/src/detail/build/FromMesh.cpp b/fvdb/src/detail/build/FromMesh.cpp index 1ae906c339..a032cb85da 100644 --- a/fvdb/src/detail/build/FromMesh.cpp +++ b/fvdb/src/detail/build/FromMesh.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Build.h" - #include #include diff --git a/fvdb/src/detail/build/NearestNeighborGridFromPoints.cpp b/fvdb/src/detail/build/NearestNeighborGridFromPoints.cpp index aaf9e611bb..34a7b17691 100644 --- a/fvdb/src/detail/build/NearestNeighborGridFromPoints.cpp +++ b/fvdb/src/detail/build/NearestNeighborGridFromPoints.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Build.h" - #include #include diff --git a/fvdb/src/detail/build/PaddedGridFromCoords.cpp b/fvdb/src/detail/build/PaddedGridFromCoords.cpp index 3e7a02d4d7..b78f36c474 100644 --- a/fvdb/src/detail/build/PaddedGridFromCoords.cpp +++ 
b/fvdb/src/detail/build/PaddedGridFromCoords.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Build.h" - #include #include diff --git a/fvdb/src/detail/build/PaddedGridFromGrid.cpp b/fvdb/src/detail/build/PaddedGridFromGrid.cpp index cba219d532..1fe28aa1ec 100644 --- a/fvdb/src/detail/build/PaddedGridFromGrid.cpp +++ b/fvdb/src/detail/build/PaddedGridFromGrid.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Build.h" - #include #include diff --git a/fvdb/src/detail/build/PaddedGridFromPoints.cpp b/fvdb/src/detail/build/PaddedGridFromPoints.cpp index b6e7414700..149a770944 100644 --- a/fvdb/src/detail/build/PaddedGridFromPoints.cpp +++ b/fvdb/src/detail/build/PaddedGridFromPoints.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Build.h" - #include #include diff --git a/fvdb/src/detail/io/IO.h b/fvdb/src/detail/io/IO.h index a4f5c2b6b0..1228a1a29b 100644 --- a/fvdb/src/detail/io/IO.h +++ b/fvdb/src/detail/io/IO.h @@ -8,6 +8,7 @@ #include #include + #include namespace fvdb { diff --git a/fvdb/src/detail/io/LoadNanovdb.cpp b/fvdb/src/detail/io/LoadNanovdb.cpp index 9113c3ea3d..6df7cc01d3 100644 --- a/fvdb/src/detail/io/LoadNanovdb.cpp +++ b/fvdb/src/detail/io/LoadNanovdb.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "IO.h" - #include #include #include diff --git a/fvdb/src/detail/io/SaveNanoVDB.cpp b/fvdb/src/detail/io/SaveNanoVDB.cpp index 2244a3a002..24f92c39a1 100644 --- a/fvdb/src/detail/io/SaveNanoVDB.cpp +++ b/fvdb/src/detail/io/SaveNanoVDB.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "IO.h" - #include #include @@ -10,9 +9,8 @@ #include #include -#include - #include +#include #include #include diff --git a/fvdb/src/detail/ops/BuildDeviceGrid.cu b/fvdb/src/detail/ops/BuildDeviceGrid.cu index 9fb5578609..d3f88bfd8b 100644 --- a/fvdb/src/detail/ops/BuildDeviceGrid.cu +++ b/fvdb/src/detail/ops/BuildDeviceGrid.cu @@ -1,11 +1,10 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: Apache-2.0 // -#include -#include - #include #include +#include +#include #include diff --git a/fvdb/src/detail/ops/JCat0.cu b/fvdb/src/detail/ops/JCat0.cu index c9dd2e6995..f586945e82 100644 --- a/fvdb/src/detail/ops/JCat0.cu +++ b/fvdb/src/detail/ops/JCat0.cu @@ -2,11 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Ops.h" - #include #include #include + #include #include diff --git a/fvdb/src/detail/ops/JOffsetsFromJIdx.cu b/fvdb/src/detail/ops/JOffsetsFromJIdx.cu index e46093d6a3..e93c12fb7c 100644 --- a/fvdb/src/detail/ops/JOffsetsFromJIdx.cu +++ b/fvdb/src/detail/ops/JOffsetsFromJIdx.cu @@ -2,10 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Ops.h" - #include #include + #include namespace fvdb { diff --git a/fvdb/src/detail/ops/JaggedTensorIndex.cu b/fvdb/src/detail/ops/JaggedTensorIndex.cu index 7e25962bc9..af5a21a0e6 100644 --- a/fvdb/src/detail/ops/JaggedTensorIndex.cu +++ b/fvdb/src/detail/ops/JaggedTensorIndex.cu @@ -5,6 +5,7 @@ #include #include + #include #include diff --git a/fvdb/src/detail/ops/PaddedIJKForMesh.cu b/fvdb/src/detail/ops/PaddedIJKForMesh.cu index c99ee8fecd..9febd62bb0 100644 --- a/fvdb/src/detail/ops/PaddedIJKForMesh.cu +++ b/fvdb/src/detail/ops/PaddedIJKForMesh.cu @@ -2,8 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // #include "Ops.h" - -#include "detail/utils/cuda/Utils.cuh" +#include #include diff --git a/fvdb/src/detail/ops/VolumeRender.cu b/fvdb/src/detail/ops/VolumeRender.cu index 767377cf88..1746e899fa 100644 
--- a/fvdb/src/detail/ops/VolumeRender.cu +++ b/fvdb/src/detail/ops/VolumeRender.cu @@ -1,15 +1,15 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: Apache-2.0 // -#include "detail/utils/cuda/Utils.cuh" - -#include -#include +#include #include #include #include +#include +#include + namespace fvdb { namespace detail { namespace ops { diff --git a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionCutlass.cu b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionCutlass.cu index a0cb036e3a..8a6bd7b47c 100644 --- a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionCutlass.cu +++ b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionCutlass.cu @@ -2,18 +2,15 @@ // SPDX-License-Identifier: Apache-2.0 // #include "ConvOps.h" - #include -// NOTE: Getting an error about duplicate definitions of `copy_if` if cute/tenosr.hpp is included -// after other cute headers -#include +#include +#include +#include #include #include - -#include -#include +#include #include diff --git a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionHalo.cu b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionHalo.cu index 3d00244a74..2c3cc486e1 100644 --- a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionHalo.cu +++ b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionHalo.cu @@ -2,11 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // #include "ConvOps.h" - #include #include #include + #include #define COALESCED_MEMORY_ACCESS_VARIANT diff --git a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionHaloGrad.cu b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionHaloGrad.cu index 0c87be657e..308b26a540 100644 --- a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionHaloGrad.cu +++ b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionHaloGrad.cu @@ -2,11 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // #include "ConvOps.h" - #include #include #include + #include #define COALESCED_MEMORY_ACCESS_VARIANT diff --git a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMM.cu b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMM.cu index 9e7e6b8e60..78a8558eb7 100644 --- a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMM.cu +++ b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMM.cu @@ -2,12 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // #include "ConvOps.h" - #include -#include #include +#include + namespace fvdb { namespace detail { namespace ops { diff --git a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMMGrad.cu b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMMGrad.cu index c0921be6d1..a144a11a37 100644 --- a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMMGrad.cu +++ b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMMGrad.cu @@ -2,12 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // #include "ConvOps.h" - #include -#include #include +#include + namespace fvdb { namespace detail { namespace ops { diff --git a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMMGradSorted.cu b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMMGradSorted.cu index e7c3a01ded..e85686b3f3 100644 --- a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMMGradSorted.cu +++ b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMMGradSorted.cu @@ -2,12 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // #include 
"ConvOps.h" - #include -#include #include +#include + namespace fvdb { namespace detail { namespace ops { diff --git a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMMSorted.cu b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMMSorted.cu index f024bd76ac..977f462316 100644 --- a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMMSorted.cu +++ b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionImplicitGEMMSorted.cu @@ -2,12 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // #include "ConvOps.h" - #include -#include #include +#include + namespace fvdb { namespace detail { namespace ops { diff --git a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionKernelMap.cu b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionKernelMap.cu index 4c303ebd9c..2353292715 100644 --- a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionKernelMap.cu +++ b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionKernelMap.cu @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "ConvOps.h" - #include #include diff --git a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionLggs.cu b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionLggs.cu index 8512afed0b..fc0140d2a4 100644 --- a/fvdb/src/detail/ops/convolution/backend/SparseConvolutionLggs.cu +++ b/fvdb/src/detail/ops/convolution/backend/SparseConvolutionLggs.cu @@ -2,17 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // #include "ConvOps.h" - #include #include -// NOTE: Getting an error about duplicate definitions of `copy_if` if cute/tenosr.hpp is included -// after other cute headers -#include - +#include #include #include +#include namespace fvdb { namespace detail { diff --git a/fvdb/src/detail/ops/convolution/pack_info/BrickHaloBuffer.cu b/fvdb/src/detail/ops/convolution/pack_info/BrickHaloBuffer.cu index af95aae16d..9d26274028 100644 --- a/fvdb/src/detail/ops/convolution/pack_info/BrickHaloBuffer.cu +++ b/fvdb/src/detail/ops/convolution/pack_info/BrickHaloBuffer.cu @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "PackInfoOps.h" - #include #include @@ -10,6 +9,7 @@ #include #include #include + #include namespace fvdb { diff --git a/fvdb/src/detail/ops/convolution/pack_info/ConvolutionKernelMap.cu b/fvdb/src/detail/ops/convolution/pack_info/ConvolutionKernelMap.cu index 618c418c5d..dbc6bdd0e2 100644 --- a/fvdb/src/detail/ops/convolution/pack_info/ConvolutionKernelMap.cu +++ b/fvdb/src/detail/ops/convolution/pack_info/ConvolutionKernelMap.cu @@ -2,9 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // #include "PackInfoOps.h" - -#include "detail/utils/cuda/Utils.cuh" -#include "detail/utils/nanovdb/CustomAccessors.h" +#include +#include #include #include diff --git a/fvdb/src/detail/ops/convolution/pack_info/IGEMMBitOperations.cu b/fvdb/src/detail/ops/convolution/pack_info/IGEMMBitOperations.cu index e76b9c50be..37787c4a7a 100644 --- a/fvdb/src/detail/ops/convolution/pack_info/IGEMMBitOperations.cu +++ b/fvdb/src/detail/ops/convolution/pack_info/IGEMMBitOperations.cu @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "PackInfoOps.h" - #include #include diff --git a/fvdb/src/detail/ops/gsplat/GaussianFullyFusedProjection.cu b/fvdb/src/detail/ops/gsplat/GaussianFullyFusedProjection.cu index 6b428d4239..dd3ca6cafa 100644 --- a/fvdb/src/detail/ops/gsplat/GaussianFullyFusedProjection.cu +++ b/fvdb/src/detail/ops/gsplat/GaussianFullyFusedProjection.cu @@ -2,10 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 // #include 
"GsplatUtils.cuh" - #include #include + #include constexpr int NUM_THREADS = 256; diff --git a/fvdb/src/detail/ops/gsplat/GaussianFullyFusedProjectionJagged.cu b/fvdb/src/detail/ops/gsplat/GaussianFullyFusedProjectionJagged.cu index 19b5239ac0..0e38e019da 100644 --- a/fvdb/src/detail/ops/gsplat/GaussianFullyFusedProjectionJagged.cu +++ b/fvdb/src/detail/ops/gsplat/GaussianFullyFusedProjectionJagged.cu @@ -2,10 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 // #include "GsplatUtils.cuh" - #include #include + #include constexpr int NUM_THREADS = 256; diff --git a/fvdb/src/detail/ops/gsplat/GaussianRasterizeBackward.cu b/fvdb/src/detail/ops/gsplat/GaussianRasterizeBackward.cu new file mode 100644 index 0000000000..61b4da96cb --- /dev/null +++ b/fvdb/src/detail/ops/gsplat/GaussianRasterizeBackward.cu @@ -0,0 +1,548 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#include "GsplatTypes.cuh" +#include "VectorTypes.cuh" +#include + +#include + +#include + +#include + +namespace fvdb { +namespace detail { +namespace ops { + +namespace cg = cooperative_groups; + +/**************************************************************************** + * Rasterization to Pixels Backward Pass + ****************************************************************************/ + +template +__global__ void +rasterize_to_pixels_bwd_kernel( + const uint32_t C, const uint32_t N, const uint32_t n_isects, const bool packed, + // fwd inputs + const vec2 *__restrict__ means2d, // [C, N, 2] or [nnz, 2] + const vec3 *__restrict__ conics, // [C, N, 3] or [nnz, 3] + // const S *__restrict__ colors, // [C, N, COLOR_DIM] or [nnz, COLOR_DIM] + torch::PackedTensorAccessor64 + colors, // [C, N, COLOR_DIM] or [nnz, COLOR_DIM] + const S *__restrict__ opacities, // [C, N] or [nnz] + const S *__restrict__ backgrounds, // [C, COLOR_DIM] or [nnz, COLOR_DIM] + const bool *__restrict__ masks, // [C, tile_height, tile_width] + const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, + const uint32_t image_origin_h, const uint32_t tile_origin_w, const uint32_t tile_origin_h, + const uint32_t tile_size, const uint32_t tile_width, const uint32_t tile_height, + const int32_t *__restrict__ tile_offsets, // [C, tile_height, tile_width] + const int32_t *__restrict__ flatten_ids, // [n_isects] + // fwd outputs + const S *__restrict__ render_alphas, // [C, image_height, image_width, 1] + const int32_t *__restrict__ last_ids, // [C, image_height, image_width] + // grad outputs + const S *__restrict__ v_render_colors, // [C, image_height, image_width, + // COLOR_DIM] + const S *__restrict__ v_render_alphas, // [C, image_height, image_width, 1] + // grad inputs + vec2 *__restrict__ v_means2d_abs, // [C, N, 2] or [nnz, 2] + vec2 *__restrict__ v_means2d, // [C, N, 2] or [nnz, 2] + vec3 *__restrict__ v_conics, // [C, N, 3] or [nnz, 3] + S *__restrict__ v_colors, // [C, N, COLOR_DIM] or [nnz, COLOR_DIM] + S *__restrict__ v_opacities // [C, N] or [nnz] +) { + auto block = cg::this_thread_block(); + uint32_t camera_id = block.group_index().x; + + // blockIdx runs from [0, num_tiles_h] x [0, num_tiles_w] + const int32_t tile_id = (block.group_index().y + tile_origin_h) * tile_width + + block.group_index().z + tile_origin_w; + // Pixel coordinates run from [0, height] x [0, width] + const uint32_t i = block.group_index().y * tile_size + block.thread_index().y; + const uint32_t j = block.group_index().z * tile_size + block.thread_index().x; + + tile_offsets += camera_id * 
tile_height * tile_width; + render_alphas += camera_id * image_height * image_width; + last_ids += camera_id * image_height * image_width; + v_render_colors += camera_id * image_height * image_width * COLOR_DIM; + v_render_alphas += camera_id * image_height * image_width; + if (backgrounds != nullptr) { + backgrounds += camera_id * COLOR_DIM; + } + if (masks != nullptr) { + masks += camera_id * tile_height * tile_width; + } + + // when the mask is provided, do nothing and return if + // this tile is labeled as False + if (masks != nullptr && !masks[tile_id]) { + return; + } + + const S px = (S)(j + image_origin_w) + 0.5f; + const S py = (S)(i + image_origin_h) + 0.5f; + + // clamp this value to the last pixel + const int32_t pix_id = min(i * image_width + j, image_width * image_height - 1); + + // keep not rasterizing threads around for reading data + const bool inside = (i < image_height && j < image_width); + + // have all threads in tile process the same gaussians in batches + // first collect gaussians between range.x and range.y in batches + // which gaussians to look through in this tile + int32_t range_start = tile_offsets[tile_id]; + int32_t range_end = (camera_id == C - 1) && (tile_id == tile_width * tile_height - 1) + ? n_isects + : tile_offsets[tile_id + 1]; + const uint32_t block_size = block.size(); + const uint32_t num_batches = (range_end - range_start + block_size - 1) / block_size; + + extern __shared__ int s[]; + int32_t *id_batch = (int32_t *)s; // [block_size] + vec3 *xy_opacity_batch = + reinterpret_cast *>(&id_batch[block_size]); // [block_size] + vec3 *conic_batch = + reinterpret_cast *>(&xy_opacity_batch[block_size]); // [block_size] + S *rgbs_batch = (S *)&conic_batch[block_size]; // [block_size * COLOR_DIM] + + // this is the T AFTER the last gaussian in this pixel + S T_final = 1.0f - render_alphas[pix_id]; + S T = T_final; + // the contribution from gaussians behind the current one + S buffer[COLOR_DIM] = { 0.f }; + // index of last gaussian to contribute to this pixel + const int32_t bin_final = inside ? 
last_ids[pix_id] : 0; + + // df/d_out for this pixel + S v_render_c[COLOR_DIM]; + GSPLAT_PRAGMA_UNROLL + for (uint32_t k = 0; k < COLOR_DIM; ++k) { + v_render_c[k] = v_render_colors[pix_id * COLOR_DIM + k]; + } + const S v_render_a = v_render_alphas[pix_id]; + + // collect and process batches of gaussians + // each thread loads one gaussian at a time before rasterizing + const uint32_t tr = block.thread_rank(); + cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); + const int32_t warp_bin_final = cg::reduce(warp, bin_final, cg::greater()); + for (uint32_t b = 0; b < num_batches; ++b) { + // resync all threads before writing next batch of shared mem + block.sync(); + + // each thread fetch 1 gaussian from back to front + // 0 index will be furthest back in batch + // index of gaussian to load + // batch end is the index of the last gaussian in the batch + // These values can be negative so must be int32 instead of uint32 + const int32_t batch_end = range_end - 1 - block_size * b; + const int32_t batch_size = min(block_size, batch_end + 1 - range_start); + const int32_t idx = batch_end - tr; + if (idx >= range_start) { + int32_t g = flatten_ids[idx]; // flatten index in [C * N] or [nnz] + id_batch[tr] = g; + const vec2 xy = means2d[g]; + const S opac = opacities[g]; + xy_opacity_batch[tr] = { xy.x, xy.y, opac }; + conic_batch[tr] = conics[g]; + if constexpr (N_OUTER_DIMS == 2) { + // colors: [C, N, COLOR_DIM] + // colors[c, n, k] = [c * N * COLOR_DIM + n * COLOR_DIM + k] + // g = c * N + n + const int32_t cid = g / N; + const int32_t gid = g % N; + const S *c_ptr = colors[cid][gid].data(); + GSPLAT_PRAGMA_UNROLL + for (uint32_t k = 0; k < COLOR_DIM; ++k) { + rgbs_batch[tr * COLOR_DIM + k] = c_ptr[k]; + } + } else { + const S *c_ptr = colors[g].data(); // + g * COLOR_DIM; + GSPLAT_PRAGMA_UNROLL + for (uint32_t k = 0; k < COLOR_DIM; ++k) { + rgbs_batch[tr * COLOR_DIM + k] = c_ptr[k]; + } + } + } + // wait for other threads to collect the gaussians in batch + block.sync(); + // process gaussians in the current batch for this pixel + // 0 index is the furthest back gaussian in the batch + for (uint32_t t = max(0, batch_end - warp_bin_final); t < batch_size; ++t) { + bool valid = inside; + if (batch_end - t > bin_final) { + valid = 0; + } + S alpha; + S opac; + vec2 delta; + vec3 conic; + S vis; + + if (valid) { + conic = conic_batch[t]; + vec3 xy_opac = xy_opacity_batch[t]; + opac = xy_opac.z; + delta = { xy_opac.x - px, xy_opac.y - py }; + S sigma = 0.5f * (conic.x * delta.x * delta.x + conic.z * delta.y * delta.y) + + conic.y * delta.x * delta.y; + vis = __expf(-sigma); + alpha = min(0.999f, opac * vis); + if (sigma < 0.f || alpha < 1.f / 255.f) { + valid = false; + } + } + + // if all threads are inactive in this warp, skip this loop + if (!warp.any(valid)) { + continue; + } + S v_rgb_local[COLOR_DIM] = { 0.f }; + vec3 v_conic_local = { 0.f, 0.f, 0.f }; + vec2 v_xy_local = { 0.f, 0.f }; + vec2 v_xy_abs_local = { 0.f, 0.f }; + S v_opacity_local = 0.f; + // initialize everything to 0, only set if the lane is valid + if (valid) { + // compute the current T for this gaussian + S ra = 1.0f / (1.0f - alpha); + T *= ra; + // update v_rgb for this gaussian + const S fac = alpha * T; + GSPLAT_PRAGMA_UNROLL + for (uint32_t k = 0; k < COLOR_DIM; ++k) { + v_rgb_local[k] = fac * v_render_c[k]; + } + // contribution from this pixel + S v_alpha = 0.f; + for (uint32_t k = 0; k < COLOR_DIM; ++k) { + v_alpha += (rgbs_batch[t * COLOR_DIM + k] * T - buffer[k] * ra) * v_render_c[k]; + } + + v_alpha 
+= T_final * ra * v_render_a; + // contribution from background pixel + if (backgrounds != nullptr) { + S accum = 0.f; + GSPLAT_PRAGMA_UNROLL + for (uint32_t k = 0; k < COLOR_DIM; ++k) { + accum += backgrounds[k] * v_render_c[k]; + } + v_alpha += -T_final * ra * accum; + } + + if (opac * vis <= 0.999f) { + const S v_sigma = -opac * vis * v_alpha; + v_conic_local = { 0.5f * v_sigma * delta.x * delta.x, + v_sigma * delta.x * delta.y, + 0.5f * v_sigma * delta.y * delta.y }; + v_xy_local = { v_sigma * (conic.x * delta.x + conic.y * delta.y), + v_sigma * (conic.y * delta.x + conic.z * delta.y) }; + if (v_means2d_abs != nullptr) { + v_xy_abs_local = { abs(v_xy_local.x), abs(v_xy_local.y) }; + } + v_opacity_local = vis * v_alpha; + } + + GSPLAT_PRAGMA_UNROLL + for (uint32_t k = 0; k < COLOR_DIM; ++k) { + buffer[k] += rgbs_batch[t * COLOR_DIM + k] * fac; + } + } + warpSum(v_rgb_local, warp); + warpSum(v_conic_local, warp); + warpSum(v_xy_local, warp); + if (v_means2d_abs != nullptr) { + warpSum(v_xy_abs_local, warp); + } + warpSum(v_opacity_local, warp); + if (warp.thread_rank() == 0) { + int32_t g = id_batch[t]; // flatten index in [C * N] or [nnz] + S *v_rgb_ptr = (S *)(v_colors) + COLOR_DIM * g; + GSPLAT_PRAGMA_UNROLL + for (uint32_t k = 0; k < COLOR_DIM; ++k) { + gpuAtomicAdd(v_rgb_ptr + k, v_rgb_local[k]); + } + + S *v_conic_ptr = (S *)(v_conics) + 3 * g; + gpuAtomicAdd(v_conic_ptr, v_conic_local.x); + gpuAtomicAdd(v_conic_ptr + 1, v_conic_local.y); + gpuAtomicAdd(v_conic_ptr + 2, v_conic_local.z); + + S *v_xy_ptr = (S *)(v_means2d) + 2 * g; + gpuAtomicAdd(v_xy_ptr, v_xy_local.x); + gpuAtomicAdd(v_xy_ptr + 1, v_xy_local.y); + + if (v_means2d_abs != nullptr) { + S *v_xy_abs_ptr = (S *)(v_means2d_abs) + 2 * g; + gpuAtomicAdd(v_xy_abs_ptr, v_xy_abs_local.x); + gpuAtomicAdd(v_xy_abs_ptr + 1, v_xy_abs_local.y); + } + + gpuAtomicAdd(v_opacities + g, v_opacity_local); + } + } + } +} + +template +std::tuple +call_bwd_kernel_with_dim( + // Gaussian parameters + const torch::Tensor &means2d, // [C, N, 2] or [nnz, 2] + const torch::Tensor &conics, // [C, N, 3] or [nnz, 3] + const torch::Tensor &colors, // [C, N, 3] or [nnz, 3] + const torch::Tensor &opacities, // [C, N] or [nnz] + const at::optional &backgrounds, // [C, 3] + const at::optional &masks, // [C, tile_height, tile_width] + // image size + const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, + const uint32_t image_origin_h, const uint32_t tile_size, + // intersections + const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] + const torch::Tensor &flatten_ids, // [n_isects] + // forward outputs + const torch::Tensor &render_alphas, // [C, image_height, image_width, 1] + const torch::Tensor &last_ids, // [C, image_height, image_width] + // gradients of outputs + const torch::Tensor &v_render_colors, // [C, image_height, image_width, 3] + const torch::Tensor &v_render_alphas, // [C, image_height, image_width, 1] + // options + bool absgrad) { + GSPLAT_DEVICE_GUARD(means2d); + GSPLAT_CHECK_INPUT(means2d); + GSPLAT_CHECK_INPUT(conics); + GSPLAT_CHECK_CUDA(colors); + GSPLAT_CHECK_INPUT(opacities); + GSPLAT_CHECK_INPUT(tile_offsets); + GSPLAT_CHECK_INPUT(flatten_ids); + GSPLAT_CHECK_INPUT(render_alphas); + GSPLAT_CHECK_INPUT(last_ids); + GSPLAT_CHECK_INPUT(v_render_colors); + GSPLAT_CHECK_INPUT(v_render_alphas); + if (backgrounds.has_value()) { + GSPLAT_CHECK_INPUT(backgrounds.value()); + } + if (masks.has_value()) { + GSPLAT_CHECK_INPUT(masks.value()); + } + + bool packed = means2d.dim() == 2; + + 
uint32_t C = tile_offsets.size(0); // number of cameras + uint32_t N = packed ? 0 : means2d.size(1); // number of gaussians + uint32_t n_isects = flatten_ids.size(0); + uint32_t COLOR_DIM = colors.size(-1); + uint32_t tile_height = tile_offsets.size(1); + uint32_t tile_width = tile_offsets.size(2); + + const uint32_t tile_origin_w = image_origin_w / tile_size; + const uint32_t tile_origin_h = image_origin_h / tile_size; + const uint32_t tile_extent_w = (image_width + tile_size - 1) / tile_size; + const uint32_t tile_extent_h = (image_height + tile_size - 1) / tile_size; + + // std::cerr << "RASTERIZE TO PIXELS BACKWARD " << std::endl; + // std::cerr << " BLOCKS = (" << C << ", " << tile_extent_h << ", " << tile_extent_w << ")" + // << std::endl; + // std::cerr << " THREADS = (" << tile_size << ", " << tile_size << ". " << 1 << ")" << + // std::endl; std::cerr << " TILE WIDTH = " << tile_width << ", TILE HEIGHT = " << tile_height + // << std::endl; + + // Each block covers a tile on the image. In total there are + // C * tile_height * tile_width blocks. + dim3 threads = { tile_size, tile_size, 1 }; + dim3 blocks = { C, tile_extent_h, tile_extent_w }; + + torch::Tensor v_means2d = torch::zeros_like(means2d); + torch::Tensor v_conics = torch::zeros_like(conics); + torch::Tensor v_colors = torch::zeros_like(colors); + torch::Tensor v_opacities = torch::zeros_like(opacities); + torch::Tensor v_means2d_abs; + if (absgrad) { + v_means2d_abs = torch::zeros_like(means2d); + } + + if (n_isects) { + const uint32_t shared_mem = tile_size * tile_size * + (sizeof(int32_t) + sizeof(vec3) + sizeof(vec3) + + sizeof(float) * COLOR_DIM); + at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); + + if (packed) { + if (cudaFuncSetAttribute(rasterize_to_pixels_bwd_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_mem) != cudaSuccess) { + AT_ERROR("Failed to set maximum shared memory size (requested ", shared_mem, + " bytes), try lowering tile_size."); + } + rasterize_to_pixels_bwd_kernel<<>>( + C, N, n_isects, packed, reinterpret_cast *>(means2d.data_ptr()), + reinterpret_cast *>(conics.data_ptr()), + colors.packed_accessor64(), + opacities.data_ptr(), + backgrounds.has_value() ? backgrounds.value().data_ptr() : nullptr, + masks.has_value() ? masks.value().data_ptr() : nullptr, image_width, + image_height, image_origin_w, image_origin_h, tile_origin_w, tile_origin_h, + tile_size, tile_width, tile_height, tile_offsets.data_ptr(), + flatten_ids.data_ptr(), render_alphas.data_ptr(), + last_ids.data_ptr(), v_render_colors.data_ptr(), + v_render_alphas.data_ptr(), + absgrad ? 
reinterpret_cast *>(v_means2d_abs.data_ptr()) + : nullptr, + reinterpret_cast *>(v_means2d.data_ptr()), + reinterpret_cast *>(v_conics.data_ptr()), + v_colors.data_ptr(), v_opacities.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + // int maxshmemperblock = 0; + // cudaDeviceGetAttribute(&maxshmemperblock, cudaDevAttrMaxSharedMemoryPerBlockOptin, + // 0); std::cerr << "maximum shared mem per block is " << maxshmemperblock << std::endl; + if (cudaFuncSetAttribute(rasterize_to_pixels_bwd_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_mem) != cudaSuccess) { + AT_ERROR("Failed to set maximum shared memory size (requested ", shared_mem, + " bytes), try lowering tile_size."); + } + rasterize_to_pixels_bwd_kernel<<>>( + C, N, n_isects, packed, reinterpret_cast *>(means2d.data_ptr()), + reinterpret_cast *>(conics.data_ptr()), + colors.packed_accessor64(), + opacities.data_ptr(), + backgrounds.has_value() ? backgrounds.value().data_ptr() : nullptr, + masks.has_value() ? masks.value().data_ptr() : nullptr, image_width, + image_height, image_origin_w, image_origin_h, tile_origin_w, tile_origin_h, + tile_size, tile_width, tile_height, tile_offsets.data_ptr(), + flatten_ids.data_ptr(), render_alphas.data_ptr(), + last_ids.data_ptr(), v_render_colors.data_ptr(), + v_render_alphas.data_ptr(), + absgrad ? reinterpret_cast *>(v_means2d_abs.data_ptr()) + : nullptr, + reinterpret_cast *>(v_means2d.data_ptr()), + reinterpret_cast *>(v_conics.data_ptr()), + v_colors.data_ptr(), v_opacities.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + } + + return std::make_tuple(v_means2d_abs, v_means2d, v_conics, v_colors, v_opacities); +} + +std::tuple +rasterize_to_pixels_bwd_tensor( + // Gaussian parameters + const torch::Tensor &means2d, // [C, N, 2] or [nnz, 2] + const torch::Tensor &conics, // [C, N, 3] or [nnz, 3] + const torch::Tensor &colors, // [C, N, 3] or [nnz, 3] + const torch::Tensor &opacities, // [C, N] or [nnz] + const at::optional &backgrounds, // [C, 3] + const at::optional &masks, // [C, tile_height, tile_width] + // image size + const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, + const uint32_t image_origin_h, const uint32_t tile_size, + // intersections + const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] + const torch::Tensor &flatten_ids, // [n_isects] + // forward outputs + const torch::Tensor &render_alphas, // [C, image_height, image_width, 1] + const torch::Tensor &last_ids, // [C, image_height, image_width] + // gradients of outputs + const torch::Tensor &v_render_colors, // [C, image_height, image_width, 3] + const torch::Tensor &v_render_alphas, // [C, image_height, image_width, 1] + // options + bool absgrad) { + GSPLAT_CHECK_CUDA(colors); + uint32_t COLOR_DIM = colors.size(-1); + +#define __GS__CALL_BWD_(N) \ + case N: \ + return call_bwd_kernel_with_dim( \ + means2d, conics, colors, opacities, backgrounds, masks, image_width, image_height, \ + image_origin_w, image_origin_h, tile_size, tile_offsets, flatten_ids, render_alphas, \ + last_ids, v_render_colors, v_render_alphas, absgrad); + + switch (COLOR_DIM) { + __GS__CALL_BWD_(1) + __GS__CALL_BWD_(2) + __GS__CALL_BWD_(3) + __GS__CALL_BWD_(4) + __GS__CALL_BWD_(5) + __GS__CALL_BWD_(8) + __GS__CALL_BWD_(9) + __GS__CALL_BWD_(16) + __GS__CALL_BWD_(17) + __GS__CALL_BWD_(32) + __GS__CALL_BWD_(33) + __GS__CALL_BWD_(64) + __GS__CALL_BWD_(65) + __GS__CALL_BWD_(128) + __GS__CALL_BWD_(129) + __GS__CALL_BWD_(256) + __GS__CALL_BWD_(257) + __GS__CALL_BWD_(512) + 
__GS__CALL_BWD_(513) + default: + AT_ERROR("Unsupported number of channels: ", COLOR_DIM); + } +} + +template <> +std::tuple +dispatchGaussianRasterizeBackward( + // Gaussian parameters + const torch::Tensor &means2d, // [C, N, 2] + const torch::Tensor &conics, // [C, N, 3] + const torch::Tensor &colors, // [C, N, 3] + const torch::Tensor &opacities, // [N] + // image size + const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, + const uint32_t image_origin_h, + + const uint32_t tile_size, + // intersections + const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] + const torch::Tensor &flatten_ids, // [n_isects] + // forward outputs + const torch::Tensor &render_alphas, // [C, image_height, image_width, 1] + const torch::Tensor &last_ids, // [C, image_height, image_width] + // gradients of outputs + const torch::Tensor &v_render_colors, // [C, image_height, image_width, 3] + const torch::Tensor &v_render_alphas, // [C, image_height, image_width, 1] + // options + bool absgrad) { + return rasterize_to_pixels_bwd_tensor( + means2d, conics, colors, opacities, std::nullopt /*backgrounds*/, std::nullopt /*mask*/, + image_width, image_height, image_origin_w, image_origin_h, tile_size, tile_offsets, + flatten_ids, render_alphas, last_ids, v_render_colors, v_render_alphas, absgrad); +} + +template <> +std::tuple +dispatchGaussianRasterizeBackward( + // Gaussian parameters + const torch::Tensor &means2d, // [C, N, 2] + const torch::Tensor &conics, // [C, N, 3] + const torch::Tensor &colors, // [C, N, 3] + const torch::Tensor &opacities, // [N] + + // image size + const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, + const uint32_t image_origin_h, const uint32_t tile_size, + // intersections + const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] + const torch::Tensor &flatten_ids, // [n_isects] + // forward outputs + const torch::Tensor &render_alphas, // [C, image_height, image_width, 1] + const torch::Tensor &last_ids, // [C, image_height, image_width] + // gradients of outputs + const torch::Tensor &v_render_colors, // [C, image_height, image_width, 3] + const torch::Tensor &v_render_alphas, // [C, image_height, image_width, 1] + // options + bool absgrad) { + TORCH_CHECK_NOT_IMPLEMENTED(false, "CPU implementation not available"); +} + +} // namespace ops +} // namespace detail +} // namespace fvdb diff --git a/fvdb/src/detail/ops/gsplat/GaussianRasterizeForward.cu b/fvdb/src/detail/ops/gsplat/GaussianRasterizeForward.cu new file mode 100644 index 0000000000..03ef9be0f0 --- /dev/null +++ b/fvdb/src/detail/ops/gsplat/GaussianRasterizeForward.cu @@ -0,0 +1,403 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#include "VectorTypes.cuh" +#include + +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) +#define PRAGMA_UNROLL _Pragma("unroll") + +namespace fvdb { +namespace detail { +namespace ops { + +/**************************************************************************** + * Rasterization to Pixels Forward Pass + ****************************************************************************/ + +template +__device__ void +volume_render_tile(const uint32_t tile_start, const uint32_t tile_end, const uint32_t block_size, + const uint32_t tile_size, const bool write_pixel, const 
uint32_t i, + const uint32_t j, const typename Vec2Type::type *__restrict__ means2d, + const typename Vec3Type::type *__restrict__ conics, + const S *__restrict__ colors, const S *__restrict__ opacities, + const S *__restrict__ background, const int32_t *__restrict__ tile_gaussian_ids, + S *__restrict__ out_tile_colors, S *__restrict__ out_tile_alphas, + int32_t *__restrict__ out_tile_last_ids) { + using coord2t = Vec2Type::type; + using vec2t = Vec2Type::type; + using vec3t = Vec3Type::type; + + const uint32_t num_batches = (tile_end - tile_start + block_size - 1) / block_size; + + // Ordinal of this thread in the block + const uint32_t tidx = threadIdx.x * blockDim.y + threadIdx.y; + + // We don't return right away if the pixel is not in the image since we want to use + // this thread to load gaussians into shared memory + bool done = !write_pixel; + + extern __shared__ int s[]; + struct gaussian_t { + int32_t id; // 4 bytes + vec2t xy; // 8 bytes + S opacity; // 4 bytes + vec3t conic; // 12 bytes + }; + gaussian_t *batch = reinterpret_cast(s); // [block_size] + + const S px = (S)(j) + 0.5f; + const S py = (S)(i) + 0.5f; + + // NOTE: The accumulated transmittance is used in the backward pass, and since it's a + // sum of many small numbers, we should really use double precision. However, + // this makes the backward pass 1.5x slower, so we stick with float for now and sort of + // just ignore small impact gaussians ¯\_(ツ)_/¯. + S accum_transmittance = 1.0f; + // index of most recent gaussian to write to this thread's pixel + uint32_t cur_idx = 0; + + // collect and process batches of gaussians + // each thread loads one gaussian at a time before rasterizing its + // designated pixel + + S pix_out[COLOR_DIM] = { 0.f }; + for (uint32_t b = 0; b < num_batches; ++b) { + // Sync threads before we start integrating the next batch + // If all threads are done, we can break early + if (__syncthreads_count(done) == block_size) { + break; + } + + // Each thread fetches one gaussian from front to back (tile_gaussian_ids is depth sorted) + const uint32_t batch_start = tile_start + block_size * b; + const uint32_t idx = batch_start + tidx; + if (idx < tile_end) { + const int32_t g = tile_gaussian_ids[idx]; // which gaussian we're rendering + const vec2t xy = means2d[g]; + const S opac = opacities[g]; + const vec3t conic = conics[g]; + batch[tidx] = { g, xy, opac, conic }; + } + + // Sync threads so all gaussians for this batch are loaded in shared memory + __syncthreads(); + + // Volume render Gaussians in this batch + const uint32_t batch_size = min(block_size, tile_end - batch_start); + for (uint32_t t = 0; (t < batch_size) && !done; ++t) { + const gaussian_t gaussian = batch[t]; + + const vec3t conic = gaussian.conic; + const vec2t delta = { gaussian.xy.x - px, gaussian.xy.y - py }; + const S sigma = 0.5f * (conic.x * delta.x * delta.x + conic.z * delta.y * delta.y) + + conic.y * delta.x * delta.y; + const S alpha = min(0.999f, gaussian.opacity * __expf(-sigma)); + + if (sigma < 0.f || alpha < 1.f / 255.f) { + continue; + } + + const S next_transmittance = accum_transmittance * (1.0f - alpha); + if (next_transmittance <= 1e-4) { // this pixel is done: exclusive + done = true; + break; + } + + const S vis = alpha * accum_transmittance; + const S *c_ptr = colors + gaussian.id * COLOR_DIM; + PRAGMA_UNROLL + for (uint32_t k = 0; k < COLOR_DIM; ++k) { + pix_out[k] += c_ptr[k] * vis; + } + + cur_idx = batch_start + t; + accum_transmittance = next_transmittance; + } + } + + if (write_pixel) { + // Here T 
is the transmittance AFTER the last gaussian in this pixel. + // We (should) store double precision as T would be used in the backward + // pass and it can be very small, causing large differences in gradients + // with float32. However, double precision makes the backward pass 1.5x + // slower, so we stick with float for now. + *out_tile_alphas = 1.0f - accum_transmittance; + PRAGMA_UNROLL + for (uint32_t k = 0; k < COLOR_DIM; ++k) { + out_tile_colors[k] = background == nullptr + ? pix_out[k] + : (pix_out[k] + accum_transmittance * background[k]); + } + // index in bin of last gaussian in this pixel + *out_tile_last_ids = static_cast(cur_idx); + } +} + +template +__global__ void +rasterize_forward(const uint32_t C, const uint32_t N, const uint32_t n_isects, const bool packed, + const typename Vec2Type::type *__restrict__ means2d, // [C, N, 2] or [nnz, 2] + const typename Vec3Type::type *__restrict__ conics, // [C, N, 3] or [nnz, 3] + const S *__restrict__ colors, // [C, N, COLOR_DIM] or [nnz, COLOR_DIM] + const S *__restrict__ opacities, // [C, N] or [nnz] + const S *__restrict__ backgrounds, // [C, COLOR_DIM] + const bool *__restrict__ masks, // [C, tile_height, tile_width] + const uint32_t image_width, const uint32_t image_height, + const uint32_t image_origin_w, const uint32_t image_origin_h, + const uint32_t tile_origin_w, const uint32_t tile_origin_h, + const uint32_t tile_size, const uint32_t tile_width, const uint32_t tile_height, + const int32_t *__restrict__ tile_offsets, // [C, tile_height, tile_width] + const int32_t *__restrict__ tile_gaussian_ids, // [n_isects] + S *__restrict__ out_render_colors, // [C, image_height, image_width, COLOR_DIM] + S *__restrict__ out_render_alphas, // [C, image_height, image_width, 1] + int32_t *__restrict__ out_last_ids // [C, image_height, image_width] +) { + // each thread draws one pixel, but also timeshares caching gaussians in a + // shared tile + + const int32_t camera_id = blockIdx.x; + + // blockIdx runs from [0, num_tiles_h] x [0, num_tiles_w] + const int32_t tile_id = (blockIdx.y + tile_origin_h) * tile_width + blockIdx.z + tile_origin_w; + + // Pixel coordinates run from [0, height] x [0, width] + const uint32_t i = blockIdx.y * tile_size + threadIdx.y; + const uint32_t j = blockIdx.z * tile_size + threadIdx.x; + const int32_t pix_id = i * image_width + j; + + tile_offsets += camera_id * tile_height * tile_width; + auto const camera_pix_offset = camera_id * image_height * image_width + pix_id; + out_render_colors += camera_pix_offset * COLOR_DIM; + out_render_alphas += camera_pix_offset; + out_last_ids += camera_pix_offset; + if (backgrounds != nullptr) { + backgrounds += camera_id * COLOR_DIM; + } + if (masks != nullptr) { + masks += camera_id * tile_height * tile_width; + } + + // return if out of bounds + // keep not rasterizing threads around for reading data + const bool pixel_in_image = (i < image_height && j < image_width); + + // when the mask is provided, render the background color and return + // if this tile is labeled as False + if (masks != nullptr && pixel_in_image && !masks[tile_id]) { + PRAGMA_UNROLL + for (uint32_t k = 0; k < COLOR_DIM; ++k) { + out_render_colors[k] = backgrounds == nullptr ?
0.0f : backgrounds[k]; + } + return; + } + + // have all threads in tile process the same gaussians in batches + // first collect gaussians between range.x and range.y in batches + // which gaussians to look through in this tile + const int32_t range_start = tile_offsets[tile_id]; + const int32_t range_end = (camera_id == C - 1) && (tile_id == tile_width * tile_height - 1) + ? n_isects + : tile_offsets[tile_id + 1]; + const uint32_t block_size = blockDim.x * blockDim.y; + + return volume_render_tile(range_start, range_end, block_size, tile_size, + pixel_in_image, i, j, means2d, conics, colors, + opacities, backgrounds, tile_gaussian_ids, + out_render_colors, out_render_alphas, out_last_ids); +} + +template +std::tuple +call_fwd_kernel_with_dim( + // Gaussian parameters + const torch::Tensor &means2d, // [C, N, 2] or [nnz, 2] + const torch::Tensor &conics, // [C, N, 3] or [nnz, 3] + const torch::Tensor &colors, // [C, N, channels] or [nnz, channels] + const torch::Tensor &opacities, // [C, N] or [nnz] + const at::optional &backgrounds, // [C, channels] + const at::optional &masks, // [C, tile_height, tile_width] + // image size + const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, + const uint32_t image_origin_h, const uint32_t tile_size, + // intersections + const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] + const torch::Tensor &tile_gaussian_ids // [n_isects] +) { + using vec3t = typename Vec3Type::type; + using vec2t = typename Vec2Type::type; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(means2d)); + + TORCH_CHECK_VALUE(means2d.dim() == 3 || means2d.dim() == 2, + "means2d must have 3 dimensions (C, N, 2) or 2 dimensions (nnz, 2)"); + TORCH_CHECK_VALUE(conics.dim() == 3 || conics.dim() == 2, + "conics must have 3 dimensions (C, N, 3) or 2 dimensions (nnz, 3)"); + TORCH_CHECK_VALUE( + colors.dim() == 3 || colors.dim() == 2, + "colors must have 3 dimensions (C, N, channels) or 2 dimensions (nnz, channels)"); + TORCH_CHECK_VALUE(opacities.dim() == 2 || opacities.dim() == 1, + "opacities must have 2 dimensions (C, N) or 1 dimension (nnz)"); + if (backgrounds.has_value()) { + TORCH_CHECK_VALUE(backgrounds.value().dim() == 2, + "backgrounds must have 2 dimensions (C, channels)"); + } + if (masks.has_value()) { + TORCH_CHECK_VALUE(masks.value().dim() == 3, + "masks must have 3 dimensions (C, tile_height, tile_width)"); + } + TORCH_CHECK_VALUE(tile_offsets.dim() == 3, + "tile_offsets must have 3 dimensions (C, tile_height, tile_width)"); + TORCH_CHECK_VALUE(tile_gaussian_ids.dim() == 1, + "tile_gaussian_ids must have 1 dimension (n_isects)"); + + CHECK_INPUT(means2d); + CHECK_INPUT(conics); + CHECK_INPUT(colors); + CHECK_INPUT(opacities); + CHECK_INPUT(tile_offsets); + CHECK_INPUT(tile_gaussian_ids); + if (backgrounds.has_value()) { + CHECK_INPUT(backgrounds.value()); + } + if (masks.has_value()) { + CHECK_INPUT(masks.value()); + } + + const bool packed = means2d.dim() == 2; + + const uint32_t C = tile_offsets.size(0); // number of cameras + const uint32_t N = packed ? 
0 : means2d.size(1); // number of gaussians + const uint32_t channels = colors.size(-1); + const uint32_t tile_height = tile_offsets.size(1); + const uint32_t tile_width = tile_offsets.size(2); + const uint32_t n_isects = tile_gaussian_ids.size(0); + + const uint32_t tile_origin_w = image_origin_w / tile_size; + const uint32_t tile_origin_h = image_origin_h / tile_size; + const uint32_t tile_extent_w = (image_width + tile_size - 1) / tile_size; + const uint32_t tile_extent_h = (image_height + tile_size - 1) / tile_size; + + // The rendered images to return (one per camera) + torch::Tensor out_images = torch::empty({ C, image_height, image_width, channels }, + means2d.options().dtype(torch::kFloat32)); + torch::Tensor alphas = + torch::empty({ C, image_height, image_width, 1 }, means2d.options().dtype(torch::kFloat32)); + torch::Tensor last_ids = + torch::empty({ C, image_height, image_width }, means2d.options().dtype(torch::kInt32)); + + const at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); + + // Each pixel in each tile will cache a gaussian consisting of: + // - int32_t gaussian_id; -- 4 bytes + // - vec2t xy; -- 8 bytes for float32 + // - scalar_t opacity; -- 4 bytes for float32 + // - vec3t conic; -- 12 bytes for float32 + const uint32_t shared_mem = + tile_size * tile_size * (sizeof(int32_t) + sizeof(vec2t) + sizeof(float) + sizeof(vec3t)); + + // TODO: an optimization can be done by passing the actual number of + // channels into the kernel functions, avoiding unnecessary global memory + // writes. This requires moving the channel padding from Python to the C++ side. + if (cudaFuncSetAttribute(rasterize_forward, + cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_mem) != cudaSuccess) { + AT_ERROR("Failed to set maximum shared memory size (requested ", shared_mem, + " bytes), try lowering tile_size."); + } + + const dim3 threads = { tile_size, tile_size, 1 }; + const dim3 blocks = { C, tile_extent_h, tile_extent_w }; + rasterize_forward<<>>( + C, N, n_isects, packed, reinterpret_cast(means2d.data_ptr()), + reinterpret_cast(conics.data_ptr()), colors.data_ptr(), + opacities.data_ptr(), + backgrounds.has_value() ? backgrounds.value().data_ptr() : nullptr, + masks.has_value() ?
masks.value().data_ptr() : nullptr, image_width, image_height, + image_origin_w, image_origin_h, tile_origin_w, tile_origin_h, tile_size, tile_width, + tile_height, tile_offsets.data_ptr(), tile_gaussian_ids.data_ptr(), + out_images.data_ptr(), alphas.data_ptr(), last_ids.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return std::make_tuple(out_images, alphas, last_ids); +} + +template <> +std::tuple +dispatchGaussianRasterizeForward( + // Gaussian parameters + const torch::Tensor &means2d, // [C, N, 2] + const torch::Tensor &conics, // [C, N, 3] + const torch::Tensor &colors, // [C, N, D] + const torch::Tensor &opacities, // [N] + const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, + const uint32_t image_origin_h, const uint32_t tile_size, + const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] + const torch::Tensor &tile_gaussian_ids // [n_isects] +) { + CHECK_INPUT(colors); + const uint32_t channels = colors.size(-1); + + const torch::optional backgrounds = torch::nullopt; + const torch::optional masks = torch::nullopt; + +#define __CALL_FWD_(N) \ + case N: \ + return call_fwd_kernel_with_dim( \ + means2d, conics, colors, opacities, backgrounds, masks, image_width, image_height, \ + image_origin_w, image_origin_h, tile_size, tile_offsets, tile_gaussian_ids); + // Make channels a compile time constant and do everything in register space but at the expense + // of making this code ugly. + // NOTE: We do powers of two and powers of two plus one to handle rendering common feature + // channel dimensions with an optional additional depth channel + switch (channels) { + __CALL_FWD_(1) + __CALL_FWD_(2) + __CALL_FWD_(3) + __CALL_FWD_(4) + __CALL_FWD_(5) + __CALL_FWD_(8) + __CALL_FWD_(9) + __CALL_FWD_(16) + __CALL_FWD_(17) + __CALL_FWD_(32) + __CALL_FWD_(33) + __CALL_FWD_(64) + __CALL_FWD_(65) + __CALL_FWD_(128) + __CALL_FWD_(129) + __CALL_FWD_(256) + __CALL_FWD_(257) + __CALL_FWD_(512) + __CALL_FWD_(513) + default: + AT_ERROR("Unsupported number of channels: ", channels); + } +} + +template <> +std::tuple +dispatchGaussianRasterizeForward( + // Gaussian parameters + const torch::Tensor &means2d, // [C, N, 2] + const torch::Tensor &conics, // [C, N, 3] + const torch::Tensor &colors, // [C, N, D] + const torch::Tensor &opacities, // [N] + // image size + const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, + const uint32_t image_origin_h, const uint32_t tile_size, + // intersections + const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] + const torch::Tensor &tile_gaussian_ids // [n_isects] +) { + TORCH_CHECK_NOT_IMPLEMENTED(false, "CPU implementation not available"); +} + +} // namespace ops +} // namespace detail +} // namespace fvdb diff --git a/fvdb/src/detail/ops/gsplat/GaussianTileIntersection.cu b/fvdb/src/detail/ops/gsplat/GaussianTileIntersection.cu index 5b44b75fcb..a012e3d5f8 100644 --- a/fvdb/src/detail/ops/gsplat/GaussianTileIntersection.cu +++ b/fvdb/src/detail/ops/gsplat/GaussianTileIntersection.cu @@ -1,14 +1,13 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: Apache-2.0 // +#include "VectorTypes.cuh" #include #include #include -#include "VectorTypes.cuh" - #define NUM_THREADS 1024 #define CUB_WRAPPER(func, ...) \ @@ -41,7 +40,7 @@ count_tiles_per_gaussian(const uint32_t total_gaussians, const uint32_t tile_siz using OpT = typename OpType::type; // parallelize over num_cameras * num_gaussians. 
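+    // A minimal launch sketch for this kernel, assuming a 1-D grid over all
+    // camera/gaussian pairs; the cid/gid decomposition shown here is
+    // illustrative and not taken from this hunk. With NUM_THREADS = 1024 as
+    // defined near the top of this file:
+    //   const uint32_t num_blocks = (total_gaussians + NUM_THREADS - 1) / NUM_THREADS;
+    //   count_tiles_per_gaussian<<<num_blocks, NUM_THREADS, 0, stream>>>(total_gaussians, ...);
+    //   const uint32_t cid = idx / num_gaussians; // camera of this pair (assumed layout)
+    //   const uint32_t gid = idx % num_gaussians; // gaussian of this pair (assumed layout)
+    // The idx >= total_gaussians guard below keeps the last, partially filled
+    // block from reading out of bounds.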
- const int32_t idx = blockIdx.x * blockDim.x + threadIdx.x; // cg::this_grid().thread_rank(); + const int32_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= total_gaussians) { return; } diff --git a/fvdb/src/detail/ops/gsplat/GsplatTypes.cuh b/fvdb/src/detail/ops/gsplat/GsplatTypes.cuh index bccfb6d857..47f0fef48c 100644 --- a/fvdb/src/detail/ops/gsplat/GsplatTypes.cuh +++ b/fvdb/src/detail/ops/gsplat/GsplatTypes.cuh @@ -6,12 +6,13 @@ #include "GsplatMacros.cuh" -#include -#include - #include + #include +#include +#include + namespace fvdb { namespace detail { namespace ops { diff --git a/fvdb/src/detail/ops/gsplat/SphericalHarmonics.cu b/fvdb/src/detail/ops/gsplat/SphericalHarmonics.cu index 9c06461cc1..58344f213e 100644 --- a/fvdb/src/detail/ops/gsplat/SphericalHarmonics.cu +++ b/fvdb/src/detail/ops/gsplat/SphericalHarmonics.cu @@ -1,12 +1,11 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: Apache-2.0 // +#include "VectorTypes.cuh" #include #include -#include "VectorTypes.cuh" - constexpr int NUM_THREADS = 1024; namespace fvdb { @@ -14,95 +13,149 @@ namespace detail { namespace ops { namespace { + +template +inline __device__ T +getval(const torch::PackedTensorAccessor32 coeffs, uint32_t k, + uint32_t ci, uint32_t gi, uint32_t c) { + return coeffs[k][ci][gi][c]; +} + +template +inline __device__ T +getval(const torch::PackedTensorAccessor32 coeffs, uint32_t k, + uint32_t ci, uint32_t gi, uint32_t c) { + return coeffs[k][gi][c]; +} + +template +inline __device__ void +setval(torch::PackedTensorAccessor32 coeffs, uint32_t k, + uint32_t ci, uint32_t gi, uint32_t c, T val) { + coeffs[k][ci][gi][c] = val; +} + +template +inline __device__ void +setval(torch::PackedTensorAccessor32 coeffs, uint32_t k, + uint32_t ci, uint32_t gi, uint32_t c, T val) { + coeffs[k][gi][c] = val; +} + // Evaluate spherical harmonics bases at unit direction for high orders using // approach described by Efficient Spherical Harmonic Evaluation, Peter-Pike // Sloan, JCGT 2013 See https://jcgt.org/published/0002/02/06/ for reference // implementation -template +template inline __device__ void -sh_coeffs_to_color(const uint32_t degree, // degree of SH to be evaluated - const uint32_t c, // color channel - const typename Vec3Type::type &dir, // [3] - const T *coeffs, // [K, 3] - // output - T *colors // [3] +eval_sh_function(const uint32_t degree, // degree of SH to be evaluated + const uint32_t ci, // camera index + const uint32_t gi, // gaussian index + const uint32_t c, // color channel + const typename Vec3Type::type &dir, // [D] + const torch::PackedTensorAccessor32 coeffs, + T *colors // [D] ) { - // FIXME (Francis): This is a terrible way to read from coeffs, since we're not going to do any - // memory coalescing. We should instead read from coeffs in a coalesced manner - T result = 0.2820947917738781f * coeffs[c]; + const T cSH0 = getval(coeffs, 0, ci, gi, c); + + T result = 0.2820947917738781f * cSH0; + if (degree >= 1) { // Normally rsqrt is faster than sqrt, but --use_fast_math will optimize // sqrt on single precision, so we use sqrt here. 
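+        // Each degree-d evaluation below reads (d + 1)^2 coefficients per
+        // channel (1, 4, 9, 16, or 25 for d = 0..4), so the K dimension of the
+        // coeffs accessor must be at least (degree + 1)^2. A minimal host-side
+        // guard, sketched against the [K, N, 3] sh_coeffs layout noted in
+        // GaussianRender.h (where such a check lives is an assumption):
+        //   TORCH_CHECK(sh_coeffs.size(0) >= (sh_degree_to_use + 1) * (sh_degree_to_use + 1),
+        //               "sh_coeffs has too few SH bases for the requested degree");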
- T inorm = rsqrtf(dir.x * dir.x + dir.y * dir.y + dir.z * dir.z); - T x = dir.x * inorm; - T y = dir.y * inorm; - T z = dir.z * inorm; + const T inorm = rsqrtf(dir.x * dir.x + dir.y * dir.y + dir.z * dir.z); + const T x = dir.x * inorm; + const T y = dir.y * inorm; + const T z = dir.z * inorm; + + const T cSH1 = getval(coeffs, 1, ci, gi, c); + const T cSH2 = getval(coeffs, 2, ci, gi, c); + const T cSH3 = getval(coeffs, 3, ci, gi, c); + + result += 0.48860251190292f * (-y * cSH1 + z * cSH2 - x * cSH3); - result += 0.48860251190292f * - (-y * coeffs[1 * 3 + c] + z * coeffs[2 * 3 + c] - x * coeffs[3 * 3 + c]); if (degree >= 2) { - T z2 = z * z; - - T fTmp0B = -1.092548430592079f * z; - T fC1 = x * x - y * y; - T fS1 = 2.f * x * y; - T pSH6 = (0.9461746957575601f * z2 - 0.3153915652525201f); - T pSH7 = fTmp0B * x; - T pSH5 = fTmp0B * y; - T pSH8 = 0.5462742152960395f * fC1; - T pSH4 = 0.5462742152960395f * fS1; - - result += pSH4 * coeffs[4 * 3 + c] + pSH5 * coeffs[5 * 3 + c] + - pSH6 * coeffs[6 * 3 + c] + pSH7 * coeffs[7 * 3 + c] + - pSH8 * coeffs[8 * 3 + c]; + const T z2 = z * z; + + const T fTmp0B = -1.092548430592079f * z; + const T fC1 = x * x - y * y; + const T fS1 = 2.f * x * y; + const T pSH6 = (0.9461746957575601f * z2 - 0.3153915652525201f); + const T pSH7 = fTmp0B * x; + const T pSH5 = fTmp0B * y; + const T pSH8 = 0.5462742152960395f * fC1; + const T pSH4 = 0.5462742152960395f * fS1; + + const T cSH4 = getval(coeffs, 4, ci, gi, c); + const T cSH5 = getval(coeffs, 5, ci, gi, c); + const T cSH6 = getval(coeffs, 6, ci, gi, c); + const T cSH7 = getval(coeffs, 7, ci, gi, c); + const T cSH8 = getval(coeffs, 8, ci, gi, c); + + result += (pSH4 * cSH4) + (pSH5 * cSH5) + (pSH6 * cSH6) + (pSH7 * cSH7) + (pSH8 * cSH8); + if (degree >= 3) { - T fTmp0C = -2.285228997322329f * z2 + 0.4570457994644658f; - T fTmp1B = 1.445305721320277f * z; - T fC2 = x * fC1 - y * fS1; - T fS2 = x * fS1 + y * fC1; - T pSH12 = z * (1.865881662950577f * z2 - 1.119528997770346f); - T pSH13 = fTmp0C * x; - T pSH11 = fTmp0C * y; - T pSH14 = fTmp1B * fC1; - T pSH10 = fTmp1B * fS1; - T pSH15 = -0.5900435899266435f * fC2; - T pSH9 = -0.5900435899266435f * fS2; - - result += pSH9 * coeffs[9 * 3 + c] + pSH10 * coeffs[10 * 3 + c] + - pSH11 * coeffs[11 * 3 + c] + pSH12 * coeffs[12 * 3 + c] + - pSH13 * coeffs[13 * 3 + c] + pSH14 * coeffs[14 * 3 + c] + - pSH15 * coeffs[15 * 3 + c]; + const T fTmp0C = -2.285228997322329f * z2 + 0.4570457994644658f; + const T fTmp1B = 1.445305721320277f * z; + const T fC2 = x * fC1 - y * fS1; + const T fS2 = x * fS1 + y * fC1; + const T pSH12 = z * (1.865881662950577f * z2 - 1.119528997770346f); + const T pSH13 = fTmp0C * x; + const T pSH11 = fTmp0C * y; + const T pSH14 = fTmp1B * fC1; + const T pSH10 = fTmp1B * fS1; + const T pSH15 = -0.5900435899266435f * fC2; + const T pSH9 = -0.5900435899266435f * fS2; + + const T cSH9 = getval(coeffs, 9, ci, gi, c); + const T cSH10 = getval(coeffs, 10, ci, gi, c); + const T cSH11 = getval(coeffs, 11, ci, gi, c); + const T cSH12 = getval(coeffs, 12, ci, gi, c); + const T cSH13 = getval(coeffs, 13, ci, gi, c); + const T cSH14 = getval(coeffs, 14, ci, gi, c); + const T cSH15 = getval(coeffs, 15, ci, gi, c); + + result += (pSH9 * cSH9) + (pSH10 * cSH10) + (pSH11 * cSH11) + (pSH12 * cSH12) + + (pSH13 * cSH13) + (pSH14 * cSH14) + (pSH15 * cSH15); if (degree >= 4) { - T fTmp0D = z * (-4.683325804901025f * z2 + 2.007139630671868f); - T fTmp1C = 3.31161143515146f * z2 - 0.47308734787878f; - T fTmp2B = -1.770130769779931f * z; - T fC3 = x * fC2 - y * fS2; - T fS3 = x * 
fS2 + y * fC2; - T pSH20 = (1.984313483298443f * z * pSH12 - 1.006230589874905f * pSH6); - T pSH21 = fTmp0D * x; - T pSH19 = fTmp0D * y; - T pSH22 = fTmp1C * fC1; - T pSH18 = fTmp1C * fS1; - T pSH23 = fTmp2B * fC2; - T pSH17 = fTmp2B * fS2; - T pSH24 = 0.6258357354491763f * fC3; - T pSH16 = 0.6258357354491763f * fS3; - - result += pSH16 * coeffs[16 * 3 + c] + pSH17 * coeffs[17 * 3 + c] + - pSH18 * coeffs[18 * 3 + c] + pSH19 * coeffs[19 * 3 + c] + - pSH20 * coeffs[20 * 3 + c] + pSH21 * coeffs[21 * 3 + c] + - pSH22 * coeffs[22 * 3 + c] + pSH23 * coeffs[23 * 3 + c] + - pSH24 * coeffs[24 * 3 + c]; + const T fTmp0D = z * (-4.683325804901025f * z2 + 2.007139630671868f); + const T fTmp1C = 3.31161143515146f * z2 - 0.47308734787878f; + const T fTmp2B = -1.770130769779931f * z; + const T fC3 = x * fC2 - y * fS2; + const T fS3 = x * fS2 + y * fC2; + const T pSH20 = (1.984313483298443f * z * pSH12 - 1.006230589874905f * pSH6); + const T pSH21 = fTmp0D * x; + const T pSH19 = fTmp0D * y; + const T pSH22 = fTmp1C * fC1; + const T pSH18 = fTmp1C * fS1; + const T pSH23 = fTmp2B * fC2; + const T pSH17 = fTmp2B * fS2; + const T pSH24 = 0.6258357354491763f * fC3; + const T pSH16 = 0.6258357354491763f * fS3; + + const T cSH16 = getval(coeffs, 16, ci, gi, c); + const T cSH17 = getval(coeffs, 17, ci, gi, c); + const T cSH18 = getval(coeffs, 18, ci, gi, c); + const T cSH19 = getval(coeffs, 19, ci, gi, c); + const T cSH20 = getval(coeffs, 20, ci, gi, c); + const T cSH21 = getval(coeffs, 21, ci, gi, c); + const T cSH22 = getval(coeffs, 22, ci, gi, c); + const T cSH23 = getval(coeffs, 23, ci, gi, c); + const T cSH24 = getval(coeffs, 24, ci, gi, c); + + result += (pSH16 * cSH16) + (pSH17 * cSH17) + (pSH18 * cSH18) + + (pSH19 * cSH19) + (pSH20 * cSH20) + (pSH21 * cSH21) + + (pSH22 * cSH22) + (pSH23 * cSH23) + (pSH24 * cSH24); } } } } - colors[c] = result; + colors[c] = result + 0.5f; } -// We repeat this code everywhere in sh_coeffs_to_color_vjp to compute the gradient of the +// We repeat this code everywhere in eval_sh_function_vjp to compute the gradient of the // direction and write it out, so pull this into a function. 
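// Background on the math here (a sketch consistent with the body of write_v_dir
// below): the normalized direction is n = dir * inorm with inorm = 1/||dir||, and
// the Jacobian of the normalization is dn/d(dir) = (I - n n^T) * inorm. Given the
// gradient (v_x, v_y, v_z) with respect to the *normalized* direction, write_v_dir
// therefore projects out the component parallel to n and rescales:
//
//     v_dir = (v - dot(v, n) * n) * inorm
//
// which is exactly the (v_* - v_dir_n_dot_dir_n * {x, y, z}) * inorm pattern in
// the function body.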
template __device__ inline void @@ -115,37 +168,39 @@ write_v_dir(T x, T y, T z, T v_x, T v_y, T v_z, T inorm, typename Vec3Type::t v_dir->z = (v_z - v_dir_n_dot_dir_n * z) * inorm; } -template +template inline __device__ void -sh_coeffs_to_color_vjp(const uint32_t degree, // degree of SH to be evaluated - const uint32_t c, // color channel - const typename Vec3Type::type &dir, // [3] - const T *coeffs, // [K, 3] - const T *v_colors, // [3] - // output - T *v_coeffs, // [K, 3] - typename Vec3Type::type *v_dir // [3] optional +eval_sh_function_vjp(const uint32_t degree, // degree of SH to be evaluated + const uint32_t ci, // camera index + const uint32_t gi, // gaussian index + const uint32_t c, // color channel + const typename Vec3Type::type &dir, // [3] + const torch::PackedTensorAccessor32 coeffs, + const T *v_colors, // [D] + torch::PackedTensorAccessor32 v_coeffs, + typename Vec3Type::type *v_dir // [3] optional ) { T v_colors_local = v_colors[c]; - v_coeffs[c] = 0.2820947917738781f * v_colors_local; + setval(v_coeffs, 0, ci, gi, c, 0.2820947917738781f * v_colors_local); + if (degree < 1) { return; } - T inorm = rsqrtf(dir.x * dir.x + dir.y * dir.y + dir.z * dir.z); - T x = dir.x * inorm; - T y = dir.y * inorm; - T z = dir.z * inorm; - T v_x = 0.f, v_y = 0.f, v_z = 0.f; + const T inorm = rsqrtf(dir.x * dir.x + dir.y * dir.y + dir.z * dir.z); + const T x = dir.x * inorm; + const T y = dir.y * inorm; + const T z = dir.z * inorm; + T v_x = 0.f, v_y = 0.f, v_z = 0.f; - v_coeffs[1 * 3 + c] = -0.48860251190292f * y * v_colors_local; - v_coeffs[2 * 3 + c] = 0.48860251190292f * z * v_colors_local; - v_coeffs[3 * 3 + c] = -0.48860251190292f * x * v_colors_local; + setval(v_coeffs, 1, ci, gi, c, -0.48860251190292f * y * v_colors_local); + setval(v_coeffs, 2, ci, gi, c, 0.48860251190292f * z * v_colors_local); + setval(v_coeffs, 3, ci, gi, c, -0.48860251190292f * x * v_colors_local); if (v_dir != nullptr) { - v_x += -0.48860251190292f * coeffs[3 * 3 + c] * v_colors_local; - v_y += -0.48860251190292f * coeffs[1 * 3 + c] * v_colors_local; - v_z += 0.48860251190292f * coeffs[2 * 3 + c] * v_colors_local; + v_x += -0.48860251190292f * getval(coeffs, 3, ci, gi, c) * v_colors_local; + v_y += -0.48860251190292f * getval(coeffs, 1, ci, gi, c) * v_colors_local; + v_z += 0.48860251190292f * getval(coeffs, 2, ci, gi, c) * v_colors_local; } if (degree < 2) { if (v_dir != nullptr) { @@ -154,20 +209,20 @@ sh_coeffs_to_color_vjp(const uint32_t degree, // degree of return; } - T z2 = z * z; - T fTmp0B = -1.092548430592079f * z; - T fC1 = x * x - y * y; - T fS1 = 2.f * x * y; - T pSH6 = (0.9461746957575601f * z2 - 0.3153915652525201f); - T pSH7 = fTmp0B * x; - T pSH5 = fTmp0B * y; - T pSH8 = 0.5462742152960395f * fC1; - T pSH4 = 0.5462742152960395f * fS1; - v_coeffs[4 * 3 + c] = pSH4 * v_colors_local; - v_coeffs[5 * 3 + c] = pSH5 * v_colors_local; - v_coeffs[6 * 3 + c] = pSH6 * v_colors_local; - v_coeffs[7 * 3 + c] = pSH7 * v_colors_local; - v_coeffs[8 * 3 + c] = pSH8 * v_colors_local; + const T z2 = z * z; + const T fTmp0B = -1.092548430592079f * z; + const T fC1 = x * x - y * y; + const T fS1 = 2.f * x * y; + const T pSH6 = (0.9461746957575601f * z2 - 0.3153915652525201f); + const T pSH7 = fTmp0B * x; + const T pSH5 = fTmp0B * y; + const T pSH8 = 0.5462742152960395f * fC1; + const T pSH4 = 0.5462742152960395f * fS1; + setval(v_coeffs, 4, ci, gi, c, pSH4 * v_colors_local); + setval(v_coeffs, 5, ci, gi, c, pSH5 * v_colors_local); + setval(v_coeffs, 6, ci, gi, c, pSH6 * v_colors_local); + setval(v_coeffs, 7, ci, 
gi, c, pSH7 * v_colors_local); + setval(v_coeffs, 8, ci, gi, c, pSH8 * v_colors_local); T fTmp0B_z, fC1_x, fC1_y, fS1_x, fS1_y, pSH6_z, pSH7_x, pSH7_z, pSH5_y, pSH5_z, pSH8_x, pSH8_y, pSH4_x, pSH4_y; @@ -187,12 +242,15 @@ sh_coeffs_to_color_vjp(const uint32_t degree, // degree of pSH4_x = 0.5462742152960395f * fS1_x; pSH4_y = 0.5462742152960395f * fS1_y; - v_x += v_colors_local * (pSH4_x * coeffs[4 * 3 + c] + pSH8_x * coeffs[8 * 3 + c] + - pSH7_x * coeffs[7 * 3 + c]); - v_y += v_colors_local * (pSH4_y * coeffs[4 * 3 + c] + pSH8_y * coeffs[8 * 3 + c] + - pSH5_y * coeffs[5 * 3 + c]); - v_z += v_colors_local * (pSH6_z * coeffs[6 * 3 + c] + pSH7_z * coeffs[7 * 3 + c] + - pSH5_z * coeffs[5 * 3 + c]); + v_x += v_colors_local * + (pSH4_x * getval(coeffs, 4, ci, gi, c) + pSH8_x * getval(coeffs, 8, ci, gi, c) + + pSH7_x * getval(coeffs, 7, ci, gi, c)); + v_y += v_colors_local * + (pSH4_y * getval(coeffs, 4, ci, gi, c) + pSH8_y * getval(coeffs, 8, ci, gi, c) + + pSH5_y * getval(coeffs, 5, ci, gi, c)); + v_z += v_colors_local * + (pSH6_z * getval(coeffs, 6, ci, gi, c) + pSH7_z * getval(coeffs, 7, ci, gi, c) + + pSH5_z * getval(coeffs, 5, ci, gi, c)); } if (degree < 3) { @@ -202,24 +260,25 @@ sh_coeffs_to_color_vjp(const uint32_t degree, // degree of return; } - T fTmp0C = -2.285228997322329f * z2 + 0.4570457994644658f; - T fTmp1B = 1.445305721320277f * z; - T fC2 = x * fC1 - y * fS1; - T fS2 = x * fS1 + y * fC1; - T pSH12 = z * (1.865881662950577f * z2 - 1.119528997770346f); - T pSH13 = fTmp0C * x; - T pSH11 = fTmp0C * y; - T pSH14 = fTmp1B * fC1; - T pSH10 = fTmp1B * fS1; - T pSH15 = -0.5900435899266435f * fC2; - T pSH9 = -0.5900435899266435f * fS2; - v_coeffs[9 * 3 + c] = pSH9 * v_colors_local; - v_coeffs[10 * 3 + c] = pSH10 * v_colors_local; - v_coeffs[11 * 3 + c] = pSH11 * v_colors_local; - v_coeffs[12 * 3 + c] = pSH12 * v_colors_local; - v_coeffs[13 * 3 + c] = pSH13 * v_colors_local; - v_coeffs[14 * 3 + c] = pSH14 * v_colors_local; - v_coeffs[15 * 3 + c] = pSH15 * v_colors_local; + const T fTmp0C = -2.285228997322329f * z2 + 0.4570457994644658f; + const T fTmp1B = 1.445305721320277f * z; + const T fC2 = x * fC1 - y * fS1; + const T fS2 = x * fS1 + y * fC1; + const T pSH12 = z * (1.865881662950577f * z2 - 1.119528997770346f); + const T pSH13 = fTmp0C * x; + const T pSH11 = fTmp0C * y; + const T pSH14 = fTmp1B * fC1; + const T pSH10 = fTmp1B * fS1; + const T pSH15 = -0.5900435899266435f * fC2; + const T pSH9 = -0.5900435899266435f * fS2; + + setval(v_coeffs, 9, ci, gi, c, pSH9 * v_colors_local); + setval(v_coeffs, 10, ci, gi, c, pSH10 * v_colors_local); + setval(v_coeffs, 11, ci, gi, c, pSH11 * v_colors_local); + setval(v_coeffs, 12, ci, gi, c, pSH12 * v_colors_local); + setval(v_coeffs, 13, ci, gi, c, pSH13 * v_colors_local); + setval(v_coeffs, 14, ci, gi, c, pSH14 * v_colors_local); + setval(v_coeffs, 15, ci, gi, c, pSH15 * v_colors_local); T fTmp0C_z, fTmp1B_z, fC2_x, fC2_y, fS2_x, fS2_y, pSH12_z, pSH13_x, pSH13_z, pSH11_y, pSH11_z, pSH14_x, pSH14_y, pSH14_z, pSH10_x, pSH10_y, pSH10_z, pSH15_x, pSH15_y, pSH9_x, pSH9_y; @@ -246,17 +305,22 @@ sh_coeffs_to_color_vjp(const uint32_t degree, // degree of pSH9_x = -0.5900435899266435f * fS2_x; pSH9_y = -0.5900435899266435f * fS2_y; - v_x += v_colors_local * (pSH9_x * coeffs[9 * 3 + c] + pSH15_x * coeffs[15 * 3 + c] + - pSH10_x * coeffs[10 * 3 + c] + pSH14_x * coeffs[14 * 3 + c] + - pSH13_x * coeffs[13 * 3 + c]); + const T cSH9 = getval(coeffs, 9, ci, gi, c); + const T cSH10 = getval(coeffs, 10, ci, gi, c); + const T cSH11 = getval(coeffs, 11, ci, gi, 
c); + const T cSH12 = getval(coeffs, 12, ci, gi, c); + const T cSH13 = getval(coeffs, 13, ci, gi, c); + const T cSH14 = getval(coeffs, 14, ci, gi, c); + const T cSH15 = getval(coeffs, 15, ci, gi, c); + + v_x += v_colors_local * (pSH9_x * cSH9 + pSH15_x * cSH15 + pSH10_x * cSH10 + + pSH14_x * cSH14 + pSH13_x * cSH13); - v_y += v_colors_local * (pSH9_y * coeffs[9 * 3 + c] + pSH15_y * coeffs[15 * 3 + c] + - pSH10_y * coeffs[10 * 3 + c] + pSH14_y * coeffs[14 * 3 + c] + - pSH11_y * coeffs[11 * 3 + c]); + v_y += v_colors_local * (pSH9_y * cSH9 + pSH15_y * cSH15 + pSH10_y * cSH10 + + pSH14_y * cSH14 + pSH11_y * cSH11); - v_z += v_colors_local * (pSH12_z * coeffs[12 * 3 + c] + pSH13_z * coeffs[13 * 3 + c] + - pSH11_z * coeffs[11 * 3 + c] + pSH14_z * coeffs[14 * 3 + c] + - pSH10_z * coeffs[10 * 3 + c]); + v_z += v_colors_local * (pSH12_z * cSH12 + pSH13_z * cSH13 + pSH11_z * cSH11 + + pSH14_z * cSH14 + pSH10_z * cSH10); } if (degree < 4) { @@ -266,29 +330,30 @@ sh_coeffs_to_color_vjp(const uint32_t degree, // degree of return; } - T fTmp0D = z * (-4.683325804901025f * z2 + 2.007139630671868f); - T fTmp1C = 3.31161143515146f * z2 - 0.47308734787878f; - T fTmp2B = -1.770130769779931f * z; - T fC3 = x * fC2 - y * fS2; - T fS3 = x * fS2 + y * fC2; - T pSH20 = (1.984313483298443f * z * pSH12 + -1.006230589874905f * pSH6); - T pSH21 = fTmp0D * x; - T pSH19 = fTmp0D * y; - T pSH22 = fTmp1C * fC1; - T pSH18 = fTmp1C * fS1; - T pSH23 = fTmp2B * fC2; - T pSH17 = fTmp2B * fS2; - T pSH24 = 0.6258357354491763f * fC3; - T pSH16 = 0.6258357354491763f * fS3; - v_coeffs[16 * 3 + c] = pSH16 * v_colors_local; - v_coeffs[17 * 3 + c] = pSH17 * v_colors_local; - v_coeffs[18 * 3 + c] = pSH18 * v_colors_local; - v_coeffs[19 * 3 + c] = pSH19 * v_colors_local; - v_coeffs[20 * 3 + c] = pSH20 * v_colors_local; - v_coeffs[21 * 3 + c] = pSH21 * v_colors_local; - v_coeffs[22 * 3 + c] = pSH22 * v_colors_local; - v_coeffs[23 * 3 + c] = pSH23 * v_colors_local; - v_coeffs[24 * 3 + c] = pSH24 * v_colors_local; + const T fTmp0D = z * (-4.683325804901025f * z2 + 2.007139630671868f); + const T fTmp1C = 3.31161143515146f * z2 - 0.47308734787878f; + const T fTmp2B = -1.770130769779931f * z; + const T fC3 = x * fC2 - y * fS2; + const T fS3 = x * fS2 + y * fC2; + const T pSH20 = (1.984313483298443f * z * pSH12 + -1.006230589874905f * pSH6); + const T pSH21 = fTmp0D * x; + const T pSH19 = fTmp0D * y; + const T pSH22 = fTmp1C * fC1; + const T pSH18 = fTmp1C * fS1; + const T pSH23 = fTmp2B * fC2; + const T pSH17 = fTmp2B * fS2; + const T pSH24 = 0.6258357354491763f * fC3; + const T pSH16 = 0.6258357354491763f * fS3; + + setval(v_coeffs, 16, ci, gi, c, pSH16 * v_colors_local); + setval(v_coeffs, 17, ci, gi, c, pSH17 * v_colors_local); + setval(v_coeffs, 18, ci, gi, c, pSH18 * v_colors_local); + setval(v_coeffs, 19, ci, gi, c, pSH19 * v_colors_local); + setval(v_coeffs, 20, ci, gi, c, pSH20 * v_colors_local); + setval(v_coeffs, 21, ci, gi, c, pSH21 * v_colors_local); + setval(v_coeffs, 22, ci, gi, c, pSH22 * v_colors_local); + setval(v_coeffs, 23, ci, gi, c, pSH23 * v_colors_local); + setval(v_coeffs, 24, ci, gi, c, pSH24 * v_colors_local); T fTmp0D_z, fTmp1C_z, fTmp2B_z, fC3_x, fC3_y, fS3_x, fS3_y, pSH20_z, pSH21_x, pSH21_z, pSH19_y, pSH19_z, pSH22_x, pSH22_y, pSH22_z, pSH18_x, pSH18_y, pSH18_z, pSH23_x, pSH23_y, pSH23_z, @@ -323,18 +388,25 @@ sh_coeffs_to_color_vjp(const uint32_t degree, // degree of pSH16_x = 0.6258357354491763f * fS3_x; pSH16_y = 0.6258357354491763f * fS3_y; - v_x += v_colors_local * (pSH16_x * coeffs[16 * 3 + c] + pSH24_x * 
coeffs[24 * 3 + c] + - pSH17_x * coeffs[17 * 3 + c] + pSH23_x * coeffs[23 * 3 + c] + - pSH18_x * coeffs[18 * 3 + c] + pSH22_x * coeffs[22 * 3 + c] + - pSH21_x * coeffs[21 * 3 + c]); - v_y += v_colors_local * (pSH16_y * coeffs[16 * 3 + c] + pSH24_y * coeffs[24 * 3 + c] + - pSH17_y * coeffs[17 * 3 + c] + pSH23_y * coeffs[23 * 3 + c] + - pSH18_y * coeffs[18 * 3 + c] + pSH22_y * coeffs[22 * 3 + c] + - pSH19_y * coeffs[19 * 3 + c]); - v_z += v_colors_local * (pSH20_z * coeffs[20 * 3 + c] + pSH21_z * coeffs[21 * 3 + c] + - pSH19_z * coeffs[19 * 3 + c] + pSH22_z * coeffs[22 * 3 + c] + - pSH18_z * coeffs[18 * 3 + c] + pSH23_z * coeffs[23 * 3 + c] + - pSH17_z * coeffs[17 * 3 + c]); + const T cSH16 = getval(coeffs, 16, ci, gi, c); + const T cSH17 = getval(coeffs, 17, ci, gi, c); + const T cSH18 = getval(coeffs, 18, ci, gi, c); + const T cSH19 = getval(coeffs, 19, ci, gi, c); + const T cSH20 = getval(coeffs, 20, ci, gi, c); + const T cSH21 = getval(coeffs, 21, ci, gi, c); + const T cSH22 = getval(coeffs, 22, ci, gi, c); + const T cSH23 = getval(coeffs, 23, ci, gi, c); + const T cSH24 = getval(coeffs, 24, ci, gi, c); + + v_x += v_colors_local * + (pSH16_x * cSH16 + pSH24_x * cSH24 + pSH17_x * cSH17 + pSH23_x * cSH23 + + pSH18_x * cSH18 + pSH22_x * cSH22 + pSH21_x * cSH21); + v_y += v_colors_local * + (pSH16_y * cSH16 + pSH24_y * cSH24 + pSH17_y * cSH17 + pSH23_y * cSH23 + + pSH18_y * cSH18 + pSH22_y * cSH22 + pSH19_y * cSH19); + v_z += v_colors_local * + (pSH20_z * cSH20 + pSH21_z * cSH21 + pSH19_z * cSH19 + pSH22_z * cSH22 + + pSH18_z * cSH18 + pSH23_z * cSH23 + pSH17_z * cSH17); write_v_dir(x, y, z, v_x, v_y, v_z, inorm, v_dir); } @@ -343,12 +415,13 @@ sh_coeffs_to_color_vjp(const uint32_t degree, // degree of // Evaluate Spherical Harmonic functions at the given directions, assuming a uniform minibatch // of C cameras, each with N gaussians, and K SH coefficients per gaussian. -template +template __global__ void compute_sh_fwd_kernel( - const uint32_t C, const uint32_t N, const uint32_t K, const uint32_t degree_to_use, + const uint32_t C, const uint32_t N, const uint32_t K, const uint32_t D, + const uint32_t degree_to_use, const torch::PackedTensorAccessor32 dirs, // [C, N, 3] - const torch::PackedTensorAccessor32 coeffs, // [C, N, K, D] + const torch::PackedTensorAccessor32 coeffs, // [K, C, N, D] const int *__restrict__ radii, // [C, N] T *__restrict__ out_colors // [C, N, D] ) { @@ -366,21 +439,22 @@ compute_sh_fwd_kernel( return; } - using vec3t = typename Vec3Type::type; - const T *coeffs_ptr = coeffs[cid][gid].data(); - const vec3t dir = *reinterpret_cast(dirs[cid][gid].data()); - T *out_color_ptr = out_colors + eid * D; - sh_coeffs_to_color(degree_to_use, c, dir, coeffs_ptr, out_color_ptr); + using vec3t = typename Vec3Type::type; + const bool has_dirs = dirs.size(0) > 0; + const vec3t dir = + has_dirs ? *reinterpret_cast(dirs[cid][gid].data()) : vec3t{ 0.f, 0.f, 0.f }; + T *out_color_ptr = out_colors + eid * D; + eval_sh_function(degree_to_use, cid, gid, c, dir, coeffs, out_color_ptr); } // Evaluate Spherical Harmonic functions at the given directions, assuming N gaussians with K SH // coefficients per gaussian. 
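// Coefficient layout note for the kernels above and below: the SH coefficients
// now arrive as a PackedTensorAccessor32 whose leading dimension is the SH band
// index ([K, C, N, D] in the batched kernel above, [K, N, D] in the packed kernel
// below, which fixes the camera index to 0). The getval/setval helpers are
// assumed to be defined earlier in this file and to simply index that accessor,
// roughly getval(coeffs, k, ci, gi, c) ~ coeffs[k][ci][gi][c], replacing the old
// flat pointer arithmetic coeffs[k * 3 + c] over a per-gaussian [K, 3] block.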
-template +template __global__ void compute_sh_fwd_kernel_packed( - const uint32_t N, const uint32_t K, const uint32_t degree_to_use, + const uint32_t N, const uint32_t K, const uint32_t D, const uint32_t degree_to_use, const torch::PackedTensorAccessor32 dirs, // [N, 3] - const torch::PackedTensorAccessor32 coeffs, // [N, K, D] + const torch::PackedTensorAccessor32 coeffs, // [K, N, D] const int *__restrict__ radii, // [N] T *__restrict__ out_colors // [N, D] ) { @@ -396,22 +470,24 @@ compute_sh_fwd_kernel_packed( return; } - using vec3t = typename Vec3Type::type; - const T *coeffs_ptr = coeffs[gid].data(); - const vec3t dir = *reinterpret_cast(dirs[gid].data()); - T *out_color_ptr = out_colors + gid * D; - sh_coeffs_to_color(degree_to_use, c, dir, coeffs_ptr, out_color_ptr); + using vec3t = typename Vec3Type::type; + const bool has_dirs = dirs.size(0) > 0; + const vec3t dir = + has_dirs ? *reinterpret_cast(dirs[gid].data()) : vec3t{ 0.f, 0.f, 0.f }; + T *out_color_ptr = out_colors + gid * D; + eval_sh_function(degree_to_use, 0, gid, c, dir, coeffs, out_color_ptr); } -template +template __global__ void compute_sh_bwd_kernel( - const uint32_t C, const uint32_t N, const uint32_t K, const uint32_t degree_to_use, + const uint32_t C, const uint32_t N, const uint32_t K, const uint32_t D, + const uint32_t degree_to_use, const torch::PackedTensorAccessor32 dirs, // [C, N, 3] - const torch::PackedTensorAccessor32 coeffs, // [C, N, K, 3] + const torch::PackedTensorAccessor32 coeffs, // [K, C, N, D] const int *__restrict__ radii, // [C, N] - const torch::PackedTensorAccessor32 v_colors, // [C, N, 3] - T *__restrict__ out_v_coeffs, // [C, N, K, 3] + const torch::PackedTensorAccessor32 v_colors, // [C, N, D] + torch::PackedTensorAccessor32 out_v_coeffs, // [K, C, N, D] T *__restrict__ out_v_dirs // [C, N, 3] optional ) { // parallelize over C * N * D @@ -428,18 +504,17 @@ compute_sh_bwd_kernel( return; } - using vec3t = typename Vec3Type::type; - - const T *coeffs_ptr = coeffs[cid][gid].data(); - const vec3t dir = *reinterpret_cast(dirs[cid][gid].data()); - const T *v_color_ptr = v_colors[cid][gid].data(); - T *out_v_coeffs_ptr = out_v_coeffs + eid * K * D; + using vec3t = typename Vec3Type::type; + const bool has_dirs = dirs.size(0) > 0; + const vec3t dir = + has_dirs ? *reinterpret_cast(dirs[cid][gid].data()) : vec3t{ 0.f, 0.f, 0.f }; + const T *v_color_ptr = v_colors[cid][gid].data(); vec3t v_dir{ 0.f, 0.f, 0.f }; vec3t *out_v_dir_ptr = out_v_dirs == nullptr ? 
nullptr : &v_dir; - sh_coeffs_to_color_vjp(degree_to_use, c, dir, coeffs_ptr, v_color_ptr, out_v_coeffs_ptr, - out_v_dir_ptr); + eval_sh_function_vjp(degree_to_use, cid, gid, c, dir, coeffs, v_color_ptr, out_v_coeffs, + out_v_dir_ptr); if (out_v_dirs != nullptr) { gpuAtomicAdd(out_v_dirs + eid * 3, v_dir.x); gpuAtomicAdd(out_v_dirs + eid * 3 + 1, v_dir.y); @@ -447,16 +522,16 @@ compute_sh_bwd_kernel( } } -template +template __global__ void compute_sh_bwd_kernel_packed( - const uint32_t N, const uint32_t K, const uint32_t degree_to_use, - const torch::PackedTensorAccessor32 dirs, // [N, 3] - const torch::PackedTensorAccessor32 coeffs, // [N, K, 3] - const int *__restrict__ radii, // [N] - const torch::PackedTensorAccessor32 v_colors, // [N, 3] - T *__restrict__ out_v_coeffs, // [N, K, 3] - T *__restrict__ out_v_dirs // [N, 3] optional + const uint32_t N, const uint32_t K, const uint32_t D, const uint32_t degree_to_use, + const torch::PackedTensorAccessor32 dirs, // [N, 3] + const torch::PackedTensorAccessor32 coeffs, // [K, N, D] + const int *__restrict__ radii, // [N] + const torch::PackedTensorAccessor32 v_colors, // [N, D] + torch::PackedTensorAccessor32 out_v_coeffs, // [K, N, D] + T *__restrict__ out_v_dirs // [N, 3] optional ) { // parallelize over N * D const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; // gidx * D + c @@ -470,18 +545,17 @@ compute_sh_bwd_kernel_packed( return; } - using vec3t = typename Vec3Type::type; - - const T *coeffs_ptr = coeffs[gid].data(); - const vec3t dir = *reinterpret_cast(dirs[gid].data()); - const T *v_color_ptr = v_colors[gid].data(); - T *out_v_coeffs_ptr = out_v_coeffs + gid * K * D; + using vec3t = typename Vec3Type::type; + const bool has_dirs = dirs.size(0) > 0; + const vec3t dir = + has_dirs ? *reinterpret_cast(dirs[gid].data()) : vec3t{ 0.f, 0.f, 0.f }; + const T *v_color_ptr = v_colors[gid].data(); vec3t v_dir{ 0.f, 0.f, 0.f }; vec3t *out_v_dir_ptr = out_v_dirs == nullptr ? 
nullptr : &v_dir; - sh_coeffs_to_color_vjp(degree_to_use, c, dir, coeffs_ptr, v_color_ptr, out_v_coeffs_ptr, - out_v_dir_ptr); + eval_sh_function_vjp(degree_to_use, static_cast(0), gid, c, dir, coeffs, v_color_ptr, + out_v_coeffs, out_v_dir_ptr); if (out_v_dirs != nullptr) { gpuAtomicAdd(out_v_dirs + gid * 3, v_dir.x); gpuAtomicAdd(out_v_dirs + gid * 3 + 1, v_dir.y); @@ -493,74 +567,96 @@ template <> torch::Tensor dispatchSphericalHarmonicsForward( const int sh_degree_to_use, - const torch::Tensor &dirs, // [N, 3] or [C, N, 3] - const torch::Tensor &sh_coeffs, // [N, K, 3] or [C, N, K, 3] - const torch::Tensor &radii // [N] + const torch::Tensor &dirs, // [N, 3] or [C, N, D] or empty for degree 0 + const torch::Tensor &sh_coeffs, // [N, K, D] or [C, N, K, D] + const torch::Tensor &radii // [N] or [C, N] ) { - const at::cuda::OptionalCUDAGuard device_guard(at::device_of(dirs)); + const at::cuda::OptionalCUDAGuard device_guard(at::device_of(sh_coeffs)); + + const bool is_packed = sh_coeffs.dim() == 3; - TORCH_CHECK_VALUE(dirs.is_cuda(), "dirs must be a CUDA tensor"); TORCH_CHECK_VALUE(sh_coeffs.is_cuda(), "sh_coeffs must be a CUDA tensor"); TORCH_CHECK_VALUE(radii.is_cuda(), "radii must be a CUDA tensor"); TORCH_CHECK_VALUE(radii.is_contiguous(), "radii must be contiguous"); - const bool is_packed = sh_coeffs.dim() == 3; if (is_packed) { - TORCH_CHECK_VALUE(dirs.dim() == 2, - "sh_coeffs must have shape [N, K, 3] and dirs must have shape [N, 3]"); - TORCH_CHECK_VALUE(sh_coeffs.size(0) == dirs.size(0), - "sh_coeffs must have shape [N, K, 3] and dirs must have shape [N, 3]"); + TORCH_CHECK_VALUE(sh_coeffs.dim() == 3, "sh_coeffs must have shape [K, N, D]"); + TORCH_CHECK_VALUE(radii.dim() == 1, "radii must have shape [N]"); + TORCH_CHECK_VALUE(sh_coeffs.size(1) == radii.size(0), + "sh_coeffs must have shape [K, N, D] and radii must have shape [N]"); } else { + TORCH_CHECK_VALUE(sh_coeffs.dim() == 4, "sh_coeffs must have shape [K, C, N, D]"); + TORCH_CHECK_VALUE(radii.dim() == 2, "radii must have shape [C, N]"); TORCH_CHECK_VALUE( - dirs.dim() == 3, - "sh_coeffs must have shape [C, N, K, 3] and dirs must have shape [C, N, 3]"); - TORCH_CHECK_VALUE( - sh_coeffs.dim() == 4, - "sh_coeffs must have shape [C, N, K, 3] and dirs must have shape [C, N, 3]"); + sh_coeffs.size(1) == radii.size(0), + "sh_coeffs must have shape [K, C, N, D] and radii must have shape [C, N]"); TORCH_CHECK_VALUE( - sh_coeffs.size(0) == dirs.size(0), - "sh_coeffs must have shape [C, N, K, 3] and dirs must have shape [C, N, 3]"); - TORCH_CHECK_VALUE( - sh_coeffs.size(1) == dirs.size(1), - "sh_coeffs must have shape [C, N, K, 3] and dirs must have shape [C, N, 3]"); + sh_coeffs.size(2) == radii.size(1), + "sh_coeffs must have shape [K, C, N, D] and radii must have shape [C, N]"); } - TORCH_CHECK(sh_coeffs.size(-1) == 3, "sh_coeffs must have last dimension 3"); - TORCH_CHECK(dirs.size(-1) == 3, "dirs must have last dimension 3"); - const uint32_t K = sh_coeffs.size(-2); - const uint32_t N = is_packed ? dirs.size(0) : dirs.size(1); - const uint32_t C = is_packed ? 1 : dirs.size(0); - - const uint32_t TOTAL_ELEMS = C * N * 3; + const uint32_t K = sh_coeffs.size(0); + const uint32_t N = is_packed ? sh_coeffs.size(1) : sh_coeffs.size(2); + const uint32_t C = is_packed ? 
1 : sh_coeffs.size(1); + const uint32_t D = sh_coeffs.size(-1); + const uint32_t TOTAL_ELEMS = C * N * D; const uint32_t NUM_BLOCKS = (TOTAL_ELEMS + NUM_THREADS - 1) / NUM_THREADS; - at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(dirs.device().index()); + // If you are using degree > 0, then we are going to use the directions tensor, which means + // we need to check that it has the right shape + if (K > 0 && sh_degree_to_use > 0) { + if (is_packed) { + TORCH_CHECK_VALUE( + dirs.dim() == 2, + "sh_coeffs must have shape [K, N, D] and dirs must have shape [N, D]"); + TORCH_CHECK_VALUE( + sh_coeffs.size(1) == dirs.size(0), + "sh_coeffs must have shape [K, N, D] and dirs must have shape [N, D]"); + } else { + TORCH_CHECK_VALUE( + dirs.dim() == 3, + "sh_coeffs must have shape [K, C, N, D] and dirs must have shape [C, N, D]"); + TORCH_CHECK_VALUE( + sh_coeffs.size(1) == dirs.size(0), + "sh_coeffs must have shape [K, C, N, D] and dirs must have shape [C, N, D]"); + TORCH_CHECK_VALUE( + sh_coeffs.size(2) == dirs.size(1), + "sh_coeffs must have shape [K, C, N, D] and dirs must have shape [C, N, D]"); + } + TORCH_CHECK_VALUE(dirs.is_cuda(), "dirs must be a CUDA tensor"); + TORCH_CHECK_VALUE(dirs.size(-1) == 3, "dirs must have last dimension 3"); + } + + const at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(sh_coeffs.device().index()); // TODO (Francis): Might need to do zeros_like here - torch::Tensor colors = torch::empty_like(dirs); // [..., 3] + const std::vector out_size = + is_packed ? std::vector{ N, D } : std::vector{ C, N, D }; + torch::Tensor colors = torch::empty(out_size, sh_coeffs.options()); // [..., D] using scalar_t = float; - // parallelize over N * 3 + // parallelize over N * D if (!N) { - return colors; // [..., 3] + return colors; // [..., D] } if (is_packed) { - compute_sh_fwd_kernel_packed<<>>( - N, K, sh_degree_to_use, dirs.packed_accessor32(), + compute_sh_fwd_kernel_packed<<>>( + N, K, D, sh_degree_to_use, + dirs.packed_accessor32(), sh_coeffs.packed_accessor32(), radii.data_ptr(), colors.data_ptr()); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - compute_sh_fwd_kernel<<>>( - C, N, K, sh_degree_to_use, + compute_sh_fwd_kernel<<>>( + C, N, K, D, sh_degree_to_use, dirs.packed_accessor32(), sh_coeffs.packed_accessor32(), radii.data_ptr(), colors.data_ptr()); C10_CUDA_KERNEL_LAUNCH_CHECK(); } - return colors; // [..., 3] + return colors; // [..., D] } template <> @@ -577,58 +673,81 @@ template <> std::tuple dispatchSphericalHarmonicsBackward( const int sh_degree_to_use, - const torch::Tensor &dirs, // [C, N, 3] or [N, 3] + const torch::Tensor &dirs, // [C, N, 3] or [N, 3] or empty for degree 0 const torch::Tensor &sh_coeffs, // [C, N, K, 3] or [N, K, 3] const torch::Tensor &v_colors, // [C, N, 3] or [N, 3] const torch::Tensor &radii, // [C, N] or [N] const bool compute_v_dirs) { - const at::cuda::OptionalCUDAGuard device_guard(at::device_of(dirs)); + const at::cuda::OptionalCUDAGuard device_guard(at::device_of(sh_coeffs)); + + const bool is_packed = sh_coeffs.dim() == 3; - TORCH_CHECK_VALUE(dirs.is_cuda(), "dirs must be a CUDA tensor"); TORCH_CHECK_VALUE(sh_coeffs.is_cuda(), "sh_coeffs must be a CUDA tensor"); - TORCH_CHECK_VALUE(v_colors.is_cuda(), "radii must be a CUDA tensor"); TORCH_CHECK_VALUE(radii.is_cuda(), "radii must be a CUDA tensor"); TORCH_CHECK_VALUE(radii.is_contiguous(), "radii must be contiguous"); + TORCH_CHECK_VALUE(v_colors.is_cuda(), "v_colors must be a CUDA tensor"); - const bool is_packed = sh_coeffs.dim() == 3; if (is_packed) { - 
TORCH_CHECK_VALUE(dirs.dim() == 2, "dirs must have shape [N, 3]"); - TORCH_CHECK_VALUE(v_colors.dim() == 2, "v_colors must have shape [N, 3]"); - TORCH_CHECK_VALUE( - sh_coeffs.size(0) == dirs.size(0), - "sh_coeffs and dirs must have the same number of elements in dimension 0"); + TORCH_CHECK_VALUE(sh_coeffs.dim() == 3, "sh_coeffs must have shape [K, N, D]"); + TORCH_CHECK_VALUE(radii.dim() == 1, "radii must have shape [N]"); + TORCH_CHECK_VALUE(sh_coeffs.size(1) == radii.size(0), + "sh_coeffs must have shape [K, N, D] and radii must have shape [N]"); TORCH_CHECK_VALUE( - sh_coeffs.size(0) == v_colors.size(0), + sh_coeffs.size(2) == v_colors.size(0), "sh_coeffs and v_colors must have the same number of elements in dimension 0"); } else { - TORCH_CHECK_VALUE(dirs.dim() == 3, " dirs must have shape [C, N, 3]"); - TORCH_CHECK_VALUE(sh_coeffs.dim() == 4, "sh_coeffs must have shape [C, N, K, 3]"); - TORCH_CHECK_VALUE(v_colors.dim() == 3, "v_colors must have shape [C, N, 3]"); + TORCH_CHECK_VALUE(sh_coeffs.dim() == 4, "sh_coeffs must have shape [K, C, N, D]"); + TORCH_CHECK_VALUE(radii.dim() == 2, "radii must have shape [C, N]"); TORCH_CHECK_VALUE( - sh_coeffs.size(0) == dirs.size(0), - "sh_coeffs and dirs must have the same number of elements in dimension 0"); + sh_coeffs.size(1) == radii.size(0), + "sh_coeffs must have shape [K, C, N, D] and radii must have shape [C, N]"); TORCH_CHECK_VALUE( - sh_coeffs.size(0) == v_colors.size(0), - "sh_coeffs and v_colors must have the same number of elements in dimension 0"); + sh_coeffs.size(2) == radii.size(1), + "sh_coeffs must have shape [K, C, N, D] and radii must have shape [C, N]"); TORCH_CHECK_VALUE( - sh_coeffs.size(1) == dirs.size(1), - "sh_coeffs and dirs must have the same number of elements in dimension 1"); + sh_coeffs.size(1) == v_colors.size(0), + "sh_coeffs and v_colors must have the same number of elements in dimension 0"); TORCH_CHECK_VALUE( - sh_coeffs.size(1) == v_colors.size(1), + sh_coeffs.size(2) == v_colors.size(1), "sh_coeffs and v_colors must have the same number of elements in dimension 1"); } - - TORCH_CHECK(sh_coeffs.size(-1) == 3, "sh_coeffs must have last dimension 3"); - TORCH_CHECK(dirs.size(-1) == 3, "dirs must have last dimension 3"); - TORCH_CHECK(v_colors.size(-1) == 3, "v_colors must have last dimension 3"); - const uint32_t K = sh_coeffs.size(-2); - const uint32_t N = is_packed ? dirs.size(0) : dirs.size(1); - const uint32_t C = is_packed ? 1 : dirs.size(0); - - const uint32_t TOTAL_ELEMS = C * N * 3; + TORCH_CHECK_VALUE( + sh_coeffs.size(-1) == v_colors.size(-1), + "sh_coeffs and v_colors must have the same number of elements in the last dimension"); + + const uint32_t K = sh_coeffs.size(0); + const uint32_t N = is_packed ? sh_coeffs.size(1) : sh_coeffs.size(2); + const uint32_t C = is_packed ? 
1 : sh_coeffs.size(1); + const uint32_t D = sh_coeffs.size(-1); + const uint32_t TOTAL_ELEMS = C * N * D; const uint32_t NUM_BLOCKS = (TOTAL_ELEMS + NUM_THREADS - 1) / NUM_THREADS; - at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(dirs.device().index()); + // If you are using degree > 0, then we are going to use the directions tensor, which means + // we need to check that it has the right shape + if (K > 0 && sh_degree_to_use > 0) { + if (is_packed) { + TORCH_CHECK_VALUE( + dirs.dim() == 2, + "sh_coeffs must have shape [K, N, D] and dirs must have shape [N, D]"); + TORCH_CHECK_VALUE( + sh_coeffs.size(1) == dirs.size(0), + "sh_coeffs must have shape [K, N, D] and dirs must have shape [N, D]"); + } else { + TORCH_CHECK_VALUE( + dirs.dim() == 3, + "sh_coeffs must have shape [K, C, N, D] and dirs must have shape [C, N, D]"); + TORCH_CHECK_VALUE( + sh_coeffs.size(1) == dirs.size(0), + "sh_coeffs must have shape [K, C, N, D] and dirs must have shape [C, N, D]"); + TORCH_CHECK_VALUE( + sh_coeffs.size(2) == dirs.size(1), + "sh_coeffs must have shape [K, C, N, D] and dirs must have shape [C, N, D]"); + } + TORCH_CHECK_VALUE(dirs.is_cuda(), "dirs must be a CUDA tensor"); + TORCH_CHECK_VALUE(dirs.size(-1) == 3, "dirs must have last dimension 3"); + } + + at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(sh_coeffs.device().index()); torch::Tensor v_coeffs = torch::zeros_like(sh_coeffs); torch::Tensor v_dirs; @@ -641,21 +760,24 @@ dispatchSphericalHarmonicsBackward( using scalar_t = float; if (is_packed) { - compute_sh_bwd_kernel_packed<<>>( - N, K, sh_degree_to_use, dirs.packed_accessor32(), + compute_sh_bwd_kernel_packed<<>>( + N, K, D, sh_degree_to_use, + dirs.packed_accessor32(), sh_coeffs.packed_accessor32(), radii.data_ptr(), v_colors.packed_accessor32(), - v_coeffs.data_ptr(), compute_v_dirs ? v_dirs.data_ptr() : nullptr); + v_coeffs.packed_accessor32(), + compute_v_dirs ? v_dirs.data_ptr() : nullptr); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - compute_sh_bwd_kernel<<>>( - C, N, K, sh_degree_to_use, + compute_sh_bwd_kernel<<>>( + C, N, K, D, sh_degree_to_use, dirs.packed_accessor32(), sh_coeffs.packed_accessor32(), radii.data_ptr(), v_colors.packed_accessor32(), - v_coeffs.data_ptr(), compute_v_dirs ? v_dirs.data_ptr() : nullptr); + v_coeffs.packed_accessor32(), + compute_v_dirs ? 
v_dirs.data_ptr() : nullptr); C10_CUDA_KERNEL_LAUNCH_CHECK(); } return std::make_tuple(v_coeffs, v_dirs); // [..., K, 3], [..., 3] diff --git a/fvdb/src/detail/ops/gsplat/VectorTypes.cuh b/fvdb/src/detail/ops/gsplat/VectorTypes.cuh index f77349a07a..acf54dcb67 100644 --- a/fvdb/src/detail/ops/gsplat/VectorTypes.cuh +++ b/fvdb/src/detail/ops/gsplat/VectorTypes.cuh @@ -5,10 +5,12 @@ #define FVDB_DETAIL_OPS_GSPLAT_VECTORTYPES_CUH #include -#include -#include + #include #include +#include + +#include /* Wrap 2D vector types for different scalar types diff --git a/fvdb/src/detail/ops/jagged/JaggedReduce.cu b/fvdb/src/detail/ops/jagged/JaggedReduce.cu index 1788a279b8..f04fab2acc 100644 --- a/fvdb/src/detail/ops/jagged/JaggedReduce.cu +++ b/fvdb/src/detail/ops/jagged/JaggedReduce.cu @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "JaggedOps.h" - #include #include diff --git a/fvdb/src/detail/ops/jagged/JaggedSort.cu b/fvdb/src/detail/ops/jagged/JaggedSort.cu index bfa43a6aed..8052f921a5 100644 --- a/fvdb/src/detail/ops/jagged/JaggedSort.cu +++ b/fvdb/src/detail/ops/jagged/JaggedSort.cu @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "JaggedOps.h" - #include #include diff --git a/fvdb/src/detail/utils/nanovdb/TorchNanoConversions.h b/fvdb/src/detail/utils/nanovdb/TorchNanoConversions.h index 02dda88df3..f83c2f0ef7 100644 --- a/fvdb/src/detail/utils/nanovdb/TorchNanoConversions.h +++ b/fvdb/src/detail/utils/nanovdb/TorchNanoConversions.h @@ -5,6 +5,7 @@ #define FVDB_DETAIL_UTILS_NANOVDB_TORCHNANOCONVERSIONS_H #include + #include namespace fvdb { diff --git a/fvdb/src/python/Bindings.cpp b/fvdb/src/python/Bindings.cpp index fcc7edd9f8..6e2f6ed250 100644 --- a/fvdb/src/python/Bindings.cpp +++ b/fvdb/src/python/Bindings.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "TypeCasters.h" - #include #include #include @@ -86,14 +85,15 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { py::arg("near_plane") = 0.01, py::arg("far_plane") = 1e10, py::arg("sh_degree_to_use") = 3, py::arg("tile_size") = 16, py::arg("radius_clip") = 0.0, py::arg("eps2d") = 0.3, py::arg("antialias") = false, - py::arg("render_depth_channel") = false, py::arg("return_debug_info") = false); + py::arg("render_depth_channel") = false, py::arg("return_debug_info") = false, + py::arg("pixels_to_render") = torch::nullopt); m.def("gaussian_render_depth", &fvdb::gaussianRenderDepth, py::arg("means"), py::arg("quats"), py::arg("scales"), py::arg("opacities"), py::arg("viewmats"), py::arg("Ks"), py::arg("image_width"), py::arg("image_height"), py::arg("near_plane") = 0.01, py::arg("far_plane") = 1e10, py::arg("tile_size") = 16, py::arg("radius_clip") = 0.0, py::arg("eps2d") = 0.3, py::arg("antialias") = false, - py::arg("return_debug_info") = false); + py::arg("return_debug_info") = false, py::arg("pixels_to_render") = torch::nullopt); m.def("precompute_gaussian_render_state", &fvdb::precomputeGaussianRenderStateUnbatched, py::arg("means"), py::arg("quats"), py::arg("scales"), py::arg("opacities"), diff --git a/fvdb/src/python/GridBatchBinding.cpp b/fvdb/src/python/GridBatchBinding.cpp index ad50a65b74..667d6e4d74 100644 --- a/fvdb/src/python/GridBatchBinding.cpp +++ b/fvdb/src/python/GridBatchBinding.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "TypeCasters.h" - #include #include diff --git a/fvdb/src/python/JaggedTensorBinding.cpp b/fvdb/src/python/JaggedTensorBinding.cpp index d8a25cbfd9..16bbb5c1bd 100644 --- a/fvdb/src/python/JaggedTensorBinding.cpp +++ 
b/fvdb/src/python/JaggedTensorBinding.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 // #include "TypeCasters.h" - #include #include diff --git a/fvdb/src/python/TypeCasters.h b/fvdb/src/python/TypeCasters.h index 3c316da779..7f7a59b7c5 100644 --- a/fvdb/src/python/TypeCasters.h +++ b/fvdb/src/python/TypeCasters.h @@ -7,9 +7,10 @@ #include #include +#include + #include #include -#include namespace pybind11 { namespace detail { diff --git a/fvdb/tests/unit/test_gsplat.py b/fvdb/tests/unit/test_gsplat.py index bb31f539c0..8a60ee4bdc 100644 --- a/fvdb/tests/unit/test_gsplat.py +++ b/fvdb/tests/unit/test_gsplat.py @@ -66,8 +66,8 @@ def setUp(self): self.opacities.requires_grad = True self.sh_degree = 3 - self.sh_coeffs = torch.zeros((self.means.shape[0], (self.sh_degree + 1) ** 2, 3), device=self.device) - self.sh_coeffs[:, 0, :] = rgb_to_sh(self.colors) + self.sh_coeffs = torch.zeros(((self.sh_degree + 1) ** 2, self.means.shape[0], 3), device=self.device) + self.sh_coeffs[0, :, :] = rgb_to_sh(self.colors) self.sh_coeffs.requires_grad = True def test_fully_fused_projection(self): @@ -161,7 +161,7 @@ def test_gaussian_render_jagged(self): jt_quats = JaggedTensor([self.quats, self.quats]).to(self.device) jt_scales = JaggedTensor([self.scales, self.scales]).to(self.device) jt_opacities = JaggedTensor([self.opacities, self.opacities]).to(self.device) - jt_sh_coeffs = JaggedTensor([self.sh_coeffs, self.sh_coeffs]).to(self.device) + jt_sh_coeffs = JaggedTensor([self.sh_coeffs.permute(1, 0, 2), self.sh_coeffs.permute(1, 0, 2)]).to(self.device) # The first scene renders to 2 views and the second scene renders to a single view jt_viewmats = JaggedTensor([self.viewmats[:2], self.viewmats[2:]]).to(self.device) From 9d691a88d998e504d9731e9c43b45558d3b05319 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Sun, 22 Dec 2024 16:38:38 +1300 Subject: [PATCH 20/59] Removing GaussianRasterize.cu which was split into two source files Signed-off-by: Jonathan Swartz --- .../detail/ops/gsplat/GaussianRasterize.cu | 846 ------------------ 1 file changed, 846 deletions(-) delete mode 100644 fvdb/src/detail/ops/gsplat/GaussianRasterize.cu diff --git a/fvdb/src/detail/ops/gsplat/GaussianRasterize.cu b/fvdb/src/detail/ops/gsplat/GaussianRasterize.cu deleted file mode 100644 index 100f83d168..0000000000 --- a/fvdb/src/detail/ops/gsplat/GaussianRasterize.cu +++ /dev/null @@ -1,846 +0,0 @@ -// Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: Apache-2.0 -// -#include "GsplatTypes.cuh" - -#include - -#include -#include -#include - -namespace fvdb { -namespace detail { -namespace ops { - -namespace cg = cooperative_groups; - -/**************************************************************************** - * Rasterization to Pixels Forward Pass - ****************************************************************************/ - -template -__global__ void -rasterize_to_pixels_fwd_kernel( - const uint32_t C, const uint32_t N, const uint32_t n_isects, const bool packed, - const vec2 *__restrict__ means2d, // [C, N, 2] or [nnz, 2] - const vec3 *__restrict__ conics, // [C, N, 3] or [nnz, 3] - const S *__restrict__ colors, // [C, N, COLOR_DIM] or [nnz, COLOR_DIM] - const S *__restrict__ opacities, // [C, N] or [nnz] - const S *__restrict__ backgrounds, // [C, COLOR_DIM] - const bool *__restrict__ masks, // [C, tile_height, tile_width] - const uint32_t image_width, const uint32_t image_height, - - const uint32_t image_origin_w, const uint32_t image_origin_h, - - const uint32_t tile_origin_w, const 
uint32_t tile_origin_h, - - const uint32_t tile_size, const uint32_t tile_width, const uint32_t tile_height, - const int32_t *__restrict__ tile_offsets, // [C, tile_height, tile_width] - const int32_t *__restrict__ flatten_ids, // [n_isects] - S *__restrict__ render_colors, // [C, image_height, image_width, COLOR_DIM] - S *__restrict__ render_alphas, // [C, image_height, image_width, 1] - int32_t *__restrict__ last_ids // [C, image_height, image_width] -) { - // each thread draws one pixel, but also timeshares caching gaussians in a - // shared tile - - // printf("tile_origin_h = %u, tile_origin_w = %u\n", tile_origin_h, tile_origin_w); - auto block = cg::this_thread_block(); - const int32_t camera_id = block.group_index().x; - - // blockIdx runs from [0, num_tiles_h] x [0, num_tiles_w] - const int32_t tile_id = (block.group_index().y + tile_origin_h) * tile_width + - block.group_index().z + tile_origin_w; - // Pixel coordinates run from [0, height] x [0, width] - const uint32_t i = block.group_index().y * tile_size + block.thread_index().y; - const uint32_t j = block.group_index().z * tile_size + block.thread_index().x; - - tile_offsets += camera_id * tile_height * tile_width; - render_colors += camera_id * image_height * image_width * COLOR_DIM; - render_alphas += camera_id * image_height * image_width; - last_ids += camera_id * image_height * image_width; - if (backgrounds != nullptr) { - backgrounds += camera_id * COLOR_DIM; - } - if (masks != nullptr) { - masks += camera_id * tile_height * tile_width; - } - - const S px = (S)(j + image_origin_w) + 0.5f; - const S py = (S)(i + image_origin_h) + 0.5f; - const int32_t pix_id = i * image_width + j; - - // return if out of bounds - // keep not rasterizing threads around for reading data - const bool inside = (i < image_height && j < image_width); - bool done = !inside; - - // when the mask is provided, render the background color and return - // if this tile is labeled as False - if (masks != nullptr && inside && !masks[tile_id]) { - for (uint32_t k = 0; k < COLOR_DIM; ++k) { - render_colors[pix_id * COLOR_DIM + k] = backgrounds == nullptr ? 0.0f : backgrounds[k]; - } - return; - } - - // have all threads in tile process the same gaussians in batches - // first collect gaussians between range.x and range.y in batches - // which gaussians to look through in this tile - const int32_t range_start = tile_offsets[tile_id]; - const int32_t range_end = (camera_id == C - 1) && (tile_id == tile_width * tile_height - 1) - ? n_isects - : tile_offsets[tile_id + 1]; - const uint32_t block_size = block.size(); - const uint32_t num_batches = (range_end - range_start + block_size - 1) / block_size; - - extern __shared__ int s[]; - int32_t *id_batch = (int32_t *)s; // [block_size] - vec3 *xy_opacity_batch = - reinterpret_cast *>(&id_batch[block_size]); // [block_size] - vec3 *conic_batch = - reinterpret_cast *>(&xy_opacity_batch[block_size]); // [block_size] - - // current visibility left to render - // transmittance is gonna be used in the backward pass which requires a high - // numerical precision so we use double for it. However double make bwd 1.5x - // slower so we stick with float for now. 
- S T = 1.0f; - // index of most recent gaussian to write to this thread's pixel - uint32_t cur_idx = 0; - - // collect and process batches of gaussians - // each thread loads one gaussian at a time before rasterizing its - // designated pixel - uint32_t tr = block.thread_rank(); - - S pix_out[COLOR_DIM] = { 0.f }; - for (uint32_t b = 0; b < num_batches; ++b) { - // resync all threads before beginning next batch - // end early if entire tile is done - if (__syncthreads_count(done) >= block_size) { - break; - } - - // each thread fetch 1 gaussian from front to back - // index of gaussian to load - uint32_t batch_start = range_start + block_size * b; - uint32_t idx = batch_start + tr; - if (idx < range_end) { - int32_t g = flatten_ids[idx]; // flatten index in [C * N] or [nnz] - id_batch[tr] = g; - const vec2 xy = means2d[g]; - const S opac = opacities[g]; - xy_opacity_batch[tr] = { xy.x, xy.y, opac }; - conic_batch[tr] = conics[g]; - } - - // wait for other threads to collect the gaussians in batch - block.sync(); - - // process gaussians in the current batch for this pixel - uint32_t batch_size = min(block_size, range_end - batch_start); - for (uint32_t t = 0; (t < batch_size) && !done; ++t) { - const vec3 conic = conic_batch[t]; - const vec3 xy_opac = xy_opacity_batch[t]; - const S opac = xy_opac.z; - const vec2 delta = { xy_opac.x - px, xy_opac.y - py }; - const S sigma = 0.5f * (conic.x * delta.x * delta.x + conic.z * delta.y * delta.y) + - conic.y * delta.x * delta.y; - S alpha = min(0.999f, opac * __expf(-sigma)); - if (sigma < 0.f || alpha < 1.f / 255.f) { - continue; - } - - const S next_T = T * (1.0f - alpha); - if (next_T <= 1e-4) { // this pixel is done: exclusive - done = true; - break; - } - - int32_t g = id_batch[t]; - const S vis = alpha * T; - const S *c_ptr = colors + g * COLOR_DIM; - GSPLAT_PRAGMA_UNROLL - for (uint32_t k = 0; k < COLOR_DIM; ++k) { - pix_out[k] += c_ptr[k] * vis; - } - cur_idx = batch_start + t; - - T = next_T; - } - } - - if (inside) { - // Here T is the transmittance AFTER the last gaussian in this pixel. - // We (should) store double precision as T would be used in backward - // pass and it can be very small and causing large diff in gradients - // with float32. However, double precision makes the backward pass 1.5x - // slower so we stick with float for now. - render_alphas[pix_id] = 1.0f - T; - GSPLAT_PRAGMA_UNROLL - for (uint32_t k = 0; k < COLOR_DIM; ++k) { - render_colors[pix_id * COLOR_DIM + k] = - backgrounds == nullptr ? 
pix_out[k] : (pix_out[k] + T * backgrounds[k]); - } - // index in bin of last gaussian in this pixel - last_ids[pix_id] = static_cast(cur_idx); - } -} - -template -std::tuple -call_fwd_kernel_with_dim( - // Gaussian parameters - const torch::Tensor &means2d, // [C, N, 2] or [nnz, 2] - const torch::Tensor &conics, // [C, N, 3] or [nnz, 3] - const torch::Tensor &colors, // [C, N, channels] or [nnz, channels] - const torch::Tensor &opacities, // [C, N] or [nnz] - const at::optional &backgrounds, // [C, channels] - const at::optional &masks, // [C, tile_height, tile_width] - // image size - const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, - const uint32_t image_origin_h, const uint32_t tile_size, - // intersections - const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] - const torch::Tensor &flatten_ids // [n_isects] -) { - GSPLAT_DEVICE_GUARD(means2d); - GSPLAT_CHECK_INPUT(means2d); - GSPLAT_CHECK_INPUT(conics); - GSPLAT_CHECK_INPUT(colors); - GSPLAT_CHECK_INPUT(opacities); - GSPLAT_CHECK_INPUT(tile_offsets); - GSPLAT_CHECK_INPUT(flatten_ids); - if (backgrounds.has_value()) { - GSPLAT_CHECK_INPUT(backgrounds.value()); - } - if (masks.has_value()) { - GSPLAT_CHECK_INPUT(masks.value()); - } - bool packed = means2d.dim() == 2; - - uint32_t C = tile_offsets.size(0); // number of cameras - uint32_t N = packed ? 0 : means2d.size(1); // number of gaussians - uint32_t channels = colors.size(-1); - uint32_t tile_height = tile_offsets.size(1); - uint32_t tile_width = tile_offsets.size(2); - uint32_t n_isects = flatten_ids.size(0); - - const uint32_t tile_origin_w = image_origin_w / tile_size; - const uint32_t tile_origin_h = image_origin_h / tile_size; - const uint32_t tile_extent_w = (image_width + tile_size - 1) / tile_size; - const uint32_t tile_extent_h = (image_height + tile_size - 1) / tile_size; - - // tile_width blocks. - dim3 threads = { tile_size, tile_size, 1 }; - dim3 blocks = { C, tile_extent_h, tile_extent_w }; - - torch::Tensor renders = torch::empty({ C, image_height, image_width, channels }, - means2d.options().dtype(torch::kFloat32)); - torch::Tensor alphas = - torch::empty({ C, image_height, image_width, 1 }, means2d.options().dtype(torch::kFloat32)); - torch::Tensor last_ids = - torch::empty({ C, image_height, image_width }, means2d.options().dtype(torch::kInt32)); - - at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); - const uint32_t shared_mem = - tile_size * tile_size * (sizeof(int32_t) + sizeof(vec3) + sizeof(vec3)); - - // TODO: an optimization can be done by passing the actual number of - // channels into the kernel functions and avoid necessary global memory - // writes. This requires moving the channel padding from python to C side. - if (cudaFuncSetAttribute(rasterize_to_pixels_fwd_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - shared_mem) != cudaSuccess) { - AT_ERROR("Failed to set maximum shared memory size (requested ", shared_mem, - " bytes), try lowering tile_size."); - } - rasterize_to_pixels_fwd_kernel<<>>( - C, N, n_isects, packed, reinterpret_cast *>(means2d.data_ptr()), - reinterpret_cast *>(conics.data_ptr()), colors.data_ptr(), - opacities.data_ptr(), - backgrounds.has_value() ? backgrounds.value().data_ptr() : nullptr, - masks.has_value() ? 
masks.value().data_ptr() : nullptr, image_width, image_height, - image_origin_w, image_origin_h, tile_origin_w, tile_origin_h, tile_size, tile_width, - tile_height, tile_offsets.data_ptr(), flatten_ids.data_ptr(), - renders.data_ptr(), alphas.data_ptr(), last_ids.data_ptr()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return std::make_tuple(renders, alphas, last_ids); -} - -std::tuple -rasterize_to_pixels_fwd_tensor( - // Gaussian parameters - const torch::Tensor &means2d, // [C, N, 2] or [nnz, 2] - const torch::Tensor &conics, // [C, N, 3] or [nnz, 3] - const torch::Tensor &colors, // [C, N, channels] or [nnz, channels] - const torch::Tensor &opacities, // [C, N] or [nnz] - const at::optional &backgrounds, // [C, channels] - const at::optional &masks, // [C, tile_height, tile_width] - // image size - const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, - const uint32_t image_origin_h, const uint32_t tile_size, - // intersections - const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] - const torch::Tensor &flatten_ids // [n_isects] -) { - GSPLAT_CHECK_INPUT(colors); - uint32_t channels = colors.size(-1); - -#define __GS__CALL_FWD_(N) \ - case N: \ - return call_fwd_kernel_with_dim(means2d, conics, colors, opacities, backgrounds, masks, \ - image_width, image_height, image_origin_w, \ - image_origin_h, tile_size, tile_offsets, flatten_ids); - - // TODO: an optimization can be done by passing the actual number of - // channels into the kernel functions and avoid necessary global memory - // writes. This requires moving the channel padding from python to C side. - switch (channels) { - __GS__CALL_FWD_(1) - __GS__CALL_FWD_(2) - __GS__CALL_FWD_(3) - __GS__CALL_FWD_(4) - __GS__CALL_FWD_(5) - __GS__CALL_FWD_(8) - __GS__CALL_FWD_(9) - __GS__CALL_FWD_(16) - __GS__CALL_FWD_(17) - __GS__CALL_FWD_(32) - __GS__CALL_FWD_(33) - __GS__CALL_FWD_(64) - __GS__CALL_FWD_(65) - __GS__CALL_FWD_(128) - __GS__CALL_FWD_(129) - __GS__CALL_FWD_(256) - __GS__CALL_FWD_(257) - __GS__CALL_FWD_(512) - __GS__CALL_FWD_(513) - default: - AT_ERROR("Unsupported number of channels: ", channels); - } -} - -/**************************************************************************** - * Rasterization to Pixels Backward Pass - ****************************************************************************/ - -template -__global__ void -rasterize_to_pixels_bwd_kernel( - const uint32_t C, const uint32_t N, const uint32_t n_isects, const bool packed, - // fwd inputs - const vec2 *__restrict__ means2d, // [C, N, 2] or [nnz, 2] - const vec3 *__restrict__ conics, // [C, N, 3] or [nnz, 3] - const S *__restrict__ colors, // [C, N, COLOR_DIM] or [nnz, COLOR_DIM] - const S *__restrict__ opacities, // [C, N] or [nnz] - const S *__restrict__ backgrounds, // [C, COLOR_DIM] or [nnz, COLOR_DIM] - const bool *__restrict__ masks, // [C, tile_height, tile_width] - const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, - const uint32_t image_origin_h, const uint32_t tile_origin_w, const uint32_t tile_origin_h, - const uint32_t tile_size, const uint32_t tile_width, const uint32_t tile_height, - const int32_t *__restrict__ tile_offsets, // [C, tile_height, tile_width] - const int32_t *__restrict__ flatten_ids, // [n_isects] - // fwd outputs - const S *__restrict__ render_alphas, // [C, image_height, image_width, 1] - const int32_t *__restrict__ last_ids, // [C, image_height, image_width] - // grad outputs - const S *__restrict__ v_render_colors, // [C, image_height, 
image_width, - // COLOR_DIM] - const S *__restrict__ v_render_alphas, // [C, image_height, image_width, 1] - // grad inputs - vec2 *__restrict__ v_means2d_abs, // [C, N, 2] or [nnz, 2] - vec2 *__restrict__ v_means2d, // [C, N, 2] or [nnz, 2] - vec3 *__restrict__ v_conics, // [C, N, 3] or [nnz, 3] - S *__restrict__ v_colors, // [C, N, COLOR_DIM] or [nnz, COLOR_DIM] - S *__restrict__ v_opacities // [C, N] or [nnz] -) { - auto block = cg::this_thread_block(); - uint32_t camera_id = block.group_index().x; - - // blockIdx runs from [0, num_tiles_h] x [0, num_tiles_w] - const int32_t tile_id = (block.group_index().y + tile_origin_h) * tile_width + - block.group_index().z + tile_origin_w; - // Pixel coordinates run from [0, height] x [0, width] - const uint32_t i = block.group_index().y * tile_size + block.thread_index().y; - const uint32_t j = block.group_index().z * tile_size + block.thread_index().x; - - tile_offsets += camera_id * tile_height * tile_width; - render_alphas += camera_id * image_height * image_width; - last_ids += camera_id * image_height * image_width; - v_render_colors += camera_id * image_height * image_width * COLOR_DIM; - v_render_alphas += camera_id * image_height * image_width; - if (backgrounds != nullptr) { - backgrounds += camera_id * COLOR_DIM; - } - if (masks != nullptr) { - masks += camera_id * tile_height * tile_width; - } - - // when the mask is provided, do nothing and return if - // this tile is labeled as False - if (masks != nullptr && !masks[tile_id]) { - return; - } - - const S px = (S)(j + image_origin_w) + 0.5f; - const S py = (S)(i + image_origin_h) + 0.5f; - - // clamp this value to the last pixel - const int32_t pix_id = min(i * image_width + j, image_width * image_height - 1); - - // keep not rasterizing threads around for reading data - const bool inside = (i < image_height && j < image_width); - - // have all threads in tile process the same gaussians in batches - // first collect gaussians between range.x and range.y in batches - // which gaussians to look through in this tile - int32_t range_start = tile_offsets[tile_id]; - int32_t range_end = (camera_id == C - 1) && (tile_id == tile_width * tile_height - 1) - ? n_isects - : tile_offsets[tile_id + 1]; - const uint32_t block_size = block.size(); - const uint32_t num_batches = (range_end - range_start + block_size - 1) / block_size; - - extern __shared__ int s[]; - int32_t *id_batch = (int32_t *)s; // [block_size] - vec3 *xy_opacity_batch = - reinterpret_cast *>(&id_batch[block_size]); // [block_size] - vec3 *conic_batch = - reinterpret_cast *>(&xy_opacity_batch[block_size]); // [block_size] - S *rgbs_batch = (S *)&conic_batch[block_size]; // [block_size * COLOR_DIM] - - // this is the T AFTER the last gaussian in this pixel - S T_final = 1.0f - render_alphas[pix_id]; - S T = T_final; - // the contribution from gaussians behind the current one - S buffer[COLOR_DIM] = { 0.f }; - // index of last gaussian to contribute to this pixel - const int32_t bin_final = inside ? 
last_ids[pix_id] : 0; - - // df/d_out for this pixel - S v_render_c[COLOR_DIM]; - GSPLAT_PRAGMA_UNROLL - for (uint32_t k = 0; k < COLOR_DIM; ++k) { - v_render_c[k] = v_render_colors[pix_id * COLOR_DIM + k]; - } - const S v_render_a = v_render_alphas[pix_id]; - - // collect and process batches of gaussians - // each thread loads one gaussian at a time before rasterizing - const uint32_t tr = block.thread_rank(); - cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); - const int32_t warp_bin_final = cg::reduce(warp, bin_final, cg::greater()); - for (uint32_t b = 0; b < num_batches; ++b) { - // resync all threads before writing next batch of shared mem - block.sync(); - - // each thread fetch 1 gaussian from back to front - // 0 index will be furthest back in batch - // index of gaussian to load - // batch end is the index of the last gaussian in the batch - // These values can be negative so must be int32 instead of uint32 - const int32_t batch_end = range_end - 1 - block_size * b; - const int32_t batch_size = min(block_size, batch_end + 1 - range_start); - const int32_t idx = batch_end - tr; - if (idx >= range_start) { - int32_t g = flatten_ids[idx]; // flatten index in [C * N] or [nnz] - id_batch[tr] = g; - const vec2 xy = means2d[g]; - const S opac = opacities[g]; - xy_opacity_batch[tr] = { xy.x, xy.y, opac }; - conic_batch[tr] = conics[g]; - GSPLAT_PRAGMA_UNROLL - for (uint32_t k = 0; k < COLOR_DIM; ++k) { - rgbs_batch[tr * COLOR_DIM + k] = colors[g * COLOR_DIM + k]; - } - } - // wait for other threads to collect the gaussians in batch - block.sync(); - // process gaussians in the current batch for this pixel - // 0 index is the furthest back gaussian in the batch - for (uint32_t t = max(0, batch_end - warp_bin_final); t < batch_size; ++t) { - bool valid = inside; - if (batch_end - t > bin_final) { - valid = 0; - } - S alpha; - S opac; - vec2 delta; - vec3 conic; - S vis; - - if (valid) { - conic = conic_batch[t]; - vec3 xy_opac = xy_opacity_batch[t]; - opac = xy_opac.z; - delta = { xy_opac.x - px, xy_opac.y - py }; - S sigma = 0.5f * (conic.x * delta.x * delta.x + conic.z * delta.y * delta.y) + - conic.y * delta.x * delta.y; - vis = __expf(-sigma); - alpha = min(0.999f, opac * vis); - if (sigma < 0.f || alpha < 1.f / 255.f) { - valid = false; - } - } - - // if all threads are inactive in this warp, skip this loop - if (!warp.any(valid)) { - continue; - } - S v_rgb_local[COLOR_DIM] = { 0.f }; - vec3 v_conic_local = { 0.f, 0.f, 0.f }; - vec2 v_xy_local = { 0.f, 0.f }; - vec2 v_xy_abs_local = { 0.f, 0.f }; - S v_opacity_local = 0.f; - // initialize everything to 0, only set if the lane is valid - if (valid) { - // compute the current T for this gaussian - S ra = 1.0f / (1.0f - alpha); - T *= ra; - // update v_rgb for this gaussian - const S fac = alpha * T; - GSPLAT_PRAGMA_UNROLL - for (uint32_t k = 0; k < COLOR_DIM; ++k) { - v_rgb_local[k] = fac * v_render_c[k]; - } - // contribution from this pixel - S v_alpha = 0.f; - for (uint32_t k = 0; k < COLOR_DIM; ++k) { - v_alpha += (rgbs_batch[t * COLOR_DIM + k] * T - buffer[k] * ra) * v_render_c[k]; - } - - v_alpha += T_final * ra * v_render_a; - // contribution from background pixel - if (backgrounds != nullptr) { - S accum = 0.f; - GSPLAT_PRAGMA_UNROLL - for (uint32_t k = 0; k < COLOR_DIM; ++k) { - accum += backgrounds[k] * v_render_c[k]; - } - v_alpha += -T_final * ra * accum; - } - - if (opac * vis <= 0.999f) { - const S v_sigma = -opac * vis * v_alpha; - v_conic_local = { 0.5f * v_sigma * delta.x * delta.x, - v_sigma * delta.x 
* delta.y, - 0.5f * v_sigma * delta.y * delta.y }; - v_xy_local = { v_sigma * (conic.x * delta.x + conic.y * delta.y), - v_sigma * (conic.y * delta.x + conic.z * delta.y) }; - if (v_means2d_abs != nullptr) { - v_xy_abs_local = { abs(v_xy_local.x), abs(v_xy_local.y) }; - } - v_opacity_local = vis * v_alpha; - } - - GSPLAT_PRAGMA_UNROLL - for (uint32_t k = 0; k < COLOR_DIM; ++k) { - buffer[k] += rgbs_batch[t * COLOR_DIM + k] * fac; - } - } - warpSum(v_rgb_local, warp); - warpSum(v_conic_local, warp); - warpSum(v_xy_local, warp); - if (v_means2d_abs != nullptr) { - warpSum(v_xy_abs_local, warp); - } - warpSum(v_opacity_local, warp); - if (warp.thread_rank() == 0) { - int32_t g = id_batch[t]; // flatten index in [C * N] or [nnz] - S *v_rgb_ptr = (S *)(v_colors) + COLOR_DIM * g; - GSPLAT_PRAGMA_UNROLL - for (uint32_t k = 0; k < COLOR_DIM; ++k) { - gpuAtomicAdd(v_rgb_ptr + k, v_rgb_local[k]); - } - - S *v_conic_ptr = (S *)(v_conics) + 3 * g; - gpuAtomicAdd(v_conic_ptr, v_conic_local.x); - gpuAtomicAdd(v_conic_ptr + 1, v_conic_local.y); - gpuAtomicAdd(v_conic_ptr + 2, v_conic_local.z); - - S *v_xy_ptr = (S *)(v_means2d) + 2 * g; - gpuAtomicAdd(v_xy_ptr, v_xy_local.x); - gpuAtomicAdd(v_xy_ptr + 1, v_xy_local.y); - - if (v_means2d_abs != nullptr) { - S *v_xy_abs_ptr = (S *)(v_means2d_abs) + 2 * g; - gpuAtomicAdd(v_xy_abs_ptr, v_xy_abs_local.x); - gpuAtomicAdd(v_xy_abs_ptr + 1, v_xy_abs_local.y); - } - - gpuAtomicAdd(v_opacities + g, v_opacity_local); - } - } - } -} - -template -std::tuple -call_bwd_kernel_with_dim( - // Gaussian parameters - const torch::Tensor &means2d, // [C, N, 2] or [nnz, 2] - const torch::Tensor &conics, // [C, N, 3] or [nnz, 3] - const torch::Tensor &colors, // [C, N, 3] or [nnz, 3] - const torch::Tensor &opacities, // [C, N] or [nnz] - const at::optional &backgrounds, // [C, 3] - const at::optional &masks, // [C, tile_height, tile_width] - // image size - const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, - const uint32_t image_origin_h, const uint32_t tile_size, - // intersections - const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] - const torch::Tensor &flatten_ids, // [n_isects] - // forward outputs - const torch::Tensor &render_alphas, // [C, image_height, image_width, 1] - const torch::Tensor &last_ids, // [C, image_height, image_width] - // gradients of outputs - const torch::Tensor &v_render_colors, // [C, image_height, image_width, 3] - const torch::Tensor &v_render_alphas, // [C, image_height, image_width, 1] - // options - bool absgrad) { - GSPLAT_DEVICE_GUARD(means2d); - GSPLAT_CHECK_INPUT(means2d); - GSPLAT_CHECK_INPUT(conics); - GSPLAT_CHECK_INPUT(colors); - GSPLAT_CHECK_INPUT(opacities); - GSPLAT_CHECK_INPUT(tile_offsets); - GSPLAT_CHECK_INPUT(flatten_ids); - GSPLAT_CHECK_INPUT(render_alphas); - GSPLAT_CHECK_INPUT(last_ids); - GSPLAT_CHECK_INPUT(v_render_colors); - GSPLAT_CHECK_INPUT(v_render_alphas); - if (backgrounds.has_value()) { - GSPLAT_CHECK_INPUT(backgrounds.value()); - } - if (masks.has_value()) { - GSPLAT_CHECK_INPUT(masks.value()); - } - - bool packed = means2d.dim() == 2; - - uint32_t C = tile_offsets.size(0); // number of cameras - uint32_t N = packed ? 
0 : means2d.size(1); // number of gaussians - uint32_t n_isects = flatten_ids.size(0); - uint32_t COLOR_DIM = colors.size(-1); - uint32_t tile_height = tile_offsets.size(1); - uint32_t tile_width = tile_offsets.size(2); - - const uint32_t tile_origin_w = image_origin_w / tile_size; - const uint32_t tile_origin_h = image_origin_h / tile_size; - const uint32_t tile_extent_w = (image_width + tile_size - 1) / tile_size; - const uint32_t tile_extent_h = (image_height + tile_size - 1) / tile_size; - - // std::cerr << "RASTERIZE TO PIXELS BACKWARD " << std::endl; - // std::cerr << " BLOCKS = (" << C << ", " << tile_extent_h << ", " << tile_extent_w << ")" - // << std::endl; - // std::cerr << " THREADS = (" << tile_size << ", " << tile_size << ". " << 1 << ")" << - // std::endl; std::cerr << " TILE WIDTH = " << tile_width << ", TILE HEIGHT = " << tile_height - // << std::endl; - - // Each block covers a tile on the image. In total there are - // C * tile_height * tile_width blocks. - dim3 threads = { tile_size, tile_size, 1 }; - dim3 blocks = { C, tile_extent_h, tile_extent_w }; - - torch::Tensor v_means2d = torch::zeros_like(means2d); - torch::Tensor v_conics = torch::zeros_like(conics); - torch::Tensor v_colors = torch::zeros_like(colors); - torch::Tensor v_opacities = torch::zeros_like(opacities); - torch::Tensor v_means2d_abs; - if (absgrad) { - v_means2d_abs = torch::zeros_like(means2d); - } - - if (n_isects) { - const uint32_t shared_mem = tile_size * tile_size * - (sizeof(int32_t) + sizeof(vec3) + sizeof(vec3) + - sizeof(float) * COLOR_DIM); - at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); - - if (cudaFuncSetAttribute(rasterize_to_pixels_bwd_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - shared_mem) != cudaSuccess) { - AT_ERROR("Failed to set maximum shared memory size (requested ", shared_mem, - " bytes), try lowering tile_size."); - } - rasterize_to_pixels_bwd_kernel<<>>( - C, N, n_isects, packed, reinterpret_cast *>(means2d.data_ptr()), - reinterpret_cast *>(conics.data_ptr()), colors.data_ptr(), - opacities.data_ptr(), - backgrounds.has_value() ? backgrounds.value().data_ptr() : nullptr, - masks.has_value() ? masks.value().data_ptr() : nullptr, image_width, image_height, - image_origin_w, image_origin_h, tile_origin_w, tile_origin_h, tile_size, tile_width, - tile_height, tile_offsets.data_ptr(), flatten_ids.data_ptr(), - render_alphas.data_ptr(), last_ids.data_ptr(), - v_render_colors.data_ptr(), v_render_alphas.data_ptr(), - absgrad ? 
reinterpret_cast *>(v_means2d_abs.data_ptr()) : nullptr, - reinterpret_cast *>(v_means2d.data_ptr()), - reinterpret_cast *>(v_conics.data_ptr()), v_colors.data_ptr(), - v_opacities.data_ptr()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - - return std::make_tuple(v_means2d_abs, v_means2d, v_conics, v_colors, v_opacities); -} - -std::tuple -rasterize_to_pixels_bwd_tensor( - // Gaussian parameters - const torch::Tensor &means2d, // [C, N, 2] or [nnz, 2] - const torch::Tensor &conics, // [C, N, 3] or [nnz, 3] - const torch::Tensor &colors, // [C, N, 3] or [nnz, 3] - const torch::Tensor &opacities, // [C, N] or [nnz] - const at::optional &backgrounds, // [C, 3] - const at::optional &masks, // [C, tile_height, tile_width] - // image size - const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, - const uint32_t image_origin_h, const uint32_t tile_size, - // intersections - const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] - const torch::Tensor &flatten_ids, // [n_isects] - // forward outputs - const torch::Tensor &render_alphas, // [C, image_height, image_width, 1] - const torch::Tensor &last_ids, // [C, image_height, image_width] - // gradients of outputs - const torch::Tensor &v_render_colors, // [C, image_height, image_width, 3] - const torch::Tensor &v_render_alphas, // [C, image_height, image_width, 1] - // options - bool absgrad) { - GSPLAT_CHECK_INPUT(colors); - uint32_t COLOR_DIM = colors.size(-1); - -#define __GS__CALL_BWD_(N) \ - case N: \ - return call_bwd_kernel_with_dim( \ - means2d, conics, colors, opacities, backgrounds, masks, image_width, image_height, \ - image_origin_w, image_origin_h, tile_size, tile_offsets, flatten_ids, render_alphas, \ - last_ids, v_render_colors, v_render_alphas, absgrad); - - switch (COLOR_DIM) { - __GS__CALL_BWD_(1) - __GS__CALL_BWD_(2) - __GS__CALL_BWD_(3) - __GS__CALL_BWD_(4) - __GS__CALL_BWD_(5) - __GS__CALL_BWD_(8) - __GS__CALL_BWD_(9) - __GS__CALL_BWD_(16) - __GS__CALL_BWD_(17) - __GS__CALL_BWD_(32) - __GS__CALL_BWD_(33) - __GS__CALL_BWD_(64) - __GS__CALL_BWD_(65) - __GS__CALL_BWD_(128) - __GS__CALL_BWD_(129) - __GS__CALL_BWD_(256) - __GS__CALL_BWD_(257) - __GS__CALL_BWD_(512) - __GS__CALL_BWD_(513) - default: - AT_ERROR("Unsupported number of channels: ", COLOR_DIM); - } -} - -// rasterize_to_pixels -template <> -std::tuple -dispatchGaussianRasterizeForward( - // Gaussian parameters - const torch::Tensor &means2d, // [C, N, 2] - const torch::Tensor &conics, // [C, N, 3] - const torch::Tensor &colors, // [C, N, D] - const torch::Tensor &opacities, // [N] - // image size - const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, - const uint32_t image_origin_h, const uint32_t tile_size, - // intersections - const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] - const torch::Tensor &flatten_ids // [n_isects] -) { - return rasterize_to_pixels_fwd_tensor(means2d, conics, colors, opacities, - std::nullopt /*backgrounds*/, std::nullopt /*mask*/, - image_width, image_height, image_origin_w, image_origin_h, - tile_size, tile_offsets, flatten_ids); -} - -template <> -std::tuple -dispatchGaussianRasterizeForward( - // Gaussian parameters - const torch::Tensor &means2d, // [C, N, 2] - const torch::Tensor &conics, // [C, N, 3] - const torch::Tensor &colors, // [C, N, D] - const torch::Tensor &opacities, // [N] - // image size - const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, - const uint32_t image_origin_h, const uint32_t tile_size, 
- // intersections - const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] - const torch::Tensor &flatten_ids // [n_isects] -) { - TORCH_CHECK_NOT_IMPLEMENTED(false, "CPU implementation not available"); -} - -template <> -std::tuple -dispatchGaussianRasterizeBackward( - // Gaussian parameters - const torch::Tensor &means2d, // [C, N, 2] - const torch::Tensor &conics, // [C, N, 3] - const torch::Tensor &colors, // [C, N, 3] - const torch::Tensor &opacities, // [N] - // image size - const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, - const uint32_t image_origin_h, - - const uint32_t tile_size, - // intersections - const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] - const torch::Tensor &flatten_ids, // [n_isects] - // forward outputs - const torch::Tensor &render_alphas, // [C, image_height, image_width, 1] - const torch::Tensor &last_ids, // [C, image_height, image_width] - // gradients of outputs - const torch::Tensor &v_render_colors, // [C, image_height, image_width, 3] - const torch::Tensor &v_render_alphas, // [C, image_height, image_width, 1] - // options - bool absgrad) { - return rasterize_to_pixels_bwd_tensor( - means2d, conics, colors, opacities, std::nullopt /*backgrounds*/, std::nullopt /*mask*/, - image_width, image_height, image_origin_w, image_origin_h, tile_size, tile_offsets, - flatten_ids, render_alphas, last_ids, v_render_colors, v_render_alphas, absgrad); -} - -template <> -std::tuple -dispatchGaussianRasterizeBackward( - // Gaussian parameters - const torch::Tensor &means2d, // [C, N, 2] - const torch::Tensor &conics, // [C, N, 3] - const torch::Tensor &colors, // [C, N, 3] - const torch::Tensor &opacities, // [N] - - // image size - const uint32_t image_width, const uint32_t image_height, const uint32_t image_origin_w, - const uint32_t image_origin_h, const uint32_t tile_size, - // intersections - const torch::Tensor &tile_offsets, // [C, tile_height, tile_width] - const torch::Tensor &flatten_ids, // [n_isects] - // forward outputs - const torch::Tensor &render_alphas, // [C, image_height, image_width, 1] - const torch::Tensor &last_ids, // [C, image_height, image_width] - // gradients of outputs - const torch::Tensor &v_render_colors, // [C, image_height, image_width, 3] - const torch::Tensor &v_render_alphas, // [C, image_height, image_width, 1] - // options - bool absgrad) { - TORCH_CHECK_NOT_IMPLEMENTED(false, "CPU implementation not available"); -} - -} // namespace ops -} // namespace detail -} // namespace fvdb From 8c92ec47a627a1f6d7c34f2fd4f527d89dd9f6e7 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz <2375296+swahtz@users.noreply.github.com> Date: Mon, 30 Dec 2024 15:04:03 +1300 Subject: [PATCH 21/59] Pull Request Testing Setup (#86) * Added copy-pr-bot. 
Updated triggers to branch pushes per instructions: https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ Signed-off-by: Jonathan Swartz * Add NV CPU runners to tasks requiring larger instances Turning off USE_EXPLICIT_INSTANTIATION, now unblocked by larger runner Signed-off-by: Jonathan Swartz --------- Signed-off-by: Jonathan Swartz --- .github/copy-pr-bot.yaml | 3 +++ .github/workflows/ax.yml | 21 +++------------------ .github/workflows/build.yml | 22 +++------------------- .github/workflows/docs.yml | 13 +------------ .github/workflows/houdini.yml | 18 ++---------------- .github/workflows/nanovdb.yml | 21 +++------------------ .github/workflows/weekly.yml | 6 +++--- .github/workflows/whitespace.yml | 7 +++---- 8 files changed, 21 insertions(+), 90 deletions(-) create mode 100644 .github/copy-pr-bot.yaml diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml new file mode 100644 index 0000000000..4cfbdc7f05 --- /dev/null +++ b/.github/copy-pr-bot.yaml @@ -0,0 +1,3 @@ +enabled: true +auto_sync_draft: false +auto_sync_ready: true diff --git a/.github/workflows/ax.yml b/.github/workflows/ax.yml index 4e56d3d721..560607b84d 100644 --- a/.github/workflows/ax.yml +++ b/.github/workflows/ax.yml @@ -4,22 +4,7 @@ name: AX on: push: branches: - - 'master' - - 'feature/**' - - 'pr/**' - paths-ignore: - - 'CHANGES' - - 'CODEOWNERS' - - 'doc/**' - - 'nanovdb/**' - - 'openvdb_maya/**' - - 'openvdb_houdini/**' - - 'fvdb/**' - - 'pendingchanges/**' - - '**.md' - pull_request: - branches: - - '**' + - "pull-request/[0-9]+" paths-ignore: - 'CHANGES' - 'CODEOWNERS' @@ -55,7 +40,7 @@ jobs: github.event_name != 'workflow_dispatch' || github.event.inputs.type == 'all' || github.event.inputs.type == 'linux' - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} name: > linux-ax:${{ matrix.config.image }}-cxx:${{ matrix.config.cxx }}-${{ matrix.config.build }} container: @@ -152,7 +137,7 @@ jobs: if: | github.event_name == 'workflow_dispatch' && github.event.inputs.type == 'grammar' - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} container: image: aswf/ci-openvdb:2023-clang15 steps: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ed4b12d8fb..4eed4cfaa4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -4,23 +4,7 @@ name: Build on: push: branches: - - 'master' - - 'feature/**' - - 'pr/**' - paths-ignore: - - 'CHANGES' - - 'CODEOWNERS' - - 'doc/**' - - 'openvdb_maya/**' - - 'openvdb_houdini/**' - - 'openvdb_ax/**' - - 'nanovdb/**' - - 'fvdb/**' - - 'pendingchanges/**' - - '**.md' - pull_request: - branches: - - '**' + - "pull-request/[0-9]+" paths-ignore: - 'CHANGES' - 'CODEOWNERS' @@ -62,7 +46,7 @@ jobs: github.event_name != 'workflow_dispatch' || github.event.inputs.type == 'all' || github.event.inputs.type == 'linux' - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 
'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} name: > linux-vfx:${{ matrix.config.image }}- abi:${{ matrix.config.abi }}- @@ -78,7 +62,7 @@ jobs: config: - { cxx: clang++, image: '2024', abi: '12', build: 'Release', cmake: '' } - { cxx: g++, image: '2024', abi: '12', build: 'Release', cmake: '' } - - { cxx: clang++, image: '2024', abi: '12', build: 'Debug', cmake: '-DUSE_EXPLICIT_INSTANTIATION=OFF' } + - { cxx: clang++, image: '2024', abi: '12', build: 'Debug', cmake: '' } - { cxx: clang++, image: '2023', abi: '11', build: 'Release', cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON' } - { cxx: g++, image: '2023', abi: '11', build: 'Release', cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON' } fail-fast: false diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 617ae20a12..b7f6e7b5b2 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -4,17 +4,7 @@ name: Docs on: push: branches: - - 'master' - - 'feature/**' - - 'pr/**' - paths-ignore: - - 'CHANGES' - - 'openvdb_maya/**' - - 'pendingchanges/**' - - '**.md' - pull_request: - branches: - - '**' + - "pull-request/[0-9]+" paths-ignore: - 'CHANGES' - 'openvdb_maya/**' @@ -156,4 +146,3 @@ jobs: - Deployed from: AcademySoftwareFoundation/openvdb ${{ github.sha }} Signed-off-by: ${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>" - diff --git a/.github/workflows/houdini.yml b/.github/workflows/houdini.yml index 28efc703b8..ac87f2dc2d 100644 --- a/.github/workflows/houdini.yml +++ b/.github/workflows/houdini.yml @@ -4,21 +4,7 @@ name: Houdini on: push: branches: - - 'master' - - 'feature/**' - - 'pr/**' - paths-ignore: - - 'CHANGES' - - 'CODEOWNERS' - - 'doc/**' - - 'nanovdb/**' - - 'openvdb_maya/**' - - 'fvdb/**' - - 'pendingchanges/**' - - '**.md' - pull_request: - branches: - - '**' + - "pull-request/[0-9]+" paths-ignore: - 'CHANGES' - 'CODEOWNERS' @@ -65,7 +51,7 @@ jobs: if: > ${{ needs.checksecret.outputs.HOUDINI_SECRETS == 'true' || github.repository_owner == 'AcademySoftwareFoundation' }} - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} name: hou:${{ matrix.config.hou_hash }}-vfx:${{ matrix.config.image }}-cxx:${{ matrix.config.cxx }} container: image: aswf/ci-base:${{ matrix.config.image }} diff --git a/.github/workflows/nanovdb.yml b/.github/workflows/nanovdb.yml index fdaa3d3a36..f3581cbc7d 100644 --- a/.github/workflows/nanovdb.yml +++ b/.github/workflows/nanovdb.yml @@ -4,22 +4,7 @@ name: NanoVDB on: push: branches: - - 'master' - - 'feature/**' - - 'pr/**' - paths-ignore: - - 'CHANGES' - - 'CODEOWNERS' - - 'doc/**' - - 'openvdb_ax/**' - - 'openvdb_maya/**' - - 'openvdb_houdini/**' - - 'fvdb/**' - - 'pendingchanges/**' - - '**.md' - pull_request: - branches: - - '**' + - "pull-request/[0-9]+" paths-ignore: - 'CHANGES' - 'CODEOWNERS' @@ -51,7 +36,7 @@ jobs: github.event_name != 'workflow_dispatch' || github.event.inputs.type == 'all' || github.event.inputs.type == 'linux' - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 
'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} name: > linux-nanovdb:cxx:${{ matrix.config.cxx }}-${{ matrix.config.build }} container: @@ -171,7 +156,7 @@ jobs: github.event_name != 'workflow_dispatch' || github.event.inputs.type == 'all' || github.event.inputs.type == 'linux' - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} container: image: aswf/ci-openvdb:2024 steps: diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index 57dec07ec8..d5ea701aef 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -146,7 +146,7 @@ jobs: github.event_name != 'workflow_dispatch' || github.event.inputs.type == 'all' || github.event.inputs.type == 'extra' - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} name: linux-extra:${{ matrix.config.name }} container: image: aswf/ci-openvdb:2024 @@ -284,7 +284,7 @@ jobs: github.event_name != 'workflow_dispatch' || github.event.inputs.type == 'all' || github.event.inputs.type == 'ax' - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} name: > linux-ax:${{ matrix.config.image }}-cxx:${{ matrix.config.cxx }}-${{ matrix.config.build }} container: @@ -493,7 +493,7 @@ jobs: github.event_name != 'workflow_dispatch' || github.event.inputs.type == 'all' || github.event.inputs.type == 'blosc' - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} name: linux-blosc:${{ matrix.blosc }} container: image: aswf/ci-base:2023 diff --git a/.github/workflows/whitespace.yml b/.github/workflows/whitespace.yml index fb2e0187b1..57c3f74b00 100644 --- a/.github/workflows/whitespace.yml +++ b/.github/workflows/whitespace.yml @@ -1,4 +1,3 @@ - name: Whitespace on: @@ -21,7 +20,7 @@ jobs: trailingspaces: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: test run: | set +e @@ -32,9 +31,9 @@ jobs: spacesnottabs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: test run: | set +e - git grep -n " " -- ':!*/whitespace.yml' ':!tsc/meetings/*' ':!*.svg' ':!*.cmd' ':!*.png' ':!pendingchanges/*' ':!*.wlt' ':!*.jpg' ':!*.gif' ':!*.mp4' ':!*.pt' ':!*.pth' ':!*.nvdb' ':!*.npz' + git grep -n " " -- ':!*/whitespace.yml' ':!tsc/meetings/*' ':!*.svg' ':!*.cmd' ':!*.png' ':!pendingchanges/*' ':!*.wlt' ':!*.jpg' ':!*.gif' ':!*.mp4' ':!*.pt' ':!*.pth' ':!*.nvdb' ':!*.npz' ':!*.gitmodules' test $? 
-eq 1

From 1251b0e01640338f8648277b71dca4dbfc2c029e Mon Sep 17 00:00:00 2001
From: Matthew Cong
Date: Tue, 24 Dec 2024 14:36:56 -0800
Subject: [PATCH 22/59] Fix missing OpenVDB dependency for NanoVDB-only build

Signed-off-by: Matthew Cong
---
 nanovdb/nanovdb/examples/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nanovdb/nanovdb/examples/CMakeLists.txt b/nanovdb/nanovdb/examples/CMakeLists.txt
index 93c888710b..5d23668d17 100644
--- a/nanovdb/nanovdb/examples/CMakeLists.txt
+++ b/nanovdb/nanovdb/examples/CMakeLists.txt
@@ -100,7 +100,7 @@ nanovdb_example(NAME "ex_read_nanovdb_sphere")
 nanovdb_example(NAME "ex_read_nanovdb_sphere_accessor")
 nanovdb_example(NAME "ex_read_nanovdb_sphere_accessor_cuda")
 nanovdb_example(NAME "ex_index_grid_cuda")
-nanovdb_example(NAME "ex_nodemanager_cuda")
+nanovdb_example(NAME "ex_nodemanager_cuda" OPENVDB)
 nanovdb_example(NAME "ex_voxels_to_grid_cuda")
 nanovdb_example(NAME "ex_modify_nanovdb_thrust")
 nanovdb_example(NAME "ex_map_pool_buffer")

From adfd4cd1ef9241cd6be5b5cb7c8cc3289ce136b1 Mon Sep 17 00:00:00 2001
From: Ken Museth <1495380+kmuseth@users.noreply.github.com>
Date: Tue, 31 Dec 2024 12:48:56 -0800
Subject: [PATCH 23/59] improved change notes and updated version number (#95)

* improved change notes and updated version number

Signed-off-by: Ken Museth

* improved documentation

Signed-off-by: Ken Museth

---------

Signed-off-by: Ken Museth
---
 nanovdb/nanovdb/io/IO.h             | 2 +-
 openvdb_cmd/vdb_tool/include/Tool.h | 4 ++--
 pendingchanges/vdb_tool.txt         | 5 ++++-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/nanovdb/nanovdb/io/IO.h b/nanovdb/nanovdb/io/IO.h
index 5d5fc94141..50645c450c 100644
--- a/nanovdb/nanovdb/io/IO.h
+++ b/nanovdb/nanovdb/io/IO.h
@@ -14,7 +14,7 @@
     \note This file does NOT depend on OpenVDB, but optionally on ZIP and BLOSC

-    \details NanoVDB files take on one of two following formats:
+    \details NanoVDB files take on one of the two following formats:
     1) multiple segments each with multiple grids (segments have easy to access metadata about its grids)
     2) starting with version 32.6.0 nanovdb files also support a raw buffer with one or more grids (just a
        dump of a raw grid buffer, so no new metadata in headers as when using segments mentioned above).

diff --git a/openvdb_cmd/vdb_tool/include/Tool.h b/openvdb_cmd/vdb_tool/include/Tool.h
index b24e71696e..e558be1923 100644
--- a/openvdb_cmd/vdb_tool/include/Tool.h
+++ b/openvdb_cmd/vdb_tool/include/Tool.h
@@ -120,8 +120,8 @@ class Tool
 private:

     static const int sMajor =10;// incremented for incompatible changes options or file.
-    static const int sMinor = 6;// incremented for new functionality that is backwards-compatible.
-    static const int sPatch = 1;// incremented for backwards-compatible bug fixes.
+    static const int sMinor = 7;// incremented for new functionality that is backwards-compatible.
+    static const int sPatch = 0;// incremented for backwards-compatible bug fixes.
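     // Illustrative sketch only (not from this patch; the helper and its name
     // are hypothetical): the semantic-versioning triple above is typically
     // surfaced as a dotted string, which after this change would read "10.7.0".
     //
     //     static std::string versionString() {
     //         return std::to_string(sMajor) + "." +
     //                std::to_string(sMinor) + "." +
     //                std::to_string(sPatch);
     //     }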
using GridT = FloatGrid;
     using FilterT = std::unique_ptr<openvdb::tools::LevelSetFilter<GridT>>;

diff --git a/pendingchanges/vdb_tool.txt b/pendingchanges/vdb_tool.txt
index c68dc25834..5d58d594df 100644
--- a/pendingchanges/vdb_tool.txt
+++ b/pendingchanges/vdb_tool.txt
@@ -1 +1,4 @@
-added read and write support for OFF (Object File Format) files to vdb_tool
\ No newline at end of file
+1) Fixed bugs that prevented build with newer namespaces in nanovdb
+2) Improved read and write of PLY files so they support Vec3d and 64 bit attributes
+3) Added read and write support for OFF (Object File Format) files to vdb_tool
+4) Changed version number from 10.6.1 to 10.7.0
\ No newline at end of file

From a004afbb050b456772541bd844b88dcbdd2753bc Mon Sep 17 00:00:00 2001
From: Matthew Cong <1372750+matthewdcong@users.noreply.github.com>
Date: Thu, 2 Jan 2025 09:57:39 -0800
Subject: [PATCH 24/59] Separate PointsToGrid kernel lambdas into functors (#94)

Signed-off-by: Matthew Cong
---
 nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh | 344 +++++++++++++-------
 nanovdb/nanovdb/util/cuda/Util.h            |  11 +
 2 files changed, 242 insertions(+), 113 deletions(-)

diff --git a/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh b/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh
index 07c3dab2ba..fdfe1267ce 100644
--- a/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh
+++ b/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh
@@ -433,10 +433,10 @@ namespace kernels {
 /// or 'else' block of a constexpr if statement.
 /// function in a lambda through lambdaKernel wrapper defined in CudaUtils.h.
 template <typename BuildT>
-__global__ void fillValueIndexKernel(const size_t numItems, uint64_t* devValueIndex, typename PointsToGrid<BuildT>::Data* d_data) {
+__global__ void fillValueIndexKernel(const size_t numItems, unsigned int offset, uint64_t* devValueIndex, typename PointsToGrid<BuildT>::Data* d_data) {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     if (tid >= numItems) return;
-    devValueIndex[tid] = static_cast<uint64_t>(d_data->getLeaf(tid).mValueMask.countOn());
+    devValueIndex[tid + offset] = static_cast<uint64_t>(d_data->getLeaf(tid + offset).mValueMask.countOn());
 }

 /// @details Used by PointsToGrid::processLeafNodes for the computation
@@ -446,11 +446,11 @@ __global__ void fillValueIndexKernel(const size_t numItems, uint64_t* devValueIn
 /// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if'
 /// or 'else' block of a constexpr if statement.
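 ///
 /// A minimal sketch of the pattern (an illustration, not code from this patch;
 /// `process` and the launch parameters are hypothetical, while
 /// `fillValueIndexKernel` and `BuildTraits` are names used in this file):
 /// @code
 /// template<typename BuildT>
 /// void process(uint64_t *devValueIndex, typename PointsToGrid<BuildT>::Data *d_data,
 ///              size_t numItems, unsigned int numBlocks, unsigned int blockSize)
 /// {
 ///     if constexpr(BuildTraits<BuildT>::is_onindex) {
 ///         // An extended lambda here, e.g.
 ///         //   auto op = [=] __device__ (size_t tid) { ... };
 ///         // triggers the nvcc error quoted above, so the body lives in a
 ///         // named __global__ kernel instead:
 ///         kernels::fillValueIndexKernel<BuildT><<<numBlocks, blockSize>>>(
 ///             numItems, 0u, devValueIndex, d_data);
 ///     }
 /// }
 /// @endcode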
template -__global__ void leafPrefixSumKernel(const size_t numItems, uint64_t* devValueIndexPrefix, typename PointsToGrid::Data* d_data) { +__global__ void leafPrefixSumKernel(const size_t numItems, unsigned int offset, uint64_t* devValueIndexPrefix, typename PointsToGrid::Data* d_data) { const int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid >= numItems) return; - auto &leaf = d_data->getLeaf(tid); + auto &leaf = d_data->getLeaf(tid + offset); leaf.mOffset = 1u;// will be re-set below const uint64_t *w = leaf.mValueMask.words(); uint64_t &prefixSum = leaf.mPrefixSum, sum = util::countOn(*w++); @@ -459,11 +459,11 @@ __global__ void leafPrefixSumKernel(const size_t numItems, uint64_t* devValueInd sum += util::countOn(*w++); prefixSum |= sum << n;// each pre-fixed sum is encoded in 9 bits } - if (tid==0) { + if ((tid + offset) == 0) { d_data->getGrid().mData1 = 1u + devValueIndexPrefix[d_data->nodeCount[0]-1];// set total count d_data->getTree().mVoxelCount = devValueIndexPrefix[d_data->nodeCount[0]-1]; } else { - leaf.mOffset = 1u + devValueIndexPrefix[tid-1];// background is index 0 + leaf.mOffset = 1u + devValueIndexPrefix[tid + offset -1];// background is index 0 } } @@ -473,10 +473,10 @@ __global__ void leafPrefixSumKernel(const size_t numItems, uint64_t* devValueInd /// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' /// or 'else' block of a constexpr if statement. template -__global__ void setMaskEqValMaskKernel(const size_t numItems, typename PointsToGrid::Data* d_data) { +__global__ void setMaskEqValMaskKernel(const size_t numItems, unsigned int offset, typename PointsToGrid::Data* d_data) { const int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid >= numItems) return; - auto &leaf = d_data->getLeaf(tid); + auto &leaf = d_data->getLeaf(tid + offset); leaf.mMask = leaf.mValueMask; } } // namespace kernels @@ -559,6 +559,53 @@ struct ShiftRightIterator : public cub::TransformInputIterator +struct TileKeyFunctor { + using Vec3T = typename util::remove_const::element_type>::type; + + __device__ + void operator()(size_t tid, const typename PointsToGrid::Data *d_data, const PtrT points, uint64_t* d_keys, uint32_t* d_indx) { + auto coordToKey = [](const Coord &ijk)->uint64_t{ + // Note: int32_t has a range of -2^31 to 2^31 - 1 whereas uint32_t has a range of 0 to 2^32 - 1 + static constexpr int64_t kOffset = 1 << 31; + return (uint64_t(uint32_t(int64_t(ijk[2]) + kOffset) >> 12) ) | // z is the lower 21 bits + (uint64_t(uint32_t(int64_t(ijk[1]) + kOffset) >> 12) << 21) | // y is the middle 21 bits + (uint64_t(uint32_t(int64_t(ijk[0]) + kOffset) >> 12) << 42); // x is the upper 21 bits + };// coordToKey lambda functor + d_indx[tid] = uint32_t(tid); + uint64_t &key = d_keys[tid]; + if constexpr(util::is_same::value) {// points are in world space + if constexpr(util::is_same::value) { + key = coordToKey(d_data->map.applyInverseMapF(points[tid]).round()); + } else {// points are Vec3d + key = coordToKey(d_data->map.applyInverseMap(points[tid]).round()); + } + } else if constexpr(util::is_same::value) {// points Coord are in index space + key = coordToKey(points[tid]); + } else {// points are Vec3f or Vec3d in index space + key = coordToKey(points[tid].round()); + } + } +}; + +template +struct VoxelKeyFunctor { + using Vec3T = typename util::remove_const::element_type>::type; + + __device__ + void operator()(size_t tid, const typename PointsToGrid::Data *d_data, const PtrT points, uint64_t id, uint64_t *d_keys, const uint32_t *d_indx) { + 
auto voxelKey = [] __device__ (uint64_t tileID, const Coord &ijk){ + return tileID << 36 | // upper offset: 64-15-12-9=28, i.e. last 28 bits + uint64_t(NanoUpper::CoordToOffset(ijk)) << 21 | // lower offset: 32^3 = 2^15, i.e. next 15 bits + uint64_t(NanoLower::CoordToOffset(ijk)) << 9 | // leaf offset: 16^3 = 2^12, i.e. next 12 bits + uint64_t(NanoLeaf< BuildT>::CoordToOffset(ijk)); // voxel offset: 8^3 = 2^9, i.e. first 9 bits + };// voxelKey lambda functor + Vec3T p = points[d_indx[tid]]; + if constexpr(util::is_same::value) p = util::is_same::value ? d_data->map.applyInverseMapF(p) : d_data->map.applyInverseMap(p); + d_keys[tid] = voxelKey(id, p.round()); + } +}; + template template void PointsToGrid::countNodes(const PtrT points, size_t pointCount) @@ -589,28 +636,7 @@ jump:// this marks the beginning of the actual algorithm auto *d_indx = mMemPool.template alloc(pointCount, mStream); if (mVerbose==2) mTimer.restart("Generate tile keys"); - util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, const Data *d_data, const PtrT points) { - auto coordToKey = [](const Coord &ijk)->uint64_t{ - // Note: int32_t has a range of -2^31 to 2^31 - 1 whereas uint32_t has a range of 0 to 2^32 - 1 - static constexpr int64_t offset = 1 << 31; - return (uint64_t(uint32_t(int64_t(ijk[2]) + offset) >> 12) ) | // z is the lower 21 bits - (uint64_t(uint32_t(int64_t(ijk[1]) + offset) >> 12) << 21) | // y is the middle 21 bits - (uint64_t(uint32_t(int64_t(ijk[0]) + offset) >> 12) << 42); // x is the upper 21 bits - };// coordToKey lambda functor - d_indx[tid] = uint32_t(tid); - uint64_t &key = d_keys[tid]; - if constexpr(util::is_same::value) {// points are in world space - if constexpr(util::is_same::value) { - key = coordToKey(d_data->map.applyInverseMapF(points[tid]).round()); - } else {// points are Vec3d - key = coordToKey(d_data->map.applyInverseMap(points[tid]).round()); - } - } else if constexpr(util::is_same::value) {// points Coord are in index space - key = coordToKey(points[tid]); - } else {// points are Vec3f or Vec3d in index space - key = coordToKey(points[tid].round()); - } - }, mDeviceData, points); + util::cuda::lambdaKernel<<>>(pointCount, TileKeyFunctor(), mDeviceData, points, d_keys, d_indx); cudaCheckError(); if (mVerbose==2) mTimer.restart("DeviceRadixSort of "+std::to_string(pointCount)+" tile keys"); CALL_CUBS(DeviceRadixSort::SortPairs, d_keys, mData.d_keys, d_indx, mData.d_indx, pointCount, 0, 63);// 21 bits per coord @@ -627,25 +653,15 @@ jump:// this marks the beginning of the actual algorithm mData.d_tile_keys = mMemPool.template alloc(mData.nodeCount[2], mStream); cudaCheck(cudaMemcpyAsync(mData.d_tile_keys, d_keys, mData.nodeCount[2]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); - if (mVerbose) mTimer.restart("DeviceRadixSort of " + std::to_string(pointCount) + " voxel keys in " + std::to_string(mData.nodeCount[2]) + " tiles"); + if (mVerbose==2) mTimer.restart("DeviceRadixSort of " + std::to_string(pointCount) + " voxel keys in " + std::to_string(mData.nodeCount[2]) + " tiles"); uint32_t *points_per_tile = new uint32_t[mData.nodeCount[2]]; cudaCheck(cudaMemcpyAsync(points_per_tile, d_points_per_tile, mData.nodeCount[2]*sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); mMemPool.free(d_points_per_tile, mStream); for (uint32_t id = 0, offset = 0; id < mData.nodeCount[2]; ++id) { const uint32_t count = points_per_tile[id]; - util::cuda::lambdaKernel<<>>(count, [=] __device__(size_t tid, const Data *d_data) { - auto voxelKey = [] __device__ (uint64_t 
tileID, const Coord &ijk){ - return tileID << 36 | // upper offset: 64-15-12-9=28, i.e. last 28 bits - uint64_t(NanoUpper::CoordToOffset(ijk)) << 21 | // lower offset: 32^3 = 2^15, i.e. next 15 bits - uint64_t(NanoLower::CoordToOffset(ijk)) << 9 | // leaf offset: 16^3 = 2^12, i.e. next 12 bits - uint64_t(NanoLeaf< BuildT>::CoordToOffset(ijk)); // voxel offset: 8^3 = 2^9, i.e. first 9 bits - };// voxelKey lambda functor - tid += offset; - Vec3T p = points[d_indx[tid]]; - if constexpr(util::is_same::value) p = util::is_same::value ? d_data->map.applyInverseMapF(p) : d_data->map.applyInverseMap(p); - d_keys[tid] = voxelKey(id, p.round()); - }, mDeviceData); cudaCheckError(); + util::cuda::offsetLambdaKernel<<>>(count, offset, VoxelKeyFunctor(), mDeviceData, points, id, d_keys, d_indx); + cudaCheckError(); CALL_CUBS(DeviceRadixSort::SortPairs, d_keys + offset, mData.d_keys + offset, d_indx + offset, mData.d_indx + offset, count, 0, 36);// 9+12+15=36 offset += count; } @@ -789,12 +805,13 @@ inline BufferT PointsToGrid::getBuffer(const PtrT, size_t pointCount, co //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -template -template -inline void PointsToGrid::processGridTreeRoot(const PtrT points, size_t pointCount) +template +struct BuildGridTreeRootFunctor { using Vec3T = typename util::remove_const::element_type>::type; - util::cuda::lambdaKernel<<<1, 1, 0, mStream>>>(1, [=] __device__(size_t, Data *d_data, PointType pointType) { + + __device__ + void operator()(size_t, typename PointsToGrid::Data *d_data, PointType pointType, size_t pointCount) { // process Root auto &root = d_data->getRoot(); root.mBBox = CoordBBox(); // init to empty @@ -913,7 +930,14 @@ inline void PointsToGrid::processGridTreeRoot(const PtrT points, size_t grid.mData1 = 1u + 512u*d_data->nodeCount[0]; grid.mGridClass = GridClass::IndexGrid; } - }, mDeviceData, mPointType);// lambdaKernel + } +}; + +template +template +inline void PointsToGrid::processGridTreeRoot(const PtrT points, size_t pointCount) +{ + util::cuda::lambdaKernel<<<1, 1, 0, mStream>>>(1, BuildGridTreeRootFunctor(), mDeviceData, mPointType, pointCount);// lambdaKernel cudaCheckError(); char *dst = mData.getGrid().mGridName; @@ -927,9 +951,10 @@ inline void PointsToGrid::processGridTreeRoot(const PtrT points, size_t //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- template -inline void PointsToGrid::processUpperNodes() +struct BuildUpperNodesFunctor { - util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data) { auto &root = d_data->getRoot(); auto &upper = d_data->getUpper(tid); #if 1 @@ -951,25 +976,39 @@ inline void PointsToGrid::processUpperNodes() upper.mChildMask.setOff(); upper.mMinimum = upper.mMaximum = NanoLower::ValueType(0); upper.mAverage = upper.mStdDevi = NanoLower::FloatType(0); - }, mDeviceData); + } +}; + +template +struct SetUpperBackgroundValuesFunctor +{ + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data) { + auto &upper = d_data->getUpper(tid >> 15); + upper.mTable[tid & 32767u].value = NanoUpper::ValueType(0);// background + } +}; + +template +inline void PointsToGrid::processUpperNodes() +{ + 
util::cuda::lambdaKernel<<>>(mData.nodeCount[2], BuildUpperNodesFunctor(), mDeviceData); cudaCheckError(); mMemPool.free(mData.d_tile_keys, mStream); const uint64_t valueCount = mData.nodeCount[2] << 15; - util::cuda::lambdaKernel<<>>(valueCount, [=] __device__(size_t tid, Data *d_data) { - auto &upper = d_data->getUpper(tid >> 15); - upper.mTable[tid & 32767u].value = NanoUpper::ValueType(0);// background - }, mDeviceData); + util::cuda::lambdaKernel<<>>(valueCount, SetUpperBackgroundValuesFunctor(), mDeviceData); cudaCheckError(); }// PointsToGrid::processUpperNodes //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- template -inline void PointsToGrid::processLowerNodes() +struct BuildLowerNodesFunctor { - util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data) { auto &root = d_data->getRoot(); const uint64_t lowerKey = d_data->d_lower_keys[tid]; auto &upper = d_data->getUpper(lowerKey >> 15); @@ -983,28 +1022,37 @@ inline void PointsToGrid::processLowerNodes() lower.mChildMask.setOff(); lower.mMinimum = lower.mMaximum = NanoLower::ValueType(0);// background; lower.mAverage = lower.mStdDevi = NanoLower::FloatType(0); - }, mDeviceData); - cudaCheckError(); + } +}; - const uint64_t valueCount = mData.nodeCount[1] << 12; - util::cuda::lambdaKernel<<>>(valueCount, [=] __device__(size_t tid, Data *d_data) { +template +struct SetLowerBackgroundValuesFunctor +{ + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data) { auto &lower = d_data->getLower(tid >> 12); lower.mTable[tid & 4095u].value = NanoLower::ValueType(0);// background - }, mDeviceData); + } +}; + +template +inline void PointsToGrid::processLowerNodes() +{ + util::cuda::lambdaKernel<<>>(mData.nodeCount[1], BuildLowerNodesFunctor(), mDeviceData); + cudaCheckError(); + + const uint64_t valueCount = mData.nodeCount[1] << 12; + util::cuda::lambdaKernel<<>>(valueCount, SetLowerBackgroundValuesFunctor(), mDeviceData); cudaCheckError(); }// PointsToGrid::processLowerNodes //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- template -template -inline void PointsToGrid::processLeafNodes(const PtrT points) +struct ProcessLeafMetaDataFunctor { - const uint8_t flags = static_cast(mData.flags.data());// mIncludeStats ? 
16u : 0u;// 4th bit indicates stats - - if (mVerbose==2) mTimer.start("process leaf meta data"); - // loop over leaf nodes and add it to its parent node - util::cuda::lambdaKernel<<>>(mData.nodeCount[0], [=] __device__(size_t tid, Data *d_data) { + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data, uint8_t flags) { const uint64_t leafKey = d_data->d_leaf_keys[tid], tile_id = leafKey >> 27; auto &upper = d_data->getUpper(tile_id); const uint32_t lowerOffset = leafKey & 4095u, upperOffset = (leafKey >> 12) & 32767u; @@ -1027,11 +1075,14 @@ inline void PointsToGrid::processLeafNodes(const PtrT points) leaf.mAverage = leaf.mStdDevi = NanoLeaf::FloatType(0); leaf.mMinimum = leaf.mMaximum = NanoLeaf::ValueType(0); } - }, mDeviceData); cudaCheckError(); + } +}; - if (mVerbose==2) mTimer.restart("set active voxel state and values"); - // loop over all active voxels and set LeafNode::mValueMask and LeafNode::mValues - util::cuda::lambdaKernel<<>>(mData.voxelCount, [=] __device__(size_t tid, Data *d_data) { +template +struct SetLeafActiveVoxelStateAndValuesFunctor +{ + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data) { const uint32_t pointID = d_data->pointsPerVoxelPrefix[tid]; const uint64_t voxelKey = d_data->d_keys[pointID]; auto &upper = d_data->getUpper(voxelKey >> 36); @@ -1044,17 +1095,14 @@ inline void PointsToGrid::processLeafNodes(const PtrT points) } else if constexpr(!BuildTraits::is_special) { leaf.mValues[n] = NanoLeaf::ValueType(1);// set value of active voxels that are not points (or index) } - }, mDeviceData); cudaCheckError(); - - mMemPool.free(mData.d_keys, mStream); - mMemPool.free(mData.pointsPerVoxel, mStream); - mMemPool.free(mData.pointsPerVoxelPrefix, mStream); - mMemPool.free(mData.pointsPerLeafPrefix, mStream); - mMemPool.free(mData.pointsPerLeaf, mStream); + } +}; - if (mVerbose==2) mTimer.restart("set inactive voxel values"); - const uint64_t denseVoxelCount = mData.nodeCount[0] << 9; - util::cuda::lambdaKernel<<>>(denseVoxelCount, [=] __device__(size_t tid, Data *d_data) { +template +struct SetLeafInactiveVoxelValuesFunctor +{ + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data) { auto &leaf = d_data->getLeaf(tid >> 9u); const uint32_t n = tid & 511u; if (leaf.mValueMask.isOn(n)) return; @@ -1064,24 +1112,52 @@ inline void PointsToGrid::processLeafNodes(const PtrT points) } else if constexpr(!BuildTraits::is_special) { leaf.mValues[n] = NanoLeaf::ValueType(0);// value of inactive voxels } - }, mDeviceData); cudaCheckError(); + } +}; + +template +template +inline void PointsToGrid::processLeafNodes(const PtrT points) +{ + const uint8_t flags = static_cast(mData.flags.data());// mIncludeStats ? 
16u : 0u;// 4th bit indicates stats + + if (mVerbose==2) mTimer.start("process leaf meta data"); + // loop over leaf nodes and add it to its parent node + util::cuda::lambdaKernel<<>>(mData.nodeCount[0], ProcessLeafMetaDataFunctor(), mDeviceData, flags); + cudaCheckError(); + + if (mVerbose==2) mTimer.restart("set active voxel state and values"); + // loop over all active voxels and set LeafNode::mValueMask and LeafNode::mValues + util::cuda::lambdaKernel<<>>(mData.voxelCount, SetLeafActiveVoxelStateAndValuesFunctor(), mDeviceData); + cudaCheckError(); + + mMemPool.free(mData.d_keys, mStream); + mMemPool.free(mData.pointsPerVoxel, mStream); + mMemPool.free(mData.pointsPerVoxelPrefix, mStream); + mMemPool.free(mData.pointsPerLeafPrefix, mStream); + mMemPool.free(mData.pointsPerLeaf, mStream); + + if (mVerbose==2) mTimer.restart("set inactive voxel values"); + const uint64_t denseVoxelCount = mData.nodeCount[0] << 9; + util::cuda::lambdaKernel<<>>(denseVoxelCount, SetLeafInactiveVoxelValuesFunctor(), mDeviceData); + cudaCheckError(); if constexpr(BuildTraits::is_onindex) { if (mVerbose==2) mTimer.restart("prefix-sum for index grid"); uint64_t *devValueIndex = mMemPool.template alloc(mData.nodeCount[0], mStream); auto devValueIndexPrefix = mMemPool.template alloc(mData.nodeCount[0], mStream); - kernels::fillValueIndexKernel<<>>(mData.nodeCount[0], devValueIndex, mDeviceData); + kernels::fillValueIndexKernel<<>>(mData.nodeCount[0], 0, devValueIndex, mDeviceData); cudaCheckError(); CALL_CUBS(DeviceScan::InclusiveSum, devValueIndex, devValueIndexPrefix, mData.nodeCount[0]); mMemPool.free(devValueIndex, mStream); - kernels::leafPrefixSumKernel<<>>(mData.nodeCount[0], devValueIndexPrefix, mDeviceData); + kernels::leafPrefixSumKernel<<>>(mData.nodeCount[0], 0, devValueIndexPrefix, mDeviceData); cudaCheckError(); mMemPool.free(devValueIndexPrefix, mStream); } if constexpr(BuildTraits::is_indexmask) { if (mVerbose==2) mTimer.restart("leaf.mMask = leaf.mValueMask"); - kernels::setMaskEqValMaskKernel<<>>(mData.nodeCount[0], mDeviceData); + kernels::setMaskEqValMaskKernel<<>>(mData.nodeCount[0], 0, mDeviceData); cudaCheckError(); } if (mVerbose==2) mTimer.stop(); @@ -1159,6 +1235,68 @@ inline void PointsToGrid::processPoints(const PtrT points, size_t pointCo //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +template +struct ResetLowerNodeBBoxFunctor +{ + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data) { + d_data->getLower(tid).mBBox = CoordBBox(); + } +}; + +template +struct UpdateAndPropagateLeafBBoxFunctor +{ + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data) { + const uint64_t leafKey = d_data->d_leaf_keys[tid]; + auto &upper = d_data->getUpper(leafKey >> 27); + auto &lower = *upper.getChild((leafKey >> 12) & 32767u); + auto &leaf = d_data->getLeaf(tid); + leaf.updateBBox(); + lower.mBBox.expandAtomic(leaf.bbox()); + } +}; + +template +struct ResetUpperNodeBBoxFunctor +{ + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data) { + d_data->getUpper(tid).mBBox = CoordBBox(); + } +}; + +template +struct PropagateLowerBBoxFunctor +{ + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data) { + const uint64_t lowerKey = d_data->d_lower_keys[tid]; + auto &upper = d_data->getUpper(lowerKey >> 15); + auto &lower = d_data->getLower(tid); + 
upper.mBBox.expandAtomic(lower.bbox()); + } +}; + +template +struct PropagateUpperBBoxFunctor +{ + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data) { + d_data->getRoot().mBBox.expandAtomic(d_data->getUpper(tid).bbox()); + } +}; + +template +struct UpdateRootWorldBBoxFunctor +{ + __device__ + void operator()(size_t tid, typename PointsToGrid::Data *d_data) { + d_data->getGrid().mWorldBBox = d_data->getRoot().mBBox.transform(d_data->map); + } +}; + template inline void PointsToGrid::processBBox() { @@ -1169,49 +1307,29 @@ inline void PointsToGrid::processBBox() } // reset bbox in lower nodes - util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { - d_data->getLower(tid).mBBox = CoordBBox(); - }, mDeviceData); + util::cuda::lambdaKernel<<>>(mData.nodeCount[1], ResetLowerNodeBBoxFunctor(), mDeviceData); cudaCheckError(); // update and propagate bbox from leaf -> lower/parent nodes - util::cuda::lambdaKernel<<>>(mData.nodeCount[0], [=] __device__(size_t tid, Data *d_data) { - const uint64_t leafKey = d_data->d_leaf_keys[tid]; - auto &upper = d_data->getUpper(leafKey >> 27); - auto &lower = *upper.getChild((leafKey >> 12) & 32767u); - auto &leaf = d_data->getLeaf(tid); - leaf.updateBBox(); - lower.mBBox.expandAtomic(leaf.bbox()); - }, mDeviceData); + util::cuda::lambdaKernel<<>>(mData.nodeCount[0], UpdateAndPropagateLeafBBoxFunctor(), mDeviceData); mMemPool.free(mData.d_leaf_keys, mStream); cudaCheckError(); // reset bbox in upper nodes - util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { - d_data->getUpper(tid).mBBox = CoordBBox(); - }, mDeviceData); + util::cuda::lambdaKernel<<>>(mData.nodeCount[2], ResetUpperNodeBBoxFunctor(), mDeviceData); cudaCheckError(); // propagate bbox from lower -> upper/parent node - util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { - const uint64_t lowerKey = d_data->d_lower_keys[tid]; - auto &upper = d_data->getUpper(lowerKey >> 15); - auto &lower = d_data->getLower(tid); - upper.mBBox.expandAtomic(lower.bbox()); - }, mDeviceData); + util::cuda::lambdaKernel<<>>(mData.nodeCount[1], PropagateLowerBBoxFunctor(), mDeviceData); mMemPool.free(mData.d_lower_keys, mStream); cudaCheckError() // propagate bbox from upper -> root/parent node - util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { - d_data->getRoot().mBBox.expandAtomic(d_data->getUpper(tid).bbox()); - }, mDeviceData); + util::cuda::lambdaKernel<<>>(mData.nodeCount[2], PropagateUpperBBoxFunctor(), mDeviceData); cudaCheckError(); // update the world-bbox in the root node - util::cuda::lambdaKernel<<<1, 1, 0, mStream>>>(1, [=] __device__(size_t, Data *d_data) { - d_data->getGrid().mWorldBBox = d_data->getRoot().mBBox.transform(d_data->map); - }, mDeviceData); + util::cuda::lambdaKernel<<<1, 1, 0, mStream>>>(1, UpdateRootWorldBBoxFunctor(), mDeviceData); cudaCheckError(); }// PointsToGrid::processBBox diff --git a/nanovdb/nanovdb/util/cuda/Util.h b/nanovdb/nanovdb/util/cuda/Util.h index bf952832cb..132c41a35c 100644 --- a/nanovdb/nanovdb/util/cuda/Util.h +++ b/nanovdb/nanovdb/util/cuda/Util.h @@ -210,6 +210,17 @@ __global__ void lambdaKernel(const size_t numItems, Func func, Args... 
args) func(tid, args...); }// util::cuda::lambdaKernel +/// @brief Cuda kernel that launches device lambda functions with a tid offset +/// @param numItems Problem size +/// @param offset Offset for thread id +template +__global__ void offsetLambdaKernel(size_t numItems, unsigned int offset, Func func, Args... args) +{ + const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= numItems) return; + func(tid + offset, args...); +}// util::cuda::offsetLambdaKernel + #endif// __CUDACC__ }// namespace util::cuda ============================================================ From fa1ac5feefdeecf76a51b842725cf118b3c5d22a Mon Sep 17 00:00:00 2001 From: Matthew Cong Date: Thu, 2 Jan 2025 13:25:37 -0800 Subject: [PATCH 25/59] Add missing iomanip include for latest GTest Signed-off-by: Matthew Cong --- nanovdb/nanovdb/unittest/TestNanoVDB.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/nanovdb/nanovdb/unittest/TestNanoVDB.cc b/nanovdb/nanovdb/unittest/TestNanoVDB.cc index 5e1aabe35e..23e73b18a0 100644 --- a/nanovdb/nanovdb/unittest/TestNanoVDB.cc +++ b/nanovdb/nanovdb/unittest/TestNanoVDB.cc @@ -13,6 +13,7 @@ #include // for std::is_sorted #include #include +#include // for std::setw, std::setfill #include #include From 23ded25be22414d94e784a9465770c799621d493 Mon Sep 17 00:00:00 2001 From: Ken Museth <1495380+kmuseth@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:09:50 -0800 Subject: [PATCH 26/59] improved nanovdb_convert (#98) * improved nanovdb_convert Signed-off-by: Ken Museth * fixed typo Signed-off-by: Ken Museth * added unit tests Signed-off-by: Ken Museth * review feedback Signed-off-by: Ken Museth --------- Signed-off-by: Ken Museth --- .../nanovdb/cmd/convert/nanovdb_convert.cc | 68 +++++++++++-------- nanovdb/nanovdb/tools/CreateNanoGrid.h | 65 +++++++++++++++++- nanovdb/nanovdb/unittest/TestOpenVDB.cc | 34 ++++++++++ pendingchanges/nanovdb.txt | 4 +- 4 files changed, 140 insertions(+), 31 deletions(-) diff --git a/nanovdb/nanovdb/cmd/convert/nanovdb_convert.cc b/nanovdb/nanovdb/cmd/convert/nanovdb_convert.cc index e5b5981f5d..35de179fff 100644 --- a/nanovdb/nanovdb/cmd/convert/nanovdb_convert.cc +++ b/nanovdb/nanovdb/cmd/convert/nanovdb_convert.cc @@ -35,6 +35,8 @@ void usage [[noreturn]] (const std::string& progName, int exitStatus = EXIT_FAIL << "--fp8\tQuantize float grids to 8 bits\n" << "--fp16\tQuantize float grids to 16 bits\n" << "--fpN\tQuantize float grids to variable bit depth (use -a or -r to specify a tolerance)\n" + << "--index\tProduce an IndexGrid where all float values are encoded as side-car data\n" + << "--onIndex\tProduce an IndexGrid where active float values are encoded as side-car data\n" << "-g,--grid name\tConvert all grids matching the specified string name\n" << "-h,--help\tPrints this message\n" << "-r,--rel-error float\t Relative error tolerance used for variable bit depth quantization\n" @@ -57,14 +59,14 @@ int main(int argc, char* argv[]) { int exitStatus = EXIT_SUCCESS; - nanovdb::io::Codec codec = nanovdb::io::Codec::NONE;// compression codec for the file - nanovdb::tools::StatsMode sMode = nanovdb::tools::StatsMode::Default; - nanovdb::CheckMode cMode = nanovdb::CheckMode::Default; - nanovdb::GridType qMode = nanovdb::GridType::Unknown;//specify the quantization mode - bool verbose = false, overwrite = false, dither = false, absolute = true; - float tolerance = -1.0f; - std::string gridName; - std::vector fileNames; + nanovdb::io::Codec codec = nanovdb::io::Codec::NONE;// compression codec for the file + nanovdb::tools::StatsMode 
sMode = nanovdb::tools::StatsMode::Default;
+    nanovdb::CheckMode        cMode = nanovdb::CheckMode::Default;
+    nanovdb::GridType         qMode = nanovdb::GridType::Unknown;//specify the quantization mode
+    bool                      verbose = false, overwrite = false, dither = false, absolute = true;
+    float                     tolerance = -1.0f;
+    std::string               gridName;
+    std::vector<std::string>  fileNames;

     auto toLowerCase = [](std::string &str) {
         std::transform(str.begin(), str.end(), str.begin(),[](unsigned char c){return std::tolower(c);});
     };
@@ -85,6 +87,10 @@ int main(int argc, char* argv[])
             qMode = nanovdb::GridType::Fp16;
         } else if (arg == "--fpN") {
             qMode = nanovdb::GridType::FpN;
+        } else if (arg == "--index") {
+            qMode = nanovdb::GridType::Index;
+        } else if (arg == "--onIndex") {
+            qMode = nanovdb::GridType::OnIndex;
         } else if (arg == "-h" || arg == "--help") {
             usage(argv[0], EXIT_SUCCESS);
         } else if (arg == "-b" || arg == "--blosc") {
@@ -169,17 +175,22 @@ int main(int argc, char* argv[])
         std::cerr << "Expected at least one input file followed by exactly one output file\n" << std::endl;
         usage(argv[0]);
     }
-    const std::string outputFile = fileNames.back();
+    const std::string outputFile = fileNames.back();// last file is always the output file
     const std::string ext = outputFile.substr(outputFile.find_last_of(".") + 1);
     bool toNanoVDB = false;
     if (ext == "nvdb") {
         toNanoVDB = true;
-    } else if (ext != "vdb") {
+    } else if (ext == "vdb") {
+        if (qMode != nanovdb::GridType::Unknown) {
+            std::cerr << "The options are incompatible with an output of type OpenVDB" << std::endl;
+            usage(argv[0]);
+        }
+    } else {
         std::cerr << "Unrecognized file extension: \"" << ext << "\"\n" << std::endl;
         usage(argv[0]);
     }
-    fileNames.pop_back();
+    fileNames.pop_back();// remove the output file name

     if (!overwrite) {
         std::ifstream is(outputFile, std::ios::in | std::ios::binary);
@@ -203,9 +214,16 @@ int main(int argc, char* argv[])

     auto openToNano = [&](const openvdb::GridBase::Ptr& base)
     {
-        using SrcGridT = openvdb::FloatGrid;
-        if (auto floatGrid = openvdb::GridBase::grid<SrcGridT>(base)) {
-            nanovdb::tools::CreateNanoGrid<SrcGridT> s(*floatGrid);
+        const bool includeStats = false, includeTiles = false;
+        const int verb = verbose ? 1 : 0;
+        if (qMode == nanovdb::GridType::OnIndex) {
+            return nanovdb::tools::openToIndexVDB<nanovdb::ValueOnIndex>(base, 1u, includeStats, includeTiles, verb);
+        } else if (qMode == nanovdb::GridType::Index) {
+            return nanovdb::tools::openToIndexVDB<nanovdb::ValueIndex>(base, 1u, includeStats, includeTiles, verb);
+        }
+
+        if (auto floatGrid = openvdb::GridBase::grid<openvdb::FloatGrid>(base)) {
+            nanovdb::tools::CreateNanoGrid<openvdb::FloatGrid> s(*floatGrid);
             s.setStats(sMode);
             s.setChecksum(cMode);
             s.enableDithering(dither);
@@ -227,7 +245,7 @@ int main(int argc, char* argv[])
                 break;
             }// end of switch
         }
-        return nanovdb::tools::openToNanoVDB(base, sMode, cMode, verbose ?
1 : 0); + return nanovdb::tools::openToNanoVDB(base, sMode, cMode, verb); }; try { if (toNanoVDB) { // OpenVDB -> NanoVDB @@ -237,26 +255,21 @@ int main(int argc, char* argv[]) std::cerr << "Since the last file has extension .nvdb the remaining input files were expected to have extensions .vdb\n" << std::endl; usage(argv[0]); } - if (verbose) - std::cout << "Opening OpenVDB file named \"" << inputFile << "\"" << std::endl; + if (verbose) std::cout << "Opening OpenVDB file named \"" << inputFile << "\"" << std::endl; openvdb::io::File file(inputFile); file.open(false); //disable delayed loading if (gridName.empty()) {// convert all grid in the file auto grids = file.getGrids(); std::vector > handles; for (auto& grid : *grids) { - if (verbose) { - std::cout << "Converting OpenVDB grid named \"" << grid->getName() << "\" to NanoVDB" << std::endl; - } + if (verbose) std::cout << "Converting OpenVDB grid named \"" << grid->getName() << "\" to NanoVDB" << std::endl; handles.push_back(openToNano(grid)); } // loop over OpenVDB grids in file auto handle = nanovdb::mergeGrids(handles); nanovdb::io::writeGrid(os, handle, codec); } else {// convert only grid with matching name auto grid = file.readGrid(gridName); - if (verbose) { - std::cout << "Converting OpenVDB grid named \"" << grid->getName() << "\" to NanoVDB" << std::endl; - } + if (verbose) std::cout << "Converting OpenVDB grid named \"" << grid->getName() << "\" to NanoVDB" << std::endl; auto handle = openToNano(grid); nanovdb::io::writeGrid(os, handle, codec); } @@ -269,14 +282,12 @@ int main(int argc, char* argv[]) std::cerr << "Since the last file has extension .vdb the remaining input files were expected to have extensions .nvdb\n" << std::endl; usage(argv[0]); } - if (verbose) - std::cout << "Opening NanoVDB file named \"" << inputFile << "\"" << std::endl; + if (verbose) std::cout << "Opening NanoVDB file named \"" << inputFile << "\"" << std::endl; if (gridName.empty()) { auto handles = nanovdb::io::readGrids(inputFile, verbose); for (auto &h : handles) { for (uint32_t i = 0; i < h.gridCount(); ++i) { - if (verbose) - std::cout << "Converting NanoVDB grid named \"" << h.gridMetaData(i)->shortGridName() << "\" to OpenVDB" << std::endl; + if (verbose) std::cout << "Converting NanoVDB grid named \"" << h.gridMetaData(i)->shortGridName() << "\" to OpenVDB" << std::endl; grids->push_back(nanovdb::tools::nanoToOpenVDB(h, 0, i)); } } @@ -286,8 +297,7 @@ int main(int argc, char* argv[]) std::cerr << "File did not contain a NanoVDB grid named \"" << gridName << "\"\n" << std::endl; usage(argv[0]); } - if (verbose) - std::cout << "Converting NanoVDB grid named \"" << handle.gridMetaData()->shortGridName() << "\" to OpenVDB" << std::endl; + if (verbose) std::cout << "Converting NanoVDB grid named \"" << handle.gridMetaData()->shortGridName() << "\" to OpenVDB" << std::endl; grids->push_back(nanovdb::tools::nanoToOpenVDB(handle)); } } // loop over input files diff --git a/nanovdb/nanovdb/tools/CreateNanoGrid.h b/nanovdb/nanovdb/tools/CreateNanoGrid.h index 6f1ce04076..40a3944bde 100644 --- a/nanovdb/nanovdb/tools/CreateNanoGrid.h +++ b/nanovdb/nanovdb/tools/CreateNanoGrid.h @@ -124,6 +124,23 @@ openToNanoVDB(const openvdb::GridBase::Ptr& base, StatsMode sMode = StatsMode::Default, CheckMode cMode = CheckMode::Default, int verbose = 0); + +/// @brief Forward declaration of free-standing function that converts an OpenVDB GridBase into a NanoVDB GridHandle with an IndexGrid +/// @tparam DstBuildT Should be either nanovdb::ValueIndex or 
nanovdb::ValueOnIndex +/// @tparam BufferT Type of the buffer used to allocate the destination grid +/// @param base Shared pointer to a base openvdb grid to be converted +/// @param channels Number of side-car channels with the values (active or all) in the source grid +/// @param includeStats If true stats are also indexed +/// @param includeTiles If true tile values (active or all) are also indexed +/// @param verbose Mode of verbosity +/// @return Handle to the destination NanoGrid of type IndexGrid or OnIndexGrid +template +typename util::enable_if::is_index, GridHandle>::type +openToIndexVDB(const openvdb::GridBase::Ptr& base, + uint32_t channels = 1u, + bool includeStats = true, + bool includeTiles = true, + int verbose = 0); #endif //================================================================================================ @@ -2024,7 +2041,7 @@ template GridHandle openToNanoVDB(const openvdb::GridBase::Ptr& base, StatsMode sMode, - CheckMode cMode, + CheckMode cMode, int verbose) { // We need to define these types because they are not defined in OpenVDB @@ -2064,6 +2081,52 @@ openToNanoVDB(const openvdb::GridBase::Ptr& base, OPENVDB_THROW(openvdb::RuntimeError, "Unrecognized OpenVDB grid type"); } }// openToNanoVDB + +template +typename util::enable_if::is_index, GridHandle>::type +openToIndexVDB(const openvdb::GridBase::Ptr& base, + uint32_t channels, + bool includeStats, + bool includeTiles, + int verbose) +{ + // We need to define these types because they are not defined in OpenVDB + using openvdb_Vec4fTree = typename openvdb::tree::Tree4::Type; + using openvdb_Vec4dTree = typename openvdb::tree::Tree4::Type; + using openvdb_Vec4fGrid = openvdb::Grid; + using openvdb_Vec4dGrid = openvdb::Grid; + using openvdb_UInt32Grid = openvdb::Grid; + + if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, channels, includeStats, includeTiles, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, channels, includeStats, includeTiles, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, channels, includeStats, includeTiles, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, includeStats, includeTiles, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, channels, includeStats, includeTiles, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, channels, includeStats, includeTiles, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, channels, includeStats, includeTiles, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, channels, includeStats, includeTiles, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, channels, includeStats, includeTiles, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, channels, includeStats, includeTiles, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, channels, includeStats, includeTiles, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, channels, includeStats, includeTiles, verbose); + } else if (auto grid = openvdb::GridBase::grid(base)) { + return createNanoGrid(*grid, channels, includeStats, includeTiles, verbose); + } 
else { + OPENVDB_THROW(openvdb::RuntimeError, "Unrecognized OpenVDB grid type"); + } +}// openToIndexVDB #endif }// namespace tools =============================================================================== diff --git a/nanovdb/nanovdb/unittest/TestOpenVDB.cc b/nanovdb/nanovdb/unittest/TestOpenVDB.cc index a2d44a01d2..07c1e412c4 100644 --- a/nanovdb/nanovdb/unittest/TestOpenVDB.cc +++ b/nanovdb/nanovdb/unittest/TestOpenVDB.cc @@ -2807,6 +2807,40 @@ TEST_F(TestOpenVDB, BBox) } }// BBox +TEST_F(TestOpenVDB, CreateIndexGridFromOpen) +{ + using SrcGridT = openvdb::FloatGrid; + auto openGrid = this->getSrcGrid(false);// level set dragon or sphere + {// create and save an index grid with active values only using the old API + using DstBuildT = nanovdb::ValueOnIndex; + auto handle = nanovdb::tools::createNanoGrid(*openGrid, 1);// include SDF values in channel 1 + auto* nanoGrid = handle.grid(); + EXPECT_TRUE(nanoGrid); + nanovdb::io::writeGrid("data/ls_dragon_onindex1.nvdb", handle, this->getCodec()); + } + {// create and save an index grid with active values only using the new API + using DstBuildT = nanovdb::ValueOnIndex; + auto handle = nanovdb::tools::openToIndexVDB(openGrid); + auto* nanoGrid = handle.grid(); + EXPECT_TRUE(nanoGrid); + nanovdb::io::writeGrid("data/ls_dragon_onindex2.nvdb", handle, this->getCodec()); + } + {// create and save an index grid with both active and inactive values using the old API + using DstBuildT = nanovdb::ValueIndex; + auto handle = nanovdb::tools::createNanoGrid(*openGrid, 1);// include SDF values in channel 1 + auto* nanoGrid = handle.grid(); + EXPECT_TRUE(nanoGrid); + nanovdb::io::writeGrid("data/ls_dragon_index1.nvdb", handle, this->getCodec()); + } + {// create and save an index grid with both active and inactive values using the new API + using DstBuildT = nanovdb::ValueIndex; + auto handle = nanovdb::tools::openToIndexVDB(openGrid); + auto* nanoGrid = handle.grid(); + EXPECT_TRUE(nanoGrid); + nanovdb::io::writeGrid("data/ls_dragon_index2.nvdb", handle, this->getCodec()); + } +}// CreateIndexGridFromOpen + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/pendingchanges/nanovdb.txt b/pendingchanges/nanovdb.txt index 83df0ab718..1fca837146 100644 --- a/pendingchanges/nanovdb.txt +++ b/pendingchanges/nanovdb.txt @@ -1,3 +1,5 @@ +Added nanovdb::openToIndexVDB, which can convert an OpenVDB grid into a NanoVDB IndexGrid. 
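A minimal sketch of the new conversion path, assuming the openToIndexVDB declaration shown earlier in this patch (the file name and error handling here are illustrative):

```
// Convert an OpenVDB grid into a NanoVDB IndexGrid with one value channel.
#include <openvdb/openvdb.h>
#include <nanovdb/tools/CreateNanoGrid.h>
#include <nanovdb/io/IO.h>

void writeIndexGrid(const openvdb::GridBase::Ptr& base)
{
    // Index only active values; one side-car channel stores the source values,
    // and stats/tile values are indexed as well (the declared defaults).
    auto handle = nanovdb::tools::openToIndexVDB<nanovdb::ValueOnIndex>(
        base, /*channels=*/1u, /*includeStats=*/true, /*includeTiles=*/true);
    nanovdb::io::writeGrid("indexed.nvdb", handle);
}
```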
+Added new options "--index" and "--onIndex" to the command-line tool nanovdb_convert so it can produce IndexGrids Added support for multiple GPUs to DeviceBuffer Added a UnifiedBuffer class that wraps CUDA unified memory Added example for multiGPU sparse convolution @@ -5,4 +7,4 @@ Added CUDA utility functions for device queries Added functions to independently stop and compute the elapsed time for timer classes Fixed ostream specializations being hidden within the nanovdb namespace -Replaced CUB's CachingDeviceAllocator with the default asynchronous stream ordered allocator in PointsToGrid for improved performance +Replaced CUB's CachingDeviceAllocator with the default asynchronous stream ordered allocator in PointsToGrid for improved performance \ No newline at end of file From 5ecb9c3eadaec00695ca25a2fcd5be7ae55d4984 Mon Sep 17 00:00:00 2001 From: Matthew Cong <1372750+matthewdcong@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:15:25 -0800 Subject: [PATCH 27/59] Fix format strings and Windows conversion warning (#101) Signed-off-by: Matthew Cong --- nanovdb/nanovdb/tools/CreatePrimitives.h | 4 ++-- nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh | 3 ++- nanovdb/nanovdb/util/cuda/Util.h | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/nanovdb/nanovdb/tools/CreatePrimitives.h b/nanovdb/nanovdb/tools/CreatePrimitives.h index 95b8491879..6a9d1a0d1c 100644 --- a/nanovdb/nanovdb/tools/CreatePrimitives.h +++ b/nanovdb/nanovdb/tools/CreatePrimitives.h @@ -617,7 +617,7 @@ initSphere(double radius, // radius of sphere in world units grid->setTransform(voxelSize, origin); // Define radius of sphere with narrow-band in voxel units - const ValueT r0 = radius / ValueT(voxelSize), rmax = r0 + ValueT(halfWidth); + const ValueT r0 = ValueT(radius / voxelSize), rmax = r0 + ValueT(halfWidth); // Radius below the Nyquist frequency if (r0 < ValueT(1.5f)) return grid; @@ -689,7 +689,7 @@ initTorus(double radius1, // major radius of torus in world units grid->setTransform(voxelSize, origin); // Define size of torus with narrow-band in voxel units - const ValueT r1 = radius1 / ValueT(voxelSize), r2 = radius2 / ValueT(voxelSize), rmax1 = r1 + r2 + ValueT(halfWidth), rmax2 = r2 + ValueT(halfWidth); + const ValueT r1 = ValueT(radius1 / voxelSize), r2 = ValueT(radius2 / voxelSize), rmax1 = r1 + r2 + ValueT(halfWidth), rmax2 = r2 + ValueT(halfWidth); // Radius below the Nyquist frequency if (r2 < ValueT(1.5)) return grid; diff --git a/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh b/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh index fdfe1267ce..e77edad355 100644 --- a/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh +++ b/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -705,7 +706,7 @@ jump:// this marks the beginning of the actual algorithm } else {// maxPointsPerVoxel = 1 so increase dx significantly dx *= 10.0; } - if (mVerbose==2) printf("\ntarget density = %u, current density = %u current dx = %f, next dx = %f\n", mMaxPointsPerVoxel, maxPointsPerVoxel, tmp.dx, dx); + if (mVerbose==2) printf("\ntarget density = %" PRIu32 ", current density = %" PRIu32 ", current dx = %f, next dx = %f\n", mMaxPointsPerVoxel, maxPointsPerVoxel, tmp.dx, dx); mData.map = Map(dx); mMemPool.free(mData.d_keys, mStream); mMemPool.free(mData.d_indx, mStream); diff --git a/nanovdb/nanovdb/util/cuda/Util.h b/nanovdb/nanovdb/util/cuda/Util.h index 132c41a35c..8233ca276b 100644 --- a/nanovdb/nanovdb/util/cuda/Util.h +++ b/nanovdb/nanovdb/util/cuda/Util.h @@ 
-142,7 +142,7 @@ inline void printDevInfo(int device, const char *preMsg = nullptr, std::FILE* fi cudaDeviceProp prop; cudaGetDeviceProperties(&prop, device); if (preMsg) fprintf(file, "%s ", preMsg); - fprintf(file,"GPU #%d, named \"%s\", compute capability %d.%d, %lu GB of VRAM\n", + fprintf(file,"GPU #%d, named \"%s\", compute capability %d.%d, %zu GB of VRAM\n", device, prop.name, prop.major, prop.minor, prop.totalGlobalMem >> 30); } From 59d2a5c7e38eaacc65bd6f5cdb00b730fdf479ef Mon Sep 17 00:00:00 2001 From: Matthew Cong <1372750+matthewdcong@users.noreply.github.com> Date: Mon, 6 Jan 2025 12:09:18 -0800 Subject: [PATCH 28/59] Fix ODR violation due to anonymous namespace in SignedFloodFill (#97) Signed-off-by: Matthew Cong --- nanovdb/nanovdb/tools/cuda/SignedFloodFill.cuh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/nanovdb/nanovdb/tools/cuda/SignedFloodFill.cuh b/nanovdb/nanovdb/tools/cuda/SignedFloodFill.cuh index f214247a82..af37422455 100644 --- a/nanovdb/nanovdb/tools/cuda/SignedFloodFill.cuh +++ b/nanovdb/nanovdb/tools/cuda/SignedFloodFill.cuh @@ -41,8 +41,6 @@ template typename util::enable_if::is_float, void>::type signedFloodFill(NanoGrid *d_grid, bool verbose = false, cudaStream_t stream = 0); -namespace {// anonymous namespace - template class SignedFloodFill { @@ -65,6 +63,8 @@ private: //================================================================================================ +namespace kernels {// kernels namespace + template __global__ void processRootKernel(NanoTree *tree) { @@ -145,7 +145,7 @@ __global__ void cpyNodeCountKernel(NanoGrid *d_grid, uint64_t *d_count) *d_count = d_grid->tree().root().tileCount(); } -}// anonymous namespace +}// kernels namespace //================================================================================================ @@ -156,7 +156,7 @@ void SignedFloodFill::operator()(NanoGrid *d_grid) NANOVDB_ASSERT(d_grid); uint64_t count[4], *d_count = nullptr; cudaCheck(util::cuda::mallocAsync((void**)&d_count, 4*sizeof(uint64_t), mStream)); - cpyNodeCountKernel<<<1, 1, 0, mStream>>>(d_grid, d_count); + kernels::cpyNodeCountKernel<<<1, 1, 0, mStream>>>(d_grid, d_count); cudaCheckError(); cudaCheck(cudaMemcpyAsync(&count, d_count, 4*sizeof(uint64_t), cudaMemcpyDeviceToHost, mStream)); cudaCheck(util::cuda::freeAsync(d_count, mStream)); @@ -166,19 +166,19 @@ void SignedFloodFill::operator()(NanoGrid *d_grid) auto *tree = reinterpret_cast*>(d_grid + 1); if (mVerbose) mTimer.start("\nProcess leaf nodes"); - processLeafKernel<<>>(tree, count[0]<<9); + kernels::processLeafKernel<<>>(tree, count[0]<<9); cudaCheckError(); if (mVerbose) mTimer.restart("Process lower internal nodes"); - processNodeKernel<<>>(tree, count[1]<<12); + kernels::processNodeKernel<<>>(tree, count[1]<<12); cudaCheckError(); if (mVerbose) mTimer.restart("Process upper internal nodes"); - processNodeKernel<<>>(tree, count[2]<<15); + kernels::processNodeKernel<<>>(tree, count[2]<<15); cudaCheckError(); //if (mVerbose) mTimer.restart("Process root node"); - //processRootKernel<<<1, 1, 0, mStream>>>(tree); + //kernels::processRootKernel<<<1, 1, 0, mStream>>>(tree); if (mVerbose) mTimer.stop(); cudaCheckError(); }// SignedFloodFill::operator() From 8d2894cd5d2a0fd425fa2bb8aa4c5acb5b1936b3 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Tue, 7 Jan 2025 11:08:53 +1300 Subject: [PATCH 29/59] Updating fvdb and whitespace actions for PR bot workflow Signed-off-by: Jonathan Swartz --- .github/workflows/fvdb.yml | 8 +++----- 
.github/workflows/fvdb_codestyle.yml | 6 ++---- .github/workflows/whitespace.yml | 6 ++---- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/.github/workflows/fvdb.yml b/.github/workflows/fvdb.yml index 3f67b8e062..9325c9a927 100644 --- a/.github/workflows/fvdb.yml +++ b/.github/workflows/fvdb.yml @@ -1,11 +1,9 @@ name: fVDB Unit Tests on: - pull_request: + push: branches: - - 'master' - - 'feature/**' - - 'pr/**' + - "pull-request/[0-9]+" paths-ignore: - 'CHANGES' - 'CODEOWNERS' @@ -233,4 +231,4 @@ jobs: run: | echo "Cleaning up /__w/_temp directory" sudo rm -rf /__w/_temp/* - echo "Cleanup completed" \ No newline at end of file + echo "Cleanup completed" diff --git a/.github/workflows/fvdb_codestyle.yml b/.github/workflows/fvdb_codestyle.yml index c3cb436dd4..525b67c98b 100644 --- a/.github/workflows/fvdb_codestyle.yml +++ b/.github/workflows/fvdb_codestyle.yml @@ -1,10 +1,8 @@ name: fVDB Code Style on: - pull_request: + push: branches: - - 'master' - - 'feature/**' - - 'pr/**' + - "pull-request/[0-9]+" paths-ignore: - 'CHANGES' - 'CODEOWNERS' diff --git a/.github/workflows/whitespace.yml b/.github/workflows/whitespace.yml index 57c3f74b00..213da0e5c6 100644 --- a/.github/workflows/whitespace.yml +++ b/.github/workflows/whitespace.yml @@ -2,10 +2,8 @@ name: Whitespace on: push: - paths-ignore: - - 'pendingchanges/**' - - 'tsc/meetings/**' - pull_request: + branches: + - "pull-request/[0-9]+" paths-ignore: - 'pendingchanges/**' - 'tsc/meetings/**' From 182a9e584e06815bad03fad7cd4df379a3005f4a Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Tue, 7 Jan 2025 12:15:40 +1300 Subject: [PATCH 30/59] Commenting out fvdb paths-ignore Signed-off-by: Jonathan Swartz --- .github/workflows/fvdb.yml | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/fvdb.yml b/.github/workflows/fvdb.yml index 9325c9a927..fae24d9e11 100644 --- a/.github/workflows/fvdb.yml +++ b/.github/workflows/fvdb.yml @@ -4,23 +4,23 @@ on: push: branches: - "pull-request/[0-9]+" - paths-ignore: - - 'CHANGES' - - 'CODEOWNERS' - - 'doc/**' - - 'openvdb/**' - - 'openvdb_cmd/**' - - 'openvdb_ax/**' - - 'openvdb_maya/**' - - 'openvdb_houdini/**' - - 'nanovdb/**' - - 'pendingchanges/**' - - '**.md' - - 'fvdb/debug/**' - - 'fvdb/docs/**' - - 'fvdb/examples/**' - - 'fvdb/notebooks/**' - - 'fvdb/scripts/**' + # paths-ignore: + # - 'CHANGES' + # - 'CODEOWNERS' + # - 'doc/**' + # - 'openvdb/**' + # - 'openvdb_cmd/**' + # - 'openvdb_ax/**' + # - 'openvdb_maya/**' + # - 'openvdb_houdini/**' + # - 'nanovdb/**' + # - 'pendingchanges/**' + # - '**.md' + # - 'fvdb/debug/**' + # - 'fvdb/docs/**' + # - 'fvdb/examples/**' + # - 'fvdb/notebooks/**' + # - 'fvdb/scripts/**' # Allow subsequent pushes to the same PR or REF to cancel any previous jobs. 
concurrency: From c84d4e2848c3e26e552b485dd3a4b07f413a6e1a Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Tue, 7 Jan 2025 13:10:02 +1300 Subject: [PATCH 31/59] Uncomment paths-ignore in fvdb action Signed-off-by: Jonathan Swartz --- .github/workflows/fvdb.yml | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/fvdb.yml b/.github/workflows/fvdb.yml index fae24d9e11..9325c9a927 100644 --- a/.github/workflows/fvdb.yml +++ b/.github/workflows/fvdb.yml @@ -4,23 +4,23 @@ on: push: branches: - "pull-request/[0-9]+" - # paths-ignore: - # - 'CHANGES' - # - 'CODEOWNERS' - # - 'doc/**' - # - 'openvdb/**' - # - 'openvdb_cmd/**' - # - 'openvdb_ax/**' - # - 'openvdb_maya/**' - # - 'openvdb_houdini/**' - # - 'nanovdb/**' - # - 'pendingchanges/**' - # - '**.md' - # - 'fvdb/debug/**' - # - 'fvdb/docs/**' - # - 'fvdb/examples/**' - # - 'fvdb/notebooks/**' - # - 'fvdb/scripts/**' + paths-ignore: + - 'CHANGES' + - 'CODEOWNERS' + - 'doc/**' + - 'openvdb/**' + - 'openvdb_cmd/**' + - 'openvdb_ax/**' + - 'openvdb_maya/**' + - 'openvdb_houdini/**' + - 'nanovdb/**' + - 'pendingchanges/**' + - '**.md' + - 'fvdb/debug/**' + - 'fvdb/docs/**' + - 'fvdb/examples/**' + - 'fvdb/notebooks/**' + - 'fvdb/scripts/**' # Allow subsequent pushes to the same PR or REF to cancel any previous jobs. concurrency: From 615a0f1ab94b297e1ac5db9023d705bd7891333c Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Tue, 7 Jan 2025 14:29:54 +1300 Subject: [PATCH 32/59] Fix carriage return typo Signed-off-by: Jonathan Swartz --- nanovdb/nanovdb/examples/ex_make_mgpu_nanovdb/main.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nanovdb/nanovdb/examples/ex_make_mgpu_nanovdb/main.cu b/nanovdb/nanovdb/examples/ex_make_mgpu_nanovdb/main.cu index 98ca45baa8..1f744b23fd 100644 --- a/nanovdb/nanovdb/examples/ex_make_mgpu_nanovdb/main.cu +++ b/nanovdb/nanovdb/examples/ex_make_mgpu_nanovdb/main.cu @@ -1,5 +1,6 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: Apache-2.0#include +// SPDX-License-Identifier: Apache-2.0 +#include #include #include #include From 07b3ee731d9cd48acc8a4256dc7af398d0346fdf Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Tue, 7 Jan 2025 14:33:27 +1300 Subject: [PATCH 33/59] Attemping to fix fvdb action memory issue Signed-off-by: Jonathan Swartz --- .github/workflows/fvdb.yml | 2 +- fvdb/.github/workflows/tests.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/fvdb.yml b/.github/workflows/fvdb.yml index 9325c9a927..84e7eafcfc 100644 --- a/.github/workflows/fvdb.yml +++ b/.github/workflows/fvdb.yml @@ -60,7 +60,7 @@ jobs: - name: Buid fvdb run: | cd fvdb; - TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6+PTX" MAX_JOBS=$(($(nproc) < $(free -g | awk '/^Mem:/{jobs=int($7/2.5); if(jobs<1) jobs=1; print jobs}') ? $(nproc) : $(free -g | awk '/^Mem:/{jobs=int($7/2.5); if(jobs<1) jobs=1; print jobs}'))) conda run --no-capture-output -n fvdb_build python setup.py bdist_wheel --dist-dir=dist + TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6+PTX" MAX_JOBS=$(($(nproc) < $(free -g | awk '/^Mem:/{jobs=int($4/2.5); if(jobs<1) jobs=1; print jobs}') ? 
$(nproc) : $(free -g | awk '/^Mem:/{jobs=int($4/2.5); if(jobs<1) jobs=1; print jobs}'))) conda run --no-capture-output -n fvdb_build python setup.py bdist_wheel --dist-dir=dist - name: Upload package uses: actions/upload-artifact@v4 diff --git a/fvdb/.github/workflows/tests.yml b/fvdb/.github/workflows/tests.yml index c6b71fb8e7..9b6cd909a0 100644 --- a/fvdb/.github/workflows/tests.yml +++ b/fvdb/.github/workflows/tests.yml @@ -47,7 +47,7 @@ jobs: - name: Buid fvdb run: | - TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6+PTX" MAX_JOBS=$(($(nproc) < $(free -g | awk '/^Mem:/{jobs=int($7/2.5); if(jobs<1) jobs=1; print jobs}') ? $(nproc) : $(free -g | awk '/^Mem:/{jobs=int($7/2.5); if(jobs<1) jobs=1; print jobs}'))) conda run --no-capture-output -n fvdb_build python setup.py bdist_wheel --dist-dir=dist + TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6+PTX" MAX_JOBS=$(($(nproc) < $(free -g | awk '/^Mem:/{jobs=int($4/2.5); if(jobs<1) jobs=1; print jobs}') ? $(nproc) : $(free -g | awk '/^Mem:/{jobs=int($4/2.5); if(jobs<1) jobs=1; print jobs}'))) conda run --no-capture-output -n fvdb_build python setup.py bdist_wheel --dist-dir=dist - name: Upload package uses: actions/upload-artifact@v4 @@ -218,4 +218,4 @@ jobs: run: | echo "Cleaning up /__w/_temp directory" sudo rm -rf /__w/_temp/* - echo "Cleanup completed" \ No newline at end of file + echo "Cleanup completed" From 3f9b95641891e2424b6eff446bd2713434a0272d Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Tue, 7 Jan 2025 17:54:36 +1300 Subject: [PATCH 34/59] Fix BuildDeviceGrid's calls to NanoVDB voxelsToGrid which removed an allocator type as an optional template argument. Signed-off-by: Jonathan Swartz --- fvdb/src/detail/ops/BuildDeviceGrid.cu | 48 ++++++-------------------- 1 file changed, 10 insertions(+), 38 deletions(-) diff --git a/fvdb/src/detail/ops/BuildDeviceGrid.cu b/fvdb/src/detail/ops/BuildDeviceGrid.cu index d3f88bfd8b..0c879a328b 100644 --- a/fvdb/src/detail/ops/BuildDeviceGrid.cu +++ b/fvdb/src/detail/ops/BuildDeviceGrid.cu @@ -132,32 +132,6 @@ ijkForDense(nanovdb::Coord origin, nanovdb::Coord size, TorchRAcc32 outIJKAccessor[tid][2] = zi + origin[2]; } -struct NanoVDBGridBuilderTorchAllocator { - std::set mAllocatedData; - - cudaError_t - DeviceAllocate(void **ptr, size_t size, cudaStream_t stream) { - *ptr = c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(size, stream); - mAllocatedData.insert(*ptr); - return (cudaError_t)CUDA_SUCCESS; - } - - cudaError_t - DeviceFree(void *ptr) { - c10::cuda::CUDACachingAllocator::raw_delete(ptr); - mAllocatedData.erase(ptr); - return (cudaError_t)CUDA_SUCCESS; - } - - void - FreeAllCached() { - for (void *ptr: mAllocatedData) { - c10::cuda::CUDACachingAllocator::raw_delete(ptr); - } - mAllocatedData.clear(); - } -}; - template <> nanovdb::GridHandle dispatchCreateNanoGridFromIJK(const JaggedTensor &ijk, bool isMutable) { @@ -194,12 +168,11 @@ dispatchCreateNanoGridFromIJK(const JaggedTensor &ijk, bool isMuta // torch::Tensor ijkDataSlice = ijkData.narrow(0, startIdx, nVoxels); const int32_t *dataPtr = ijkData.data_ptr() + 3 * startIdx; - handles.push_back( - nVoxels == 0 ? build::buildEmptyGrid(guide.device(), isMutable) - : nanovdb::tools::cuda::voxelsToGrid( - (nanovdb::Coord *)dataPtr, nVoxels, 1.0, guide)); + handles.push_back(nVoxels == 0 + ? 
build::buildEmptyGrid(guide.device(), isMutable) + : nanovdb::tools::cuda::voxelsToGrid( + (nanovdb::Coord *)dataPtr, nVoxels, 1.0, guide)); C10_CUDA_KERNEL_LAUNCH_CHECK(); } @@ -260,12 +233,11 @@ dispatchCreateNanoGridFromDense(uint32_t batchSize, nanovdb::Coord std::vector> handles; for (int i = 0; i < batchSize; i += 1) { const int64_t nVoxels = ijkData.size(0); - handles.push_back( - nVoxels == 0 ? build::buildEmptyGrid(guide.device(), isMutable) - : nanovdb::tools::cuda::voxelsToGrid( - (nanovdb::Coord *)ijkData.data_ptr(), nVoxels, 1.0, guide)); + handles.push_back(nVoxels == 0 + ? build::buildEmptyGrid(guide.device(), isMutable) + : nanovdb::tools::cuda::voxelsToGrid( + (nanovdb::Coord *)ijkData.data_ptr(), nVoxels, 1.0, guide)); C10_CUDA_KERNEL_LAUNCH_CHECK(); } From afbd5a9fcc09e7bcb9a770bc8d96940b73b17fa2 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Thu, 19 Dec 2024 15:32:18 +1300 Subject: [PATCH 35/59] Adding garfield source from research repo Signed-off-by: Jonathan Swartz --- .../panoptic_segmentation/garfield/.gitignore | 4 + .../panoptic_segmentation/garfield/LICENSE | 21 + .../panoptic_segmentation/garfield/README.md | 110 +++ .../garfield/garfield/garfield_config.py | 143 ++++ .../garfield/garfield/garfield_datamanager.py | 340 +++++++++ .../garfield/garfield/garfield_field.py | 155 +++++ .../garfield/garfield_gaussian_pipeline.py | 648 ++++++++++++++++++ .../garfield/garfield/garfield_interaction.py | 146 ++++ .../garfield/garfield/garfield_model.py | 284 ++++++++ .../garfield/garfield/garfield_pipeline.py | 186 +++++ .../garfield/garfield_pixel_sampler.py | 78 +++ .../garfield/garfield/img_group_model.py | 102 +++ .../garfield/pyproject.toml | 22 + 13 files changed, 2239 insertions(+) create mode 100644 fvdb/projects/panoptic_segmentation/garfield/.gitignore create mode 100644 fvdb/projects/panoptic_segmentation/garfield/LICENSE create mode 100644 fvdb/projects/panoptic_segmentation/garfield/README.md create mode 100644 fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_config.py create mode 100644 fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_datamanager.py create mode 100644 fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_field.py create mode 100644 fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_gaussian_pipeline.py create mode 100644 fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_interaction.py create mode 100644 fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_model.py create mode 100644 fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pipeline.py create mode 100644 fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pixel_sampler.py create mode 100644 fvdb/projects/panoptic_segmentation/garfield/garfield/img_group_model.py create mode 100644 fvdb/projects/panoptic_segmentation/garfield/pyproject.toml diff --git a/fvdb/projects/panoptic_segmentation/garfield/.gitignore b/fvdb/projects/panoptic_segmentation/garfield/.gitignore new file mode 100644 index 0000000000..48702ef2fc --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/.gitignore @@ -0,0 +1,4 @@ +# ignore all __pycache__ directories, in .gitignore +__pycache__/ +*.egg-info/ +outputs/ diff --git a/fvdb/projects/panoptic_segmentation/garfield/LICENSE b/fvdb/projects/panoptic_segmentation/garfield/LICENSE new file mode 100644 index 0000000000..9b34235b5f --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 UC 
Berkeley + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/fvdb/projects/panoptic_segmentation/garfield/README.md b/fvdb/projects/panoptic_segmentation/garfield/README.md new file mode 100644 index 0000000000..b42aef587a --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/README.md @@ -0,0 +1,110 @@ +# GARField: Group Anything with Radiance Fields + +This is the official implementation for [GARField](https://www.garfield.studio). + +Tested on Python 3.10, cuda 12.0, using conda. + +
+ +
+ +## Installation +1. Install nerfstudio from source, and its dependencies. This project requires the latest version of nerfstudio +(more specifically, the new viewer based on viser). +``` +# install dependencies +pip3 install torch torchvision torchaudio +conda install -c "nvidia/label/cuda-12.0.0" cuda-toolkit +pip install ninja git+https://github.com/NVlabs/tiny-cuda-nn/#subdirectory=bindings/torch + +# install nerfstudio! +git clone git@github.com:nerfstudio-project/nerfstudio.git +cd nerfstudio +pip install -e . +``` + +2. To use GARField with Gaussian Splatting, [`cuml`](https://docs.rapids.ai/install) is required (for global clustering). +The best way to install it is through conda: `conda install -c rapidsai -c conda-forge -c nvidia cuml` + +, or with pip: `pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12==24.2.* cuml-cu12==24.2.*`. + +Important: I used [`libmamba`](https://www.anaconda.com/blog/a-faster-conda-for-a-growing-community) for conda. I have been told multiple times that the conda solver is very slow / gets stuck, but this seems to be key. + +If you get `ClobberError`, try `conda clean --all` -- see [here](https://stackoverflow.com/questions/51217876/conda-update-anaconda-fails-clobbererror). It seems that `pip` installed packages from `nerfstudio` may conflict with the `conda` install here. + +3. Install GARField! +``` +git clone git@github.com:chungmin99/garfield.git +pip install -e . +``` + +This installs both `garfield` (NeRF geometry), and `garfield-gauss` (Gaussian geometry). +Note that `garfield-gauss` requires reference to a fully trained `garfield` checkpoint, +as it relies on the affinity field from `garfield`. See the main paper for more details. + +4. (Optional) If you wish to use a different version of the SAM model (by default, the Hugging Face Transformer's SAM model facebook/sam-vit-huge is used), please install the 'segment_anything' package. + +``` +pip install git+https://github.com/facebookresearch/segment-anything.git +``` + +## Running GARField + +Note: using colmap-based image data makes it more convenient to run both `garfield` and `garfield-gauss` on the same dataset. Although `splatfacto` (Gaussian Splatting in nerfstudio) is supported with `NerfstudioDataParser`, and also supports random point initialization with non-colmap datasets, the NeRF and GS geometries will align better with colmap since 1) we will start from colmap points and 2) camera optimization is minimized. + +You can use it like any other third-party nerfstudio project. +``` +ns-train garfield --data /your/data/here +``` +Note that GARField will pause to generate groups using Segment-Anything at around 2000 steps +(set by default, this can be set in GarfieldPipeline). +Afterwards, you can start interacting with the affinity field. +1. PCA visualization of affinity field: select `instance` as the output type, + and change the value of `scale` slider. + +https://github.com/chungmin99/garfield/assets/10284938/e193d7e8-da7c-4176-b7c5-a7ec75513c16 + +2. Affinity visualization between 3D point and scene: use "Click" button to + select the point, and select `instance_interact` as the output type. + You might need to drag the viewer window slightly to see this output type. + Again, interact with the `scale` slider! +Here, with `invert` True and output unnormalized, red color means high affinity (i.e., features at click point and rendered point are close to each other). Blue means low affinity. 
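+
+One way to make the affinity output concrete (a sketch of the idea, not necessarily the
+exact quantity rendered): the field produces a unit-length embedding `F(x, s)` for a 3D
+point `x` at scale `s` (the MLP output is explicitly normalized, see `garfield_field.py`),
+so the visualized affinity between a rendered point `x` and the click point `c` behaves like
+
+```
+affinity(x, c; s) = -|| F(x, s) - F(c, s) ||
+```
+
+i.e., small feature distance at the chosen scale renders as high affinity (red above), and
+large distance as low affinity (blue).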
+ +https://github.com/chungmin99/garfield/assets/10284938/6edbdad6-d356-4b32-b44e-0df8ec1dca16 + +Also, note: the results can change a lot between 2k to 30k steps. + +Once the model is trained to completion, you can use the outputted config file for `garfield-gauss`. + +## Running GARField with Gaussian Splatting geometry! +Although GARField's affinity field is optimized using NeRF geometry, it can be +used to group and cluster gaussians in 3D! +``` +ns-train garfield-gauss --data /your/data/here --pipeline.garfield-ckpt outputs/your/data/garfield/.../config.yml +``` + +There are two main ways to interact with the scene -- make sure to pause training first! +1. Interactive selection: click anywhere in the scene, and use "Crop to Click" button to retrieve different groups (scale=group level*0.05). Use "Drag Current Crop" to move it around! + + +https://github.com/chungmin99/garfield/assets/10284938/82ea7145-d8d1-485d-bab2-f6e8b0ebd632 + + +2. Global clustering: cluster the currently visible gaussians (either globally or just for the crop), at the scale specified by "Cluster Scale". + + +https://github.com/chungmin99/garfield/assets/10284938/541fe037-925c-418f-929d-a9397f8d57d3 + + + +## Citation +If you use this work or find it helpful, please consider citing: (bibtex) + +``` +@inproceedings{garfield2024, + author = {Kim, Chung Min* and Wu, Mingxuan* and Kerr, Justin* and Tancik, Matthew and Goldberg, Ken and Kanazawa, Angjoo}, + title = {GARField: Group Anything with Radiance Fields}, + booktitle = {arXiv}, + year = {2024}, +} +``` diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_config.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_config.py new file mode 100644 index 0000000000..7cd1c2f9b2 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_config.py @@ -0,0 +1,143 @@ +from nerfstudio.configs.base_config import ViewerConfig +from nerfstudio.data.dataparsers.nerfstudio_dataparser import NerfstudioDataParserConfig +from nerfstudio.engine.optimizers import AdamOptimizerConfig +from nerfstudio.engine.schedulers import ExponentialDecaySchedulerConfig +from nerfstudio.plugins.types import MethodSpecification +from nerfstudio.pipelines.base_pipeline import VanillaPipelineConfig +from nerfstudio.data.datamanagers.base_datamanager import VanillaDataManagerConfig +from nerfstudio.engine.trainer import TrainerConfig + +# For Gaussian Splatting +from nerfstudio.data.datamanagers.full_images_datamanager import ( + FullImageDatamanagerConfig, +) +from nerfstudio.models.splatfacto import SplatfactoModelConfig +from nerfstudio.data.dataparsers.colmap_dataparser import ColmapDataParserConfig + +from garfield.garfield_pipeline import GarfieldPipelineConfig +from garfield.garfield_datamanager import GarfieldDataManagerConfig +from garfield.garfield_pixel_sampler import GarfieldPixelSamplerConfig +from garfield.garfield_model import GarfieldModelConfig +from garfield.garfield_field import GarfieldFieldConfig +from garfield.img_group_model import ImgGroupModelConfig +from garfield.garfield_gaussian_pipeline import GarfieldGaussianPipelineConfig + + +garfield_method = MethodSpecification( + config=TrainerConfig( + method_name="garfield", + steps_per_eval_image=100, + steps_per_eval_batch=100, + steps_per_save=2000, + steps_per_eval_all_images=100000, + max_num_iterations=30000, + mixed_precision=False, + pipeline=GarfieldPipelineConfig( + datamanager=GarfieldDataManagerConfig( + 
dataparser=NerfstudioDataParserConfig(train_split_fraction=0.99), + train_num_rays_per_batch=4096, + eval_num_rays_per_batch=4096, + pixel_sampler=GarfieldPixelSamplerConfig( + num_rays_per_image=256, # 4096/256 = 16 images per batch + ), + img_group_model=ImgGroupModelConfig( + model_type="sam_hf", + # Can choose out of "sam_fb", "sam_hf", "maskformer" + # Used sam_fb for the paper, see `img_group_model.py`. + device="cuda", + ), + ), + model=GarfieldModelConfig( + instance_field=GarfieldFieldConfig( + n_instance_dims=256 # 256 in original + ) + ), + ), + optimizers={ + "proposal_networks": { + "optimizer": AdamOptimizerConfig(lr=1e-2, eps=1e-15), + "scheduler": None, + }, + "fields": { + "optimizer": AdamOptimizerConfig(lr=1e-2, eps=1e-15), + "scheduler": ExponentialDecaySchedulerConfig( + lr_final=1e-3, max_steps=30000 + ), + }, + "garfield": { + "optimizer": AdamOptimizerConfig( + lr=1e-4, eps=1e-15, weight_decay=1e-6, max_norm=1.0 + ), + # TODO the warmup_steps == pipeline.start_grouping_step, but would be good to not hardcode it + "scheduler": ExponentialDecaySchedulerConfig( + lr_final=1e-5, max_steps=10000, warmup_steps=2000 + ), + }, + "camera_opt": { + "optimizer": AdamOptimizerConfig(lr=1e-3, eps=1e-15), + "scheduler": ExponentialDecaySchedulerConfig( + lr_final=1e-4, max_steps=5000 + ), + }, + }, + viewer=ViewerConfig(num_rays_per_chunk=1 << 15), + vis="viewer", + ), + description="Group Anything with Radiance Fields", +) + +garfield_gauss_method = MethodSpecification( + config=TrainerConfig( + method_name="garfield-gauss", + steps_per_eval_image=100, + steps_per_eval_batch=100, + steps_per_save=2000, + steps_per_eval_all_images=100000, + max_num_iterations=30000, + mixed_precision=False, + gradient_accumulation_steps = {'camera_opt': 100,'color':10,'shs':10}, + + pipeline=GarfieldGaussianPipelineConfig( + datamanager=FullImageDatamanagerConfig( + dataparser=NerfstudioDataParserConfig(load_3D_points=True), + ), + model=SplatfactoModelConfig( + cull_alpha_thresh=0.2, + use_scale_regularization=True, + ), + ), + optimizers={ + "means": { + "optimizer": AdamOptimizerConfig(lr=1.6e-4, eps=1e-15), + "scheduler": ExponentialDecaySchedulerConfig( + lr_final=1.6e-6, + max_steps=30000, + ), + }, + "features_dc": { + "optimizer": AdamOptimizerConfig(lr=0.0025, eps=1e-15), + "scheduler": None, + }, + "features_rest": { + "optimizer": AdamOptimizerConfig(lr=0.0025 / 20, eps=1e-15), + "scheduler": None, + }, + "opacities": { + "optimizer": AdamOptimizerConfig(lr=0.05, eps=1e-15), + "scheduler": None, + }, + "scales": { + "optimizer": AdamOptimizerConfig(lr=0.005, eps=1e-15), + "scheduler": None, + }, + "quats": {"optimizer": AdamOptimizerConfig(lr=0.001, eps=1e-15), "scheduler": None}, + "camera_opt": { + "optimizer": AdamOptimizerConfig(lr=1e-3, eps=1e-15), + "scheduler": ExponentialDecaySchedulerConfig(lr_final=5e-5, max_steps=30000), + }, + }, + viewer=ViewerConfig(num_rays_per_chunk=1 << 15), + vis="viewer", + ), + description="anythingnerf with gauss", +) \ No newline at end of file diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_datamanager.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_datamanager.py new file mode 100644 index 0000000000..807bf47392 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_datamanager.py @@ -0,0 +1,340 @@ +""" +Datamanager. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union +from typing_extensions import TypeVar + +import torch +from nerfstudio.cameras.rays import RayBundle +from nerfstudio.data.datasets.base_dataset import InputDataset +from rich.progress import Console + +CONSOLE = Console(width=120) + +import h5py +import os +import os.path as osp + + +import numpy as np +from nerfstudio.data.datamanagers.base_datamanager import ( + VanillaDataManager, + VanillaDataManagerConfig, +) + +from garfield.img_group_model import ImgGroupModelConfig, ImgGroupModel +from garfield.garfield_pixel_sampler import GarfieldPixelSampler + + +@dataclass +class GarfieldDataManagerConfig(VanillaDataManagerConfig): + _target: Type = field(default_factory=lambda: GarfieldDataManager) + """The datamanager class to use.""" + img_group_model: ImgGroupModelConfig = field(default_factory=lambda: ImgGroupModelConfig()) + """The SAM model to use. This can be any other model that outputs masks...""" + + +TDataset = TypeVar("TDataset", bound=InputDataset, default=InputDataset) + + +class GarfieldDataManager(VanillaDataManager): # pylint: disable=abstract-method + """ + Tacking on grouping info to the normal VanillaDataManager. + """ + + config: GarfieldDataManagerConfig + train_pixel_sampler: Optional[GarfieldPixelSampler] = None + + def __init__( + self, + config: GarfieldDataManagerConfig, + device: Union[torch.device, str] = "cpu", + test_mode: Literal["test", "val", "inference"] = "val", + world_size: int = 1, + local_rank: int = 0, + **kwargs, # pylint: disable=unused-argument + ): + super().__init__( + config=config, + device=device, + test_mode=test_mode, + world_size=world_size, + local_rank=local_rank, + **kwargs, + ) + self.img_group_model: ImgGroupModel = self.config.img_group_model.setup(device=self.device) + + # This is where all the group data + statistics is stored. + # Note that this can get quite big (~10GB if 300 images, ...) + cache_dir = f"outputs/{self.config.dataparser.data.name}" + self.sam_data_path = Path(cache_dir) / "sam_data.hdf5" + + self.pixel_level_keys = None + self.scale_3d = None + self.group_cdf = None + self.scale_3d_statistics = None + + def load_sam_data(self) -> bool: + """ + Loads the SAM data (masks, 3D scales, etc.) through hdf5. + If the file doesn't exist, returns False. 
+ """ + prefix = self.img_group_model.config.model_type + if osp.exists(self.sam_data_path): + sam_data = h5py.File(self.sam_data_path, "r") + if prefix not in sam_data.keys(): + return False + + sam_data = sam_data[prefix] + + pixel_level_keys_list, scales_3d_list, group_cdf_list = [], [], [] + + num_entries = len(sam_data["pixel_level_keys"].keys()) + for i in range(num_entries): + pixel_level_keys_list.append( + torch.from_numpy(sam_data["pixel_level_keys"][str(i)][...]) + ) + self.pixel_level_keys = torch.nested.nested_tensor(pixel_level_keys_list) + del pixel_level_keys_list + + for i in range(num_entries): + scales_3d_list.append(torch.from_numpy(sam_data["scale_3d"][str(i)][...])) + self.scale_3d = torch.nested.nested_tensor(scales_3d_list) + self.scale_3d_statistics = torch.cat(scales_3d_list) + del scales_3d_list + + for i in range(num_entries): + group_cdf_list.append(torch.from_numpy(sam_data["group_cdf"][str(i)][...])) + self.group_cdf = torch.nested.nested_tensor(group_cdf_list) + del group_cdf_list + + return True + + return False + + def save_sam_data(self, pixel_level_keys, scale_3d, group_cdf): + """Save the SAM grouping data to hdf5.""" + prefix = self.img_group_model.config.model_type + # make the directory if it doesn't exist + if not osp.exists(self.sam_data_path.parent): + os.makedirs(self.sam_data_path.parent) + + # Append, not overwrite -- in case of multiple runs with different settings. + with h5py.File(self.sam_data_path, "a") as f: + for i in range(len(pixel_level_keys)): + f.create_dataset(f"{prefix}/pixel_level_keys/{i}", data=pixel_level_keys[i]) + f.create_dataset(f"{prefix}/scale_3d/{i}", data=scale_3d[i]) + f.create_dataset(f"{prefix}/group_cdf/{i}", data=group_cdf[i]) + + @staticmethod + def create_pixel_mask_array(masks: torch.Tensor): + """ + Create per-pixel data structure for grouping supervision. + pixel_mask_array[x, y] = [m1, m2, ...] means that pixel (x, y) belongs to masks m1, m2, ... + where Area(m1) < Area(m2) < ... (sorted by area). + """ + max_masks = masks.sum(dim=0).max().item() + image_shape = masks.shape[1:] + pixel_mask_array = torch.full( + (max_masks, image_shape[0], image_shape[1]), -1, dtype=torch.int + ).to(masks.device) + + for m, mask in enumerate(masks): + mask_clone = mask.clone() + for i in range(max_masks): + free = pixel_mask_array[i] == -1 + masked_area = mask_clone == 1 + right_index = free & masked_area + if len(pixel_mask_array[i][right_index]) != 0: + pixel_mask_array[i][right_index] = m + mask_clone[right_index] = 0 + pixel_mask_array = pixel_mask_array.permute(1, 2, 0) + + return pixel_mask_array + + def _calculate_3d_groups( + self, + rgb: torch.Tensor, + depth: torch.Tensor, + point: torch.Tensor, + max_scale: float = 2.0, + ): + """ + Calculate the set of groups and their 3D scale for each pixel, and the cdf. + Returns: + - pixel_level_keys: [H, W, max_masks] + - scale: [num_masks, 1] + - mask_cdf: [H, W, max_masks] + max_masks is the maximum number of masks that was assigned to a pixel in the image, + padded with -1s. mask_cdf does *not* include the -1s. + Refer to the main paper for more details. + """ + image_shape = rgb.shape[:2] + depth = depth.view(-1, 1) # (H*W, 1) + point = point.view(-1, 3) # (H*W, 3) + + def helper_return_no_masks(): + # Fail gracefully when no masks are found. + # Create dummy data (all -1s), which will be ignored later. 
+ # See: `get_loss_dict_group` in `garfield_model.py` + pixel_level_keys = torch.full( + (image_shape[0], image_shape[1], 1), -1, dtype=torch.int + ) + scale = torch.Tensor([0.0]).view(-1, 1) + mask_cdf = torch.full( + (image_shape[0], image_shape[1], 1), 1, dtype=torch.float + ) + return (pixel_level_keys, scale, mask_cdf) + + # Calculate SAM masks + masks = self.img_group_model((rgb.numpy() * 255).astype(np.uint8)) + + # If no masks are found, return dummy data. + if len(masks) == 0: + return helper_return_no_masks() + + sam_mask = [] + scale = [] + + # For all 2D groups, + # 1) Denoise the masks (through eroding) + all_masks = torch.stack( + # [torch.from_numpy(_["segmentation"]).to(self.device) for _ in masks] + [torch.from_numpy(_).to(self.device) for _ in masks] + ) + # erode all masks using 3x3 kernel + eroded_masks = torch.conv2d( + all_masks.unsqueeze(1).float(), + torch.full((3, 3), 1.0).view(1, 1, 3, 3).to("cuda"), + padding=1, + ) + eroded_masks = (eroded_masks >= 5).squeeze(1) # (num_masks, H, W) + + # 2) Calculate 3D scale + # Don't include groups with scale > max_scale (likely to be too noisy to be useful) + for i in range(len(masks)): + curr_mask = eroded_masks[i] + curr_mask = curr_mask.flatten() + curr_points = point[curr_mask] + extent = (curr_points.std(dim=0) * 2).norm() + if extent.item() < max_scale: + sam_mask.append(curr_mask.reshape(image_shape)) + scale.append(extent.item()) + + # If no masks are found, after postprocessing, return dummy data. + if len(sam_mask) == 0: + return helper_return_no_masks() + + sam_mask = torch.stack(sam_mask) # (num_masks, H, W) + scale = torch.Tensor(scale).view(-1, 1).to(self.device) # (num_masks, 1) + + # Calculate "pixel level keys", which is a 2D array of shape (H, W, max_masks) + # Each pixel has a list of group indices that it belongs to, in order of increasing scale. + pixel_level_keys = self.create_pixel_mask_array( + sam_mask + ).long() # (H, W, max_masks) + + # Calculate group sampling CDF, to bias sampling towards smaller groups + # Be careful to not include -1s in the CDF (padding, or unlabeled pixels) + # Inversely proportional to log of mask size. + mask_inds, counts = torch.unique(pixel_level_keys, return_counts=True) + mask_sorted = torch.argsort(counts) + mask_inds, counts = mask_inds[mask_sorted], counts[mask_sorted] + counts[0] = 0 # don't include -1 + probs = counts / counts.sum() # [-1, 0, ...] + mask_probs = torch.gather(probs, 0, pixel_level_keys.reshape(-1) + 1).view( + pixel_level_keys.shape + ) + mask_log_probs = torch.log(mask_probs) + never_masked = mask_log_probs.isinf() + mask_log_probs[never_masked] = 0.0 + mask_log_probs = mask_log_probs / ( + mask_log_probs.sum(dim=-1, keepdim=True) + 1e-6 + ) + mask_cdf = torch.cumsum(mask_log_probs, dim=-1) + mask_cdf[never_masked] = 1.0 + + return (pixel_level_keys.cpu(), scale.cpu(), mask_cdf.cpu()) + + def next_group(self, ray_bundle: RayBundle, batch: Dict[str, Any]): + """Returns the rays' mask and 3D scales for grouping. + We add to `batch` the following: + - "mask_id": [batch_size,] + - "scale": [batch_size,] + - "nPxImg": int == `num_rays_per_image` + This function also adds `scale` to `ray_bundle.metadata`. + + We're using torch nested tensors -- this means that it's difficult to index into them. + At least now, it seems possible to index normally into a leaf tensor. 
+ """ + indices = batch["indices"].long().detach().cpu() + npximg = self.train_pixel_sampler.num_rays_per_image + img_ind = indices[:, 0] + x_ind = indices[:, 1] + y_ind = indices[:, 2] + + # sampled_imgs = img_ind[::npximg] + mask_id = torch.zeros((indices.shape[0],), device=self.device) + scale = torch.zeros((indices.shape[0],), device=self.device) + + random_vec_sampling = (torch.rand((1,)) * torch.ones((npximg,))).view(-1, 1) + random_vec_densify = (torch.rand((1,)) * torch.ones((npximg,))).view(-1, 1) + + for i in range(0, indices.shape[0], npximg): + img_idx = img_ind[i] + + # Use `random_vec` to choose a group for each pixel. + per_pixel_index = self.pixel_level_keys[img_idx][ + x_ind[i : i + npximg], y_ind[i : i + npximg] + ] + random_index = torch.sum( + random_vec_sampling.view(-1, 1) + > self.group_cdf[img_idx][x_ind[i : i + npximg], y_ind[i : i + npximg]], + dim=-1, + ) + + # `per_pixel_index` encodes the list of groups that each pixel belongs to. + # If there's only one group, then `per_pixel_index` is a 1D tensor + # -- this will mess up the future `gather` operations. + if per_pixel_index.shape[-1] == 1: + per_pixel_mask = per_pixel_index.squeeze() + else: + per_pixel_mask = torch.gather( + per_pixel_index, 1, random_index.unsqueeze(-1) + ).squeeze() + per_pixel_mask_ = torch.gather( + per_pixel_index, + 1, + torch.max(random_index.unsqueeze(-1) - 1, torch.Tensor([0]).int()), + ).squeeze() + + mask_id[i : i + npximg] = per_pixel_mask.to(self.device) + + # interval scale supervision + curr_scale = self.scale_3d[img_idx][per_pixel_mask] + curr_scale[random_index == 0] = ( + self.scale_3d[img_idx][per_pixel_mask][random_index == 0] + * random_vec_densify[random_index == 0] + ) + for j in range(1, self.group_cdf[img_idx].shape[-1]): + if (random_index == j).sum() == 0: + continue + curr_scale[random_index == j] = ( + self.scale_3d[img_idx][per_pixel_mask_][random_index == j] + + ( + self.scale_3d[img_idx][per_pixel_mask][random_index == j] + - self.scale_3d[img_idx][per_pixel_mask_][random_index == j] + ) + * random_vec_densify[random_index == j] + ) + scale[i : i + npximg] = curr_scale.squeeze().to(self.device) + + batch["mask_id"] = mask_id + batch["scale"] = scale + batch["nPxImg"] = npximg + ray_bundle.metadata["scale"] = batch["scale"] diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_field.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_field.py new file mode 100644 index 0000000000..71f7e7e433 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_field.py @@ -0,0 +1,155 @@ +from enum import Enum +from typing import Dict, List, Optional, Tuple, Callable, Any, Type +from dataclasses import dataclass, field + +import numpy as np +import torch +import torch.nn.functional as F +from nerfstudio.cameras.rays import RaySamples +from nerfstudio.configs.base_config import InstantiateConfig +from nerfstudio.data.scene_box import SceneBox +from nerfstudio.field_components.activations import trunc_exp +from nerfstudio.field_components.field_heads import FieldHeadNames +from nerfstudio.field_components.spatial_distortions import SceneContraction +from nerfstudio.fields.base_field import Field +from torch import nn +from torch.nn.parameter import Parameter +from torchtyping import TensorType + +try: + import tinycudann as tcnn +except ImportError: + pass + + +class GarfieldFieldHeadNames(Enum): + """Possible field outputs""" + + INSTANCE = "instance" + HASHGRID = "hashgrid" + + +@dataclass +class 
GarfieldFieldConfig(InstantiateConfig): + _target: Type = field(default_factory=lambda: GarfieldField) + """The field class to instantiate.""" + + n_instance_dims: int = 256 + hashgrid_cfg: Dict[str, Any] = field( + default_factory=lambda: { + "resolution_range": [(16, 256), (256, 2048)], + "level": [12, 12], + } + ) + """Field parameters. """ + + use_single_scale: bool = False + """For single-scale ablation. For full GARField, set to False.""" + + +class GarfieldField(Field): + quantile_transformer: Callable[[TensorType], TensorType] + config: GarfieldFieldConfig + + def __init__( + self, + config: GarfieldFieldConfig, + ): + super().__init__() + self.config = config + self.spatial_distortion: SceneContraction = SceneContraction() + self.use_single_scale = self.config.use_single_scale + hashgrid_cfg = self.config.hashgrid_cfg + instance_n_dims = self.config.n_instance_dims + use_single_scale = self.config.use_single_scale + + # This is a trick to make the hashgrid encoding work with the TCNN library. + self.enc_list = torch.nn.ModuleList( + [ + self._get_encoding( + hashgrid_cfg["resolution_range"][i], hashgrid_cfg["level"][i] + ) + for i in range(len(hashgrid_cfg["level"])) + ] + ) + tot_out_dims = sum([e.n_output_dims for e in self.enc_list]) + + # This is the MLP that takes the hashgrid encoding as input. + # Note the +1 for the scale input. + self.instance_net = tcnn.Network( + n_input_dims=tot_out_dims + (0 if use_single_scale else 1), + n_output_dims=instance_n_dims, + network_config={ + "otype": "CutlassMLP", + "activation": "ReLU", + "output_activation": "None", + "n_neurons": 256, + "n_hidden_layers": 4, + }, + ) + self.quantile_transformer = None # for scale normalization + + @staticmethod + def _get_encoding( + res_range: Tuple[int, int], levels: int, indim=3, hash_size=19 + ) -> tcnn.Encoding: + """ + Helper function to create a HashGrid encoding. + """ + start_res, end_res = res_range + growth = np.exp((np.log(end_res) - np.log(start_res)) / (levels - 1)) + enc = tcnn.Encoding( + n_input_dims=indim, + encoding_config={ + "otype": "HashGrid", + "n_levels": levels, + "n_features_per_level": 8, + "log2_hashmap_size": hash_size, + "base_resolution": start_res, + "per_level_scale": growth, + }, + ) + return enc + + def get_outputs(self, ray_samples: RaySamples) -> Dict[FieldHeadNames, TensorType]: + """ + This function is not supported for GARField -- please use get_hash and get_mlp instead. + get_mlp assumes that hash values are normalized, which requires the renderer (in the model). + """ + raise NotImplementedError + + def get_hash(self, ray_samples: RaySamples) -> TensorType: + """Get the hashgrid encoding. Note that this function does *not* normalize the hash values.""" + positions = ray_samples.frustums.get_positions().detach() + positions = self.spatial_distortion(positions) + positions = (positions + 2.0) / 4.0 + + xs = [e(positions.view(-1, 3)) for e in self.enc_list] + x = torch.concat(xs, dim=-1) + hash = x.view(*ray_samples.frustums.shape, -1) + return hash + + def get_mlp(self, hash: TensorType, instance_scales: TensorType) -> TensorType: + """ + Get the GARField affinity field outputs. Note that this is scale-conditioned. + This function *does* assume that the hash values are normalized. + The MLP output is normalized to unit length. 
+ """ + assert self.quantile_transformer is not None + + # Check that # of rays is the same as # of scales + assert hash.shape[0] == instance_scales.shape[0] + + epsilon = 1e-5 + if self.use_single_scale: + instance_pass = self.instance_net(hash) + return instance_pass / (instance_pass.norm(dim=-1, keepdim=True) + epsilon) + + scales = instance_scales.contiguous().view(-1, 1) + + # Normalize scales before passing to MLP + scales = self.quantile_transformer(scales) + instance_pass = self.instance_net(torch.cat([hash, scales], dim=-1)) + + norms = instance_pass.norm(dim=-1, keepdim=True) + return instance_pass / (norms + epsilon) diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_gaussian_pipeline.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_gaussian_pipeline.py new file mode 100644 index 0000000000..c4bcc21772 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_gaussian_pipeline.py @@ -0,0 +1,648 @@ +import typing +from dataclasses import dataclass, field +from typing import Literal, Type, Mapping, Any, Optional, List, Dict +from torchtyping import TensorType +from pathlib import Path +import trimesh +import viser +import viser.transforms as vtf +import open3d as o3d +import cv2 +import time + +import torch +from nerfstudio.pipelines.base_pipeline import VanillaPipeline, VanillaPipelineConfig +from torch.cuda.amp.grad_scaler import GradScaler +from nerfstudio.viewer.viewer_elements import * +from nerfstudio.viewer.viewer import VISER_NERFSTUDIO_SCALE_RATIO +from nerfstudio.models.splatfacto import SplatfactoModel + +from cuml.cluster.hdbscan import HDBSCAN +from nerfstudio.models.splatfacto import RGB2SH + +import tqdm + +from sklearn.preprocessing import QuantileTransformer +from sklearn.neighbors import NearestNeighbors + +from scipy.spatial.transform import Rotation as Rot + +from garfield.garfield_datamanager import GarfieldDataManagerConfig, GarfieldDataManager +from garfield.garfield_model import GarfieldModel, GarfieldModelConfig +from garfield.garfield_pipeline import GarfieldPipelineConfig, GarfieldPipeline + +def quat_to_rotmat(quat): + assert quat.shape[-1] == 4, quat.shape + w, x, y, z = torch.unbind(quat, dim=-1) + mat = torch.stack( + [ + 1 - 2 * (y**2 + z**2), + 2 * (x * y - w * z), + 2 * (x * z + w * y), + 2 * (x * y + w * z), + 1 - 2 * (x**2 + z**2), + 2 * (y * z - w * x), + 2 * (x * z - w * y), + 2 * (y * z + w * x), + 1 - 2 * (x**2 + y**2), + ], + dim=-1, + ) + return mat.reshape(quat.shape[:-1] + (3, 3)) + +def generate_random_colors(N=5000) -> torch.Tensor: + """Generate random colors for visualization""" + hs = np.random.uniform(0, 1, size=(N, 1)) + ss = np.random.uniform(0.6, 0.61, size=(N, 1)) + vs = np.random.uniform(0.84, 0.95, size=(N, 1)) + hsv = np.concatenate([hs, ss, vs], axis=-1) + # convert to rgb + rgb = cv2.cvtColor((hsv * 255).astype(np.uint8)[None, ...], cv2.COLOR_HSV2RGB) + return torch.Tensor(rgb.squeeze() / 255.0) + + +@dataclass +class GarfieldGaussianPipelineConfig(VanillaPipelineConfig): + """Gaussian Splatting, but also loading GARField grouping field from ckpt.""" + _target: Type = field(default_factory=lambda: GarfieldGaussianPipeline) + garfield_ckpt: Optional[Path] = None # Need to specify this + + +class GarfieldGaussianPipeline(VanillaPipeline): + """ + Trains a Gaussian Splatting model, but also loads a GARField grouping field from ckpt. 
+ This grouping field allows you to: + - interactive click-based group selection (you can drag it around) + - scene clustering, then group selection (also can drag it around) + + Note that the pipeline training must be stopped before you can interact with the scene!! + """ + model: SplatfactoModel + garfield_pipeline: List[GarfieldPipeline] # To avoid importing Viewer* from nerf pipeline + state_stack: List[Dict[str, TensorType]] # To revert to previous state + click_location: Optional[TensorType] # For storing click location + click_handle: Optional[viser.GlbHandle] # For storing click handle + crop_group_list: List[TensorType] # For storing gaussian crops (based on click point) + crop_transform_handle: Optional[viser.TransformControlsHandle] # For storing scene transform handle -- drag! + cluster_labels: Optional[TensorType] # For storing cluster labels + + def __init__( + self, + config: GarfieldGaussianPipelineConfig, + device: str, + test_mode: Literal["test", "val", "inference"] = "val", + world_size: int = 1, + local_rank: int = 0, + grad_scaler: typing.Optional[GradScaler] = None, + ): + super().__init__(config, device, test_mode, world_size, local_rank, grad_scaler) + + print("Loading instance feature model...") + assert config.garfield_ckpt is not None, "Need to specify garfield checkpoint" + from nerfstudio.utils.eval_utils import eval_setup + _, garfield_pipeline, _, _ = eval_setup( + config.garfield_ckpt, test_mode="inference" + ) + self.garfield_pipeline = [garfield_pipeline] + self.state_stack = [] + + self.colormap = generate_random_colors() + + self.viewer_control = ViewerControl() + + self.a_interaction_method = ViewerDropdown( + "Interaction Method", + default_value="Interactive", + options=["Interactive", "Clustering"], + cb_hook=self._update_interaction_method + ) + + self.click_gaussian = ViewerButton(name="Click", cb_hook=self._click_gaussian) + self.click_location = None + self.click_handle = None + + self.crop_to_click = ViewerButton(name="Crop to Click", cb_hook=self._crop_to_click, disabled=True) + self.crop_to_group_level = ViewerSlider(name="Group Level", min_value=0, max_value=29, step=1, default_value=0, cb_hook=self._update_crop_vis, disabled=True) + self.crop_group_list = [] + + self.move_current_crop = ViewerButton(name="Drag Current Crop", cb_hook=self._drag_current_crop, disabled=True) + self.crop_transform_handle = None + + self.cluster_scene = ViewerButton(name="Cluster Scene", cb_hook=self._cluster_scene, disabled=False, visible=False) + self.cluster_scene_scale = ViewerSlider(name="Cluster Scale", min_value=0.0, max_value=2.0, step=0.01, default_value=0.0, disabled=False, visible=False) + self.cluster_scene_shuffle_colors = ViewerButton(name="Reshuffle Cluster Colors", cb_hook=self._reshuffle_cluster_colors, disabled=False, visible=False) + self.cluster_labels = None + + self.reset_state = ViewerButton(name="Reset State", cb_hook=self._reset_state, disabled=True) + + self.z_export_options = ViewerCheckbox(name="Export Options", default_value=False, cb_hook=self._update_export_options) + self.z_export_options_visible_gaussians = ViewerButton( + name="Export Visible Gaussians", + visible=False, + cb_hook=self._export_visible_gaussians + ) + self.z_export_options_camera_path_filename = ViewerText("Camera Path Filename", "", visible=False) + self.z_export_options_camera_path_render = ViewerButton("Render Current Pipeline", cb_hook=self.render_from_path, visible=False) + + def _update_interaction_method(self, dropdown: ViewerDropdown): + """Update the UI 
based on the interaction method"""
+ hide_in_interactive = (not (dropdown.value == "Interactive")) # True when not in "Interactive" mode; used to hide the click-interaction widgets
+
+ self.cluster_scene.set_hidden((not hide_in_interactive))
+ self.cluster_scene_scale.set_hidden((not hide_in_interactive))
+ self.cluster_scene_shuffle_colors.set_hidden((not hide_in_interactive))
+
+ self.click_gaussian.set_hidden(hide_in_interactive)
+ self.crop_to_click.set_hidden(hide_in_interactive)
+ self.crop_to_group_level.set_hidden(hide_in_interactive)
+ self.move_current_crop.set_hidden(hide_in_interactive)
+
+ def _update_export_options(self, checkbox: ViewerCheckbox):
+ """Update the UI based on the export options"""
+ self.z_export_options_camera_path_filename.set_hidden(not checkbox.value)
+ self.z_export_options_camera_path_render.set_hidden(not checkbox.value)
+ self.z_export_options_visible_gaussians.set_hidden(not checkbox.value)
+
+ def _reset_state(self, button: ViewerButton):
+ """Revert to previous saved state"""
+ assert len(self.state_stack) > 0, "No previous state to revert to"
+ prev_state = self.state_stack.pop()
+ for name in self.model.gauss_params.keys():
+ self.model.gauss_params[name] = prev_state[name]
+
+ self.click_location = None
+ if self.click_handle is not None:
+ self.click_handle.remove()
+ self.click_handle = None
+
+ self.click_gaussian.set_disabled(False)
+
+ self.crop_to_click.set_disabled(True)
+ self.crop_to_group_level.set_disabled(True)
+ # self.crop_to_group_level.value = 0
+ self.move_current_crop.set_disabled(True)
+ self.crop_group_list = []
+ if self.crop_transform_handle is not None:
+ self.crop_transform_handle.remove()
+ self.crop_transform_handle = None
+ if len(self.state_stack) == 0:
+ self.reset_state.set_disabled(True)
+
+ self.cluster_labels = None
+ self.cluster_scene.set_disabled(False)
+
+ def _queue_state(self):
+ """Save current state to stack"""
+ import copy
+ self.state_stack.append(copy.deepcopy({k: v.detach() for k, v in self.model.gauss_params.items()}))
+ self.reset_state.set_disabled(False)
+
+ def _click_gaussian(self, button: ViewerButton):
+ """Start listening for click-based 3D point specification.
+ Refer to garfield_interaction.py for more details."""
+ def del_handle_on_rayclick(click: ViewerClick):
+ self._on_rayclick(click)
+ self.click_gaussian.set_disabled(False)
+ self.crop_to_click.set_disabled(False)
+ self.viewer_control.unregister_click_cb(del_handle_on_rayclick)
+
+ self.click_gaussian.set_disabled(True)
+ self.viewer_control.register_click_cb(del_handle_on_rayclick)
+
+ def _on_rayclick(self, click: ViewerClick):
+ """On click, calculate the 3D position of the click and visualize it. 
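+ The click ray is projected through the camera intrinsics to find the clicked
+ pixel; the rendered depth at that pixel then lifts the click into 3D.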
+ Refer to garfield_interaction.py for more details."""
+
+ cam = self.viewer_control.get_camera(500, None, 0)
+ cam2world = cam.camera_to_worlds[0, :3, :3]
+
+ x_pi = vtf.SO3.from_x_radians(np.pi).as_matrix().astype(np.float32)
+ world2cam = (cam2world @ x_pi).inverse()
+ # rotate the ray around into cam coordinates
+ newdir = world2cam @ torch.tensor(click.direction).unsqueeze(-1)
+ z_dir = newdir[2].item()
+ # project it into coordinates with matrix
+ K = cam.get_intrinsics_matrices()[0]
+ coords = K @ newdir
+ coords = coords / coords[2]
+ pix_x, pix_y = int(coords[0]), int(coords[1])
+ self.model.eval()
+ outputs = self.model.get_outputs(cam.to(self.device))
+ self.model.train()
+ with torch.no_grad():
+ depth = outputs["depth"][pix_y, pix_x].cpu().numpy()
+
+ self.click_location = np.array(click.origin) + np.array(click.direction) * (depth / z_dir)
+
+ sphere_mesh = trimesh.creation.icosphere(radius=0.2)
+ sphere_mesh.visual.vertex_colors = (0.0, 1.0, 0.0, 1.0) # type: ignore
+ self.click_handle = self.viewer_control.viser_server.add_mesh_trimesh(
+ name=f"/click",
+ mesh=sphere_mesh,
+ position=VISER_NERFSTUDIO_SCALE_RATIO * self.click_location,
+ )
+
+ def _crop_to_click(self, button: ViewerButton):
+ """Crop to click location"""
+ assert self.click_location is not None, "Need to specify click location"
+
+ self._queue_state() # Save current state
+ curr_means = self.model.gauss_params['means'].detach()
+ self.model.eval()
+
+ # The only way to reset is to reset the state using the reset button.
+ self.click_gaussian.set_disabled(True) # Disable user from changing click
+ self.crop_to_click.set_disabled(True) # Disable user from changing click
+
+ # Get the 3D location of the click
+ location = self.click_location
+ location = torch.tensor(location).view(1, 3).to(self.device)
+
+ # The list of positions to query for garfield features. The first one is the click location.
+ positions = torch.cat([location, curr_means]) # (1+N) x 3
+
+ # Create a kdtree, to get the closest gaussian to the click-point.
+ points = o3d.geometry.PointCloud(o3d.utility.Vector3dVector(curr_means.cpu().numpy()))
+ kdtree = o3d.geometry.KDTreeFlann(points)
+ _, inds, _ = kdtree.search_knn_vector_3d(location.view(3, -1).float().detach().cpu().numpy(), 10)
+
+ # get the closest point to the sphere, using kdtree
+ sphere_inds = inds
+ scales = torch.ones((positions.shape[0], 1)).to(self.device)
+
+ keep_list = []
+ prev_group = None
+
+ # Iterate over different scales, to get a range of possible groupings.
+ grouping_model = self.garfield_pipeline[0].model
+ for s in tqdm.tqdm(torch.linspace(0, 1.5, 30)):
+ # Calculate the grouping features, and calculate the affinity between click point and scene
+ instances = grouping_model.get_grouping_at_points(positions, s) # (1+N, 256)
+ click_instance = instances[0]
+ affinity = torch.norm(click_instance - instances, dim=1)[1:]
+
+ # Filter out points that have affinity < 0.5 (i.e., not likely to be in the same group)
+ keeps = torch.where(affinity < 0.5)[0].cpu()
+ keep_points = points.select_by_index(keeps.tolist()) # indices of gaussians
+
+ # Here, we desire the gaussian groups to be grouped tightly together spatially.
+ # We use DBSCAN to group the gaussians together, and choose the cluster that contains the click point.
+ # Note that there may be spuriously high affinity between points that are spatially far apart,
+ # possibly due to two different groups being considered together at an odd angle / far viewpoint. 
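+ # (Numerically: the features are unit norm, so ||f_a - f_b||^2 = 2 - 2*cos(f_a, f_b);
+ # the 0.5 distance threshold above corresponds to cosine similarity >= 0.875.)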
+ + # If there are too many points, we downsample them first before DBSCAN. + # Then, we assign the filtered points to the cluster of the nearest downsampled point. + if len(keeps) > 5000: + curr_point_min = keep_points.get_min_bound() + curr_point_max = keep_points.get_max_bound() + + downsample_size = 0.01 * s + _, _, curr_points_ds_ids = keep_points.voxel_down_sample_and_trace( + voxel_size=max(downsample_size, 0.0001), + min_bound=curr_point_min, + max_bound=curr_point_max, + ) + curr_points_ds_ids = np.array([points[0] for points in curr_points_ds_ids]) + curr_points_ds = keep_points.select_by_index(curr_points_ds_ids) + curr_points_ds_selected = np.zeros(len(keep_points.points), dtype=bool) + curr_points_ds_selected[curr_points_ds_ids] = True + + _clusters = np.asarray(curr_points_ds.cluster_dbscan(eps=0.02, min_points=5)) + nn_model = NearestNeighbors( + n_neighbors=1, algorithm="auto", metric="euclidean" + ).fit(np.asarray(curr_points_ds.points)) + + _, indices = nn_model.kneighbors(np.asarray(keep_points.points)[~curr_points_ds_selected]) + + clusters = np.zeros(len(keep_points.points), dtype=int) + clusters[curr_points_ds_selected] = _clusters + clusters[~curr_points_ds_selected] = _clusters[indices[:, 0]] + + else: + clusters = np.asarray(keep_points.cluster_dbscan(eps=0.02, min_points=5)) + + # Choose the cluster that contains the click point. If there is none, move to the next scale. + cluster_inds = clusters[np.isin(keeps, sphere_inds)] + cluster_inds = cluster_inds[cluster_inds != -1] + if len(cluster_inds) == 0: + continue + cluster_ind = cluster_inds[0] + + keeps = keeps[np.where(clusters == cluster_ind)] + + if prev_group is None: + prev_group = keeps + keep_list.append(keeps) + continue + + keeps = torch.cat([prev_group, keeps]) + keeps = torch.unique(keeps) + + # # Deduplication, based on the # of current points included in the previous group. + # overlap = torch.isin(keeps, prev_group).sum() + # if overlap < 0.8 * len(keeps): + # prev_group = keeps + keep_list.append(keeps) + + if len(keep_list) == 0: + print("No gaussians within crop, aborting") + # The only way to reset is to reset the state using the reset button. 
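+ # Re-enable the click buttons so another point can be selected; the state
+ # saved by _queue_state() above can still be popped via the reset button.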
+ self.click_gaussian.set_disabled(False) + self.crop_to_click.set_disabled(False) + return + + # Remove the click handle + visualization + self.click_location = None + self.click_handle.remove() + self.click_handle = None + + self.crop_group_list = keep_list + self.crop_to_group_level.set_disabled(False) + self.crop_to_group_level.value = 29 + self.move_current_crop.set_disabled(False) + + def _update_crop_vis(self, number: ViewerSlider): + """Update which click-based crop to visualize -- this requires that _crop_to_click has been called.""" + # If there is no click-based crop or saved state to crop from, do nothing + if len(self.crop_group_list) == 0: + return + if len(self.state_stack) == 0: + return + + # Clamp the number to be within the range of possible crops + if number.value > len(self.crop_group_list) - 1: + number.value = len(self.crop_group_list) - 1 + return + elif number.value < 0: + number.value = 0 + return + + keep_inds = self.crop_group_list[number.value] + prev_state = self.state_stack[-1] + for name in self.model.gauss_params.keys(): + self.model.gauss_params[name] = prev_state[name][keep_inds] + + def _drag_current_crop(self, button: ViewerButton): + """Add a transform control to the current scene, and update the model accordingly.""" + self.crop_to_group_level.set_disabled(True) # Disable user from changing crop + self.move_current_crop.set_disabled(True) # Disable user from creating another drag handle + + scene_centroid = self.model.gauss_params['means'].detach().mean(dim=0) + self.crop_transform_handle = self.viewer_control.viser_server.add_transform_controls( + name=f"/scene_transform", + position=(VISER_NERFSTUDIO_SCALE_RATIO*scene_centroid).cpu().numpy(), + ) + + # Visualize the whole scene -- the points corresponding to the crop will be controlled by the transform handle. 
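+ # (The on_update callback below applies a rigid transform to the cropped subset:
+ # means'[crop] = t + R @ (means[crop] - centroid(means[crop]))
+ # quats'[crop] = quat(R @ R_crop), converted back to wxyz ordering,
+ # where t and R come from the viser transform handle.)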
+ crop_inds = self.crop_group_list[self.crop_to_group_level.value] + prev_state = self.state_stack[-1] + for name in self.model.gauss_params.keys(): + self.model.gauss_params[name] = prev_state[name].clone() + + curr_means = self.model.gauss_params['means'].clone().detach() + curr_rotmats = quat_to_rotmat(self.model.gauss_params['quats'][crop_inds].detach()) + + @self.crop_transform_handle.on_update + def _(_): + handle_position = torch.tensor(self.crop_transform_handle.position).to(self.device) + handle_position = handle_position / VISER_NERFSTUDIO_SCALE_RATIO + handle_rotmat = quat_to_rotmat(torch.tensor(self.crop_transform_handle.wxyz).to(self.device).float()) + + means = self.model.gauss_params['means'].detach() + quats = self.model.gauss_params['quats'].detach() + + means[crop_inds] = handle_position.float() + torch.matmul( + handle_rotmat, (curr_means[crop_inds] - curr_means[crop_inds].mean(dim=0)).T + ).T + quats[crop_inds] = torch.Tensor(Rot.from_matrix( + torch.matmul(handle_rotmat.float(), curr_rotmats.float()).cpu().numpy() + ).as_quat()).to(self.device) # this is in xyzw format + quats[crop_inds] = quats[crop_inds][:, [3, 0, 1, 2]] # convert to wxyz format + + self.model.gauss_params['means'] = torch.nn.Parameter(means.float()) + self.model.gauss_params['quats'] = torch.nn.Parameter(quats.float()) + + self.viewer_control.viewer._trigger_rerender() # trigger viewer rerender + + def _reshuffle_cluster_colors(self, button: ViewerButton): + """Reshuffle the cluster colors, if clusters defined using `_cluster_scene`.""" + if self.cluster_labels is None: + return + self.cluster_scene_shuffle_colors.set_disabled(True) # Disable user from reshuffling colors + self.colormap = generate_random_colors() + colormap = self.colormap + + labels = self.cluster_labels + + features_dc = self.model.gauss_params['features_dc'].detach() + features_rest = self.model.gauss_params['features_rest'].detach() + for c_id in range(0, labels.max().int().item() + 1): + # set the colors of the gaussians accordingly using colormap from matplotlib + cluster_mask = np.where(labels == c_id) + features_dc[cluster_mask] = RGB2SH(colormap[c_id, :3].to(self.model.gauss_params['features_dc'])) + features_rest[cluster_mask] = 0 + + self.model.gauss_params['features_dc'] = torch.nn.Parameter(self.model.gauss_params['features_dc']) + self.model.gauss_params['features_rest'] = torch.nn.Parameter(self.model.gauss_params['features_rest']) + self.cluster_scene_shuffle_colors.set_disabled(False) + + def _cluster_scene(self, button: ViewerButton): + """Cluster the scene, and assign gaussian colors based on the clusters. + Also populates self.crop_group_list with the clusters group indices.""" + + self._queue_state() # Save current state + self.cluster_scene.set_disabled(True) # Disable user from clustering, while clustering + + scale = self.cluster_scene_scale.value + grouping_model = self.garfield_pipeline[0].model + + positions = self.model.gauss_params['means'].detach() + group_feats = grouping_model.get_grouping_at_points(positions, scale).cpu().numpy() # (N, 256) + positions = positions.cpu().numpy() + + start = time.time() + + # Cluster the gaussians using HDBSCAN. + # We will first cluster the downsampled gaussians, then + # assign the full gaussians to the spatially closest downsampled gaussian. 
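+ # (Sketch of the pattern implemented below, with positions as an (N, 3) array:
+ # 1. voxel-downsample the gaussian centers (voxel edge ~1% of the chosen scale),
+ # 2. run HDBSCAN on the garfield features of the surviving points,
+ # 3. propagate labels to the dropped points via 1-nearest-neighbor in 3D.)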
+
+ vec_o3d = o3d.utility.Vector3dVector(positions)
+ pc_o3d = o3d.geometry.PointCloud(vec_o3d)
+ min_bound = np.clip(pc_o3d.get_min_bound(), -1, 1)
+ max_bound = np.clip(pc_o3d.get_max_bound(), -1, 1)
+ # downsample size to be a percent of the bounding box extent
+ downsample_size = 0.01 * scale
+ pc, _, ids = pc_o3d.voxel_down_sample_and_trace(
+ max(downsample_size, 0.0001), min_bound, max_bound
+ )
+ if len(ids) > 1e6:
+ print(f"Too many points ({len(ids)}) to cluster.")
+ print("Consider using interactive select to reduce points before clustering.")
+ print("Are you sure you want to cluster? Press y to continue, else return.")
+ # wait for input to continue, if yes then continue, else return
+ if input() != "y":
+ self.cluster_scene.set_disabled(False)
+ return
+
+ id_vec = np.array([points[0] for points in ids]) # indices of gaussians kept after downsampling
+ group_feats_downsampled = group_feats[id_vec]
+ positions_downsampled = np.array(pc.points)
+
+ print(f"Clustering {group_feats_downsampled.shape[0]} gaussians... ", end="", flush=True)
+
+ # Run cuml-based HDBSCAN
+ clusterer = HDBSCAN(
+ cluster_selection_epsilon=0.1,
+ min_samples=30,
+ min_cluster_size=30,
+ allow_single_cluster=True,
+ ).fit(group_feats_downsampled)
+
+ non_clustered = np.ones(positions.shape[0], dtype=bool)
+ non_clustered[id_vec] = False
+ labels = clusterer.labels_.copy()
+ clusterer.labels_ = -np.ones(positions.shape[0], dtype=np.int32)
+ clusterer.labels_[id_vec] = labels
+
+ # Assign the full gaussians to the spatially closest downsampled gaussian, with sklearn NearestNeighbors.
+ positions_np = positions[non_clustered]
+ if positions_np.shape[0] > 0: # i.e., if there were points removed during downsampling
+ k = 1
+ nn_model = NearestNeighbors(
+ n_neighbors=k, algorithm="auto", metric="euclidean"
+ ).fit(positions_downsampled)
+ _, indices = nn_model.kneighbors(positions_np)
+ clusterer.labels_[non_clustered] = labels[indices[:, 0]]
+
+ labels = clusterer.labels_
+ print(f"done. Took {time.time()-start} seconds. 
Found {labels.max() + 1} clusters.")
+
+ noise_mask = labels == -1
+ if noise_mask.sum() != 0 and (labels >= 0).sum() > 0:
+ # if there is noise, but not all of it is noise, relabel the noise
+ valid_mask = labels >= 0
+ valid_positions = positions[valid_mask]
+ k = 1
+ nn_model = NearestNeighbors(
+ n_neighbors=k, algorithm="auto", metric="euclidean"
+ ).fit(valid_positions)
+ noise_positions = positions[noise_mask]
+ _, indices = nn_model.kneighbors(noise_positions)
+ # for now just pick the closest cluster
+ noise_relabels = labels[valid_mask][indices[:, 0]]
+ labels[noise_mask] = noise_relabels
+ clusterer.labels_ = labels
+
+ labels = clusterer.labels_
+
+ colormap = self.colormap
+
+ opacities = self.model.gauss_params['opacities'].detach()
+ opacities[labels < 0] = -100 # hide unclustered gaussians
+ self.model.gauss_params['opacities'] = torch.nn.Parameter(opacities.float())
+
+ self.cluster_labels = torch.Tensor(labels)
+ features_dc = self.model.gauss_params['features_dc'].detach()
+ features_rest = self.model.gauss_params['features_rest'].detach()
+ for c_id in range(0, labels.max() + 1):
+ # set the colors of the gaussians accordingly using colormap from matplotlib
+ cluster_mask = np.where(labels == c_id)
+ features_dc[cluster_mask] = RGB2SH(colormap[c_id, :3].to(self.model.gauss_params['features_dc']))
+ features_rest[cluster_mask] = 0
+
+ self.model.gauss_params['features_dc'] = torch.nn.Parameter(self.model.gauss_params['features_dc'])
+ self.model.gauss_params['features_rest'] = torch.nn.Parameter(self.model.gauss_params['features_rest'])
+
+ self.cluster_scene.set_disabled(False)
+ self.viewer_control.viewer._trigger_rerender() # trigger viewer rerender
+
+ def _export_visible_gaussians(self, button: ViewerButton):
+ """Export the visible gaussians to a .ply file"""
+ # location to save
+ output_dir = f"outputs/{self.datamanager.config.dataparser.data.name}"
+ filename = Path(output_dir) / "gaussians.ply"
+
+ # Copied from exporter.py
+ from collections import OrderedDict
+ map_to_tensors = OrderedDict()
+ model = self.model
+
+ with torch.no_grad():
+ positions = model.means.cpu().numpy()
+ count = positions.shape[0]
+ n = count
+ map_to_tensors["x"] = positions[:, 0]
+ map_to_tensors["y"] = positions[:, 1]
+ map_to_tensors["z"] = positions[:, 2]
+ map_to_tensors["nx"] = np.zeros(n, dtype=np.float32)
+ map_to_tensors["ny"] = np.zeros(n, dtype=np.float32)
+ map_to_tensors["nz"] = np.zeros(n, dtype=np.float32)
+
+ if model.config.sh_degree > 0:
+ shs_0 = model.shs_0.contiguous().cpu().numpy()
+ for i in range(shs_0.shape[1]):
+ map_to_tensors[f"f_dc_{i}"] = shs_0[:, i, None]
+
+ # transpose(1, 2) was needed to match the sh order in Inria version
+ shs_rest = model.shs_rest.transpose(1, 2).contiguous().cpu().numpy()
+ shs_rest = shs_rest.reshape((n, -1))
+ for i in range(shs_rest.shape[-1]):
+ map_to_tensors[f"f_rest_{i}"] = shs_rest[:, i, None]
+ else:
+ colors = torch.clamp(model.colors.clone(), 0.0, 1.0).data.cpu().numpy()
+ map_to_tensors["colors"] = (colors * 255).astype(np.uint8)
+
+ map_to_tensors["opacity"] = model.opacities.data.cpu().numpy()
+
+ scales = model.scales.data.cpu().numpy()
+ for i in range(3):
+ map_to_tensors[f"scale_{i}"] = scales[:, i, None]
+
+ quats = model.quats.data.cpu().numpy()
+ for i in range(4):
+ map_to_tensors[f"rot_{i}"] = quats[:, i, None]
+
+ # post optimization, it is possible to have NaN/Inf values in some attributes
+ # to ensure the exported ply file has finite values, we enforce finite filters. 
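+ # The loop below ANDs a per-attribute finiteness mask into `select`; every
+ # attribute is then compacted with the same boolean index so arrays stay aligned.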
+ select = np.ones(n, dtype=bool) + for k, t in map_to_tensors.items(): + n_before = np.sum(select) + select = np.logical_and(select, np.isfinite(t).all(axis=-1)) + n_after = np.sum(select) + if n_after < n_before: + CONSOLE.print(f"{n_before - n_after} NaN/Inf elements in {k}") + + if np.sum(select) < n: + CONSOLE.print(f"values have NaN/Inf in map_to_tensors, only export {np.sum(select)}/{n}") + for k, t in map_to_tensors.items(): + map_to_tensors[k] = map_to_tensors[k][select] + count = np.sum(select) + from nerfstudio.scripts.exporter import ExportGaussianSplat + ExportGaussianSplat.write_ply(str(filename), count, map_to_tensors) + + + def render_from_path(self, button: ViewerButton): + from nerfstudio.cameras.camera_paths import get_path_from_json + import json + from nerfstudio.scripts.render import _render_trajectory_video + + assert self.z_export_options_camera_path_filename.value != "" + camera_path_filename = Path(self.z_export_options_camera_path_filename.value) + + with open(camera_path_filename, "r", encoding="utf-8") as f: + camera_path = json.load(f) + seconds = camera_path["seconds"] + camera_path = get_path_from_json(camera_path) + self.model.eval() + with torch.no_grad(): + _render_trajectory_video( + self, + camera_path, + output_filename=Path('render.mp4'), + rendered_output_names=['rgb'], + rendered_resolution_scaling_factor=1.0 , + seconds=seconds, + output_format="video", + ) + self.model.train() diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_interaction.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_interaction.py new file mode 100644 index 0000000000..6094e5aa52 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_interaction.py @@ -0,0 +1,146 @@ +"""Helper functions for interacting/visualization with GARField model.""" +from typing import List, Optional, Tuple, Union +import viser +import trimesh +import torch.nn as nn + +from nerfstudio.cameras.rays import RayBundle +from nerfstudio.field_components.field_heads import FieldHeadNames +from nerfstudio.model_components.losses import scale_gradients_by_distance_squared + +from nerfstudio.viewer.viewer_elements import * +from nerfstudio.viewer.viewer import VISER_NERFSTUDIO_SCALE_RATIO + +from garfield.garfield_model import GarfieldModel + +class GarfieldClickScene(nn.Module): + """UI for clicking on a scene (visualized as spheres). + This needs to be a nn.Module to allow the viewer to register callbacks. + """ + _click_handle: viser.GlbHandle + _box_handle: viser.GlbHandle + selected_location: np.ndarray + scale_handle: ViewerSlider # For getting the scale to query GARField + model_handle: List[GarfieldModel] # Store as list to avoid circular children + + def __init__( + self, + device: torch.device, + scale_handle: ViewerSlider, + model_handle: List[GarfieldModel] + ): + super().__init__() + self.add_click_button: ViewerButton = ViewerButton( + name="Click", cb_hook=self._add_click_cb + ) + self.del_click_button: ViewerButton = ViewerButton( + name="Reset Click", cb_hook=self._del_click_cb + ) + self.viewer_control: ViewerControl = ViewerControl() + + self.scale_handle = scale_handle + self.model_handle = model_handle + self.scale_handle.cb_hook = self._update_scale_vis + + self._click_handle = None + self._box_handle = None + self.selected_location = None + self.device = device + + def _add_click_cb(self, button: ViewerButton): + """Button press registers a click event, which will add a sphere. 
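+ The callback unregisters itself after firing, so each button press arms
+ exactly one click.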
+ Refer more to nerfstudio docs for more details. """ + self.add_click_button.set_disabled(True) + def del_handle_on_rayclick(click: ViewerClick): + self._on_rayclick(click) + self.add_click_button.set_disabled(False) + self.viewer_control.unregister_click_cb(del_handle_on_rayclick) + self.viewer_control.register_click_cb(del_handle_on_rayclick) + + def _on_rayclick(self, click: ViewerClick): + """On click, calculate the 3D position of the click and visualize it. + Also keep track of the selected location.""" + + origin = torch.tensor(click.origin).view(1, 3) + direction = torch.tensor(click.direction).view(1, 3) + + # get intersection + bundle = RayBundle( + origin, + direction, + torch.tensor(0.001).view(1, 1), + nears=torch.tensor(0.05).view(1, 1), + fars=torch.tensor(100).view(1, 1), + camera_indices=torch.tensor(0).view(1, 1), + ).to(self.device) + + # Get the distance/depth to the intersection --> calculate 3D position of the click + model = self.model_handle[0] + ray_samples, _, _ = model.proposal_sampler(bundle, density_fns=model.density_fns) + field_outputs = model.field.forward(ray_samples, compute_normals=model.config.predict_normals) + if model.config.use_gradient_scaling: + field_outputs = scale_gradients_by_distance_squared(field_outputs, ray_samples) + weights = ray_samples.get_weights(field_outputs[FieldHeadNames.DENSITY]) + with torch.no_grad(): + depth = model.renderer_depth(weights=weights, ray_samples=ray_samples) + distance = depth[0, 0].detach().cpu().numpy() + click_position = np.array(origin + direction * distance) * VISER_NERFSTUDIO_SCALE_RATIO + + # Update click visualization + self._del_click_cb(None) + sphere_mesh: trimesh.Trimesh = trimesh.creation.icosphere(radius=0.1) + sphere_mesh.vertices += click_position + sphere_mesh.visual.vertex_colors = (1.0, 0.0, 0.0, 1.0) # type: ignore + sphere_mesh_handle = self.viewer_control.viser_server.add_mesh_trimesh( + name=f"/hit_pos", mesh=sphere_mesh + ) + self._click_handle = sphere_mesh_handle + self.selected_location = np.array(origin + direction * distance) + self._update_scale_vis(self.scale_handle) + + def _del_click_cb(self, button: ViewerButton): + """Remove the click location and click visualizations.""" + if self._click_handle is not None: + self._click_handle.remove() + self._click_handle = None + if self._box_handle is not None: + self._box_handle.remove() + self._box_handle = None + self.selected_location = None + + def _update_scale_vis(self, slider: ViewerSlider): + """Update the scale visualization.""" + if self._box_handle is not None: + self._box_handle.remove() + self._box_handle = None + if self.selected_location is not None: + box_mesh = trimesh.creation.icosphere(radius=VISER_NERFSTUDIO_SCALE_RATIO*max(0.001, slider.value)/2, subdivision=0) + self._box_handle = self.viewer_control.viser_server.add_mesh_simple( + name=f"/hit_pos_box", + vertices=box_mesh.vertices, + faces=box_mesh.faces, + position=(self.selected_location * VISER_NERFSTUDIO_SCALE_RATIO).flatten(), + wireframe=True + ) + + def get_outputs(self, outputs: dict): + """Visualize affinity between the selected 3D point and the points visibl in current rendered view.""" + if self.selected_location is None: + return None + + location = self.selected_location + instance_scale = self.scale_handle.value + + # mimic the fields call + grouping_field = self.model_handle[0].grouping_field + positions = torch.tensor(location).view(1, 3).to(self.device) + positions = grouping_field.spatial_distortion(positions) + positions = (positions + 2.0) / 4.0 + 
xs = [e(positions.view(-1, 3)) for e in grouping_field.enc_list] + x = torch.concat(xs, dim=-1) + x = x / x.norm(dim=-1, keepdim=True) + instance_pass = grouping_field.get_mlp(x, torch.tensor([instance_scale]).to(self.device).view(1, 1)) + + return { + "instance_interact": torch.norm(outputs['instance'] - instance_pass.float(), p=2, dim=-1) + } \ No newline at end of file diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_model.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_model.py new file mode 100644 index 0000000000..82ac67cda6 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_model.py @@ -0,0 +1,284 @@ +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Any, Dict, List, Mapping, Tuple, Type, Literal + +import torch.nn as nn +from torch.nn import Parameter +import trimesh +import numpy as np +from torchtyping import TensorType + +import torch +import torch.nn.functional as F +from nerfstudio.cameras.rays import RayBundle, RaySamples +from nerfstudio.field_components.field_heads import FieldHeadNames +from nerfstudio.models.nerfacto import NerfactoModel, NerfactoModelConfig +from nerfstudio.viewer.viewer_elements import * +from nerfstudio.viewer.viewer import VISER_NERFSTUDIO_SCALE_RATIO +from nerfstudio.model_components.losses import scale_gradients_by_distance_squared + +from garfield.garfield_field import ( + GarfieldField, + GarfieldFieldConfig, +) + + +class FeatureRenderer(nn.Module): + """Render feature embeddings along a ray, where features are unit norm""" + + @classmethod + def forward( + cls, + embeds: TensorType["bs":..., "num_samples", "num_classes"], + weights: TensorType["bs":..., "num_samples", 1], + ) -> TensorType["bs":..., "num_classes"]: + """Calculate semantics along the ray.""" + output = torch.sum(weights * embeds, dim=-2) + output = output / torch.linalg.norm(output, dim=-1, keepdim=True) + return output + + +@dataclass +class GarfieldModelConfig(NerfactoModelConfig): + _target: Type = field(default_factory=lambda: GarfieldModel) + instance_field: GarfieldFieldConfig = field(default_factory=lambda: GarfieldFieldConfig()) + + max_grouping_scale: float = 2.0 + """Maximum scale to use for grouping supervision. Should be set during pipeline init.""" + + num_feat_samples: int = 24 + """Number of samples per ray to use for grouping supervision.""" + + use_hierarchy_losses: bool = True + use_single_scale: bool = False + """For ablation only. For full GARField, keep hierarchy=True and single_scale=False.""" + + +class GarfieldModel(NerfactoModel): + config: GarfieldModelConfig + grouping_field: GarfieldField + + def populate_modules(self): + super().populate_modules() + self.renderer_feat = FeatureRenderer() + self.config.instance_field.use_single_scale = self.config.use_single_scale + self.grouping_field = self.config.instance_field.setup() + + # Add a slider to the viewer to control the scale of the grouping field. + self.scale_slider = ViewerSlider("Scale", 0.0, 0.0, 2.0, 0.001) + + # Store reference to click interface for GARField. + # Note the List[GarfieldModel] is to avoid circular children. 
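+ # (Assigning an nn.Module attribute registers it as a child module, so a direct
+ # back-reference would make the model a submodule of its own member. A plain
+ # list sidesteps registration, e.g.:
+ # self.model_handle = [model] # plain attribute, not a child module
+ # self.model = model # would be registered as a child)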
+ from garfield.garfield_interaction import GarfieldClickScene + self.click_scene: GarfieldClickScene = GarfieldClickScene( + device=("cuda" if torch.cuda.is_available() else "cpu"), + scale_handle=self.scale_slider, + model_handle=[self] + ) + + def get_outputs(self, ray_bundle: RayBundle) -> Dict[str, TensorType]: + outputs = super().get_outputs(ray_bundle) + + if self.grouping_field.quantile_transformer is None: + # If scale statistics are not available, it's not possible to calculate grouping features. + return outputs + + # Recalculate ray samples and weights + # ... only if the model is in eval mode, where it should be no_grad(). + # If in training mode, `outputs` should already have calculated ray samples and weights. + # Without this if-block, camera optimizer? gradients? seem to get messed up. + ray_samples: RaySamples + if self.training: + ray_samples, weights = outputs["ray_samples_list"][-1], outputs["weights_list"][-1] + else: + ray_samples, weights_list, ray_samples_list = self.proposal_sampler(ray_bundle, density_fns=self.density_fns) + field_outputs = self.field.forward(ray_samples, compute_normals=self.config.predict_normals) + if self.config.use_gradient_scaling: + field_outputs = scale_gradients_by_distance_squared(field_outputs, ray_samples) + weights = ray_samples.get_weights(field_outputs[FieldHeadNames.DENSITY]) + + # Choose the top k samples with the highest weights, to be used for grouping. + # This is to decrease # of samples queried for grouping, while sampling close to the scene density. + def gather_fn(tens): + return torch.gather( + tens, -2, best_ids.expand(*best_ids.shape[:-1], tens.shape[-1]) + ) + + dataclass_fn = lambda dc: dc._apply_fn_to_fields(gather_fn, dataclass_fn) + grouping_weights, best_ids = torch.topk( + weights, self.config.num_feat_samples, dim=-2, sorted=False + ) + grouping_samples: RaySamples = ray_samples._apply_fn_to_fields( + gather_fn, dataclass_fn + ) + + # Define the scale for each sample. If the scale is not provided, use the selected scale. + # "scale" is included in ray_bundle.metadata only from training batches, but + # this would be good way to override the scale during inference. + if self.training and ("scale" in ray_bundle.metadata): + scales = ray_bundle.metadata["scale"] + instance_scales = scales.view(grouping_samples.shape[0], 1) + elif "scale" in ray_bundle.metadata: + scales = ray_bundle.metadata["scale"] + instance_scales = scales.view(grouping_samples.shape[0], 1) + else: + slider_value = self.scale_slider.value + instance_scales = ( + torch.ones(grouping_samples.shape[0], 1, device=self.device) + * slider_value + ) + + # Calculate features for the scale-conditioned grouping field. + # Hash values need to be included in the outputs for the loss calculation. + hash = self.grouping_field.get_hash(grouping_samples) + hash_rendered = self.renderer_feat( + embeds=hash, weights=grouping_weights.detach().half() + ) + if self.training: + outputs["instance_hash"] = hash_rendered # normalized! + outputs["instance"] = self.grouping_field.get_mlp(hash_rendered, instance_scales).float() + + # If a click point is available, calculate the affinity between the click point and the scene. 
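+ # ("instance_interact" is the L2 distance between each rendered pixel feature
+ # and the clicked point's feature at the slider scale; both are unit norm, so
+ # values lie in [0, 2] with small = likely same group.)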
+ click_output = self.click_scene.get_outputs(outputs) + if click_output is not None: + outputs.update(click_output) + + return outputs + + @torch.no_grad() + def get_grouping_at_points(self, positions: TensorType, scale: float) -> TensorType: + """Get the grouping features at a set of points, given a scale.""" + # Apply distortion, calculate hash values, then normalize + positions = self.grouping_field.spatial_distortion(positions) + positions = (positions + 2.0) / 4.0 + xs = [e(positions.view(-1, 3)) for e in self.grouping_field.enc_list] + x = torch.concat(xs, dim=-1) + x = x / x.norm(dim=-1, keepdim=True) + + # Calculate grouping features; create a scale tensor to match the batch size + instance_scale = torch.ones((x.shape[0], 1), device=self.device) * scale + return self.grouping_field.get_mlp(x, instance_scale) + + def get_loss_dict_group(self, outputs, batch, metrics_dict=None): + # loss_dict = super().get_loss_dict(outputs, batch, metrics_dict) + if not self.training: + return + + loss_dict = {} + margin = 1.0 + + #################################################################################### + # Calculate GT labels for the positive and negative pairs + #################################################################################### + # TODO(cmk) want to make this a little more efficient and cleaner + input_id1 = input_id2 = batch["mask_id"] + + # Expand labels + labels1_expanded = input_id1.unsqueeze(1).expand(-1, input_id1.shape[0]) + labels2_expanded = input_id2.unsqueeze(0).expand(input_id2.shape[0], -1) + + # Mask for positive/negative pairs across the entire matrix + mask_full_positive = labels1_expanded == labels2_expanded + mask_full_negative = ~mask_full_positive + + # Create a block mask to only consider pairs within the same image -- no cross-image pairs + chunk_size = batch["nPxImg"] # i.e., the number of rays per image + num_chunks = input_id1.shape[0] // chunk_size # i.e., # of images in the batch + block_mask = torch.kron( + torch.eye(num_chunks, device=self.device, dtype=bool), + torch.ones((chunk_size, chunk_size), device=self.device, dtype=bool), + ) # block-diagonal matrix, to consider only pairs within the same image + + # Only consider upper triangle to avoid double-counting + block_mask = torch.triu(block_mask, diagonal=0) + # Only consider pairs where both points are valid (-1 means not in mask / invalid) + block_mask = block_mask * (labels1_expanded != -1) * (labels2_expanded != -1) + + # Mask for diagonal elements (i.e., pairs of the same point). + # Don't consider these pairs for grouping supervision (pulling), since they are trivially similar. + diag_mask = torch.eye(block_mask.shape[0], device=self.device, dtype=bool) + + hash_rendered = outputs["instance_hash"] + scale = batch["scale"].view(-1, 1) + + #################################################################################### + # Grouping supervision + #################################################################################### + total_loss = 0 + + # 1. If (A, s_A) and (A', s_A) in same group, then supervise the features to be similar + # Note that `use_single_scale` (for ablation only) causes grouping_field to ignore the scale input. + instance = self.grouping_field.get_mlp(hash_rendered, scale) + mask = torch.where(mask_full_positive * block_mask * (~diag_mask)) + instance_loss_1 = torch.norm( + instance[mask[0]] - instance[mask[1]], p=2, dim=-1 + ).nansum() + total_loss += instance_loss_1 + + # 2. 
If ", then also supervise them to be similar at s > s_A + if self.config.use_hierarchy_losses and (not self.config.use_single_scale): + scale_diff = torch.max( + torch.zeros_like(scale), (self.config.max_grouping_scale - scale) + ) + larger_scale = scale + scale_diff * torch.rand( + size=(1,), device=scale.device + ) + instance = self.grouping_field.get_mlp(hash_rendered, larger_scale) + mask = torch.where(mask_full_positive * block_mask * (~diag_mask)) + instance_loss_2 = torch.norm( + instance[mask[0]] - instance[mask[1]], p=2, dim=-1 + ).nansum() + total_loss += instance_loss_2 + + # 4. Also supervising A, B to be dissimilar at scales s_A, s_B respectively seems to help. + instance = self.grouping_field.get_mlp(hash_rendered, scale) + mask = torch.where(mask_full_negative * block_mask) + instance_loss_4 = ( + F.relu( + margin - torch.norm(instance[mask[0]] - instance[mask[1]], p=2, dim=-1) + ) + ).nansum() + total_loss += instance_loss_4 + + loss_dict["instance_loss"] = total_loss / torch.sum(block_mask).float() + + return loss_dict + + def get_param_groups(self) -> Dict[str, List[Parameter]]: + param_groups = super().get_param_groups() + param_groups["garfield"] = list(self.grouping_field.parameters()) + return param_groups + + @torch.no_grad() + def get_outputs_for_camera_ray_bundle(self, camera_ray_bundle: RayBundle) -> Dict[str, torch.Tensor]: + """Takes in camera parameters and computes the output of the model. + This is the same as the base model's, but with a try/except in the case the shape is incorrect. + + Args: + camera_ray_bundle: ray bundle to calculate outputs over + """ + input_device = camera_ray_bundle.directions.device + num_rays_per_chunk = self.config.eval_num_rays_per_chunk + image_height, image_width = camera_ray_bundle.origins.shape[:2] + num_rays = len(camera_ray_bundle) + outputs_lists = defaultdict(list) + for i in range(0, num_rays, num_rays_per_chunk): + start_idx = i + end_idx = i + num_rays_per_chunk + ray_bundle = camera_ray_bundle.get_row_major_sliced_ray_bundle(start_idx, end_idx) + # move the chunk inputs to the model device + ray_bundle = ray_bundle.to(self.device) + outputs = self.forward(ray_bundle=ray_bundle) + for output_name, output in outputs.items(): # type: ignore + if not isinstance(output, torch.Tensor): + # TODO: handle lists of tensors as well + continue + # move the chunk outputs from the model device back to the device of the inputs. 
+ outputs_lists[output_name].append(output.to(input_device)) + outputs = {} + for output_name, outputs_list in outputs_lists.items(): + try: + outputs[output_name] = torch.cat(outputs_list).view(image_height, image_width, -1) # type: ignore + except: + pass + return outputs diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pipeline.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pipeline.py new file mode 100644 index 0000000000..bd5eceb3e2 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pipeline.py @@ -0,0 +1,186 @@ +import typing +from dataclasses import dataclass, field +from typing import Literal, Type, Mapping, Any + +import torch +from nerfstudio.pipelines.base_pipeline import VanillaPipeline, VanillaPipelineConfig +from torch.cuda.amp.grad_scaler import GradScaler + +import tqdm + +from sklearn.preprocessing import QuantileTransformer +from garfield.garfield_datamanager import GarfieldDataManagerConfig, GarfieldDataManager +from garfield.garfield_model import GarfieldModel, GarfieldModelConfig + + +@dataclass +class GarfieldPipelineConfig(VanillaPipelineConfig): + """Configuration for GARField pipeline instantiation""" + + _target: Type = field(default_factory=lambda: GarfieldPipeline) + """target class to instantiate""" + + datamanager: GarfieldDataManagerConfig = field(default_factory=lambda: GarfieldDataManagerConfig()) + model: GarfieldModelConfig = field(default_factory=lambda: GarfieldModelConfig()) + + start_grouping_step: int = 2000 + max_grouping_scale: float = 2.0 + num_rays_per_image: int = 256 + normalize_grouping_scale: bool = True + + +class GarfieldPipeline(VanillaPipeline): + config: GarfieldPipelineConfig + datamanager: GarfieldDataManager + model: GarfieldModel + + def __init__( + self, + config: GarfieldPipelineConfig, + device: str, + test_mode: Literal["test", "val", "inference"] = "val", + world_size: int = 1, + local_rank: int = 0, + grad_scaler: typing.Optional[GradScaler] = None, + ): + config.model.max_grouping_scale = config.max_grouping_scale + super().__init__( + config, + device, + test_mode, + world_size, + local_rank, + grad_scaler, + ) + + def get_train_loss_dict(self, step: int): + """In addition to the base class, we also calculate SAM masks + and their 3D scales at `start_grouping_step`.""" + if step == self.config.start_grouping_step: + loaded = self.datamanager.load_sam_data() + if not loaded: + self.populate_grouping_info() + else: + # Initialize grouping statistics. This will be automatically loaded from a checkpoint next time. 
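+ # (Storing the raw scale samples in an nn.Parameter places them in the
+ # pipeline's state_dict, so load_state_dict() can rebuild the quantile
+ # transformer from the checkpoint without re-running SAM.)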
+ scale_stats = self.datamanager.scale_3d_statistics + self.grouping_stats = torch.nn.Parameter(scale_stats) + self.model.grouping_field.quantile_transformer = ( + self._get_quantile_func(scale_stats) + ) + # Set the number of rays per image to the number of rays per image for grouping + pixel_sampler = self.datamanager.train_pixel_sampler + pixel_sampler.num_rays_per_image = pixel_sampler.config.num_rays_per_image + + ray_bundle, batch = self.datamanager.next_train(step) + if step >= self.config.start_grouping_step: + # also set the grouping info in the batch; in-place operation + self.datamanager.next_group(ray_bundle, batch) + + model_outputs = self._model( + ray_bundle + ) # train distributed data parallel model if world_size > 1 + + metrics_dict = self.model.get_metrics_dict(model_outputs, batch) + loss_dict = self.model.get_loss_dict(model_outputs, batch, metrics_dict) + if step >= self.config.start_grouping_step: + loss_dict.update( + self.model.get_loss_dict_group(model_outputs, batch, metrics_dict) + ) + + return model_outputs, loss_dict, metrics_dict + + def populate_grouping_info(self): + """ + Calculate groups from SAM and their 3D scales, and save them in the datamanager. + This information is required to supervise the grouping field. + """ + # Note that pipeline is in train mode here, via the base trainer. + self.model.eval() + + # Calculate multi-scale masks, and their 3D scales + scales_3d_list, pixel_level_keys_list, group_cdf_list = [], [], [] + train_cameras = self.datamanager.train_dataset.cameras + for i in tqdm.trange(len(train_cameras), desc="Calculating 3D masks"): + camera_ray_bundle = train_cameras.generate_rays(camera_indices=i).to( + self.device + ) + with torch.no_grad(): + outputs = self.model.get_outputs_for_camera_ray_bundle( + camera_ray_bundle + ) + + # Get RGB (for SAM mask generation), depth and 3D point locations (for 3D scale calculation) + rgb = self.datamanager.train_dataset[i]["image"] + depth = outputs["depth"] + points = camera_ray_bundle.origins + camera_ray_bundle.directions * depth + # Scales are capped to `max_grouping_scale` to filter noisy / outlier masks. + ( + pixel_level_keys, + scale_3d, + group_cdf, + ) = self.datamanager._calculate_3d_groups( + rgb, depth, points, max_scale=self.config.max_grouping_scale + ) + + pixel_level_keys_list.append(pixel_level_keys) + scales_3d_list.append(scale_3d) + group_cdf_list.append(group_cdf) + + # Save grouping data, and set it in the datamanager for current training. + # This will be cached, so we don't need to calculate it again. + self.datamanager.save_sam_data( + pixel_level_keys_list, scales_3d_list, group_cdf_list + ) + self.datamanager.pixel_level_keys = torch.nested.nested_tensor( + pixel_level_keys_list + ) + self.datamanager.scale_3d = torch.nested.nested_tensor(scales_3d_list) + self.datamanager.group_cdf = torch.nested.nested_tensor(group_cdf_list) + + # Initialize grouping statistics. This will be automatically loaded from a checkpoint next time. + self.grouping_stats = torch.nn.Parameter(torch.cat(scales_3d_list)) + self.model.grouping_field.quantile_transformer = self._get_quantile_func( + torch.cat(scales_3d_list) + ) + + # Turn model back to train mode + self.model.train() + + def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True): + """ + Same as the base class, but also loads the grouping statistics. + It's important to normalize the 3D scales as input to the grouping field. 
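+ (The quantile transform maps raw scales to a normal distribution, so the
+ scale-conditioned MLP sees a well-conditioned input; see _get_quantile_func.)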
+ """ + # Load 3D group scale statistics + grouping_stats = state_dict["grouping_stats"] + self.grouping_stats = torch.nn.Parameter(torch.zeros_like(grouping_stats)).to( + self.device + ) + # Calculate quantile transformer + self.model.grouping_field.quantile_transformer = self._get_quantile_func( + grouping_stats + ) + + return super().load_state_dict(state_dict, strict) + + def _get_quantile_func(self, scales: torch.Tensor, distribution="normal"): + """ + Use 3D scale statistics to normalize scales -- use quantile transformer. + """ + scales = scales.flatten() + scales = scales[(scales > 0) & (scales < self.config.max_grouping_scale)] + + scales = scales.detach().cpu().numpy() + + # Calculate quantile transformer + quantile_transformer = QuantileTransformer(output_distribution=distribution) + quantile_transformer = quantile_transformer.fit(scales.reshape(-1, 1)) + + def quantile_transformer_func(scales): + # This function acts as a wrapper for QuantileTransformer. + # QuantileTransformer expects a numpy array, while we have a torch tensor. + return torch.Tensor( + quantile_transformer.transform(scales.cpu().numpy()) + ).to(scales.device) + + return quantile_transformer_func diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pixel_sampler.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pixel_sampler.py new file mode 100644 index 0000000000..69deed66d8 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pixel_sampler.py @@ -0,0 +1,78 @@ +""" +Datamanager. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Optional, Type, Union + +import torch +from jaxtyping import Int +from torch import Tensor +from nerfstudio.data.pixel_samplers import ( + PixelSampler, + PixelSamplerConfig, +) +from rich.progress import Console + +CONSOLE = Console(width=120) + + +@dataclass +class GarfieldPixelSamplerConfig(PixelSamplerConfig): + _target: Type = field(default_factory=lambda: GarfieldPixelSampler) + num_rays_per_image: int = 256 # different from num_rays_per_batch + + +class GarfieldPixelSampler(PixelSampler): + def __init__(self, config: GarfieldPixelSamplerConfig, **kwargs): + self.num_rays_per_image = 1 # Start with 1 (i.e., no indices grouped by image. Will be updated later in pipeline) + super().__init__(config, **kwargs) + + def sample_method( + self, + batch_size: int, + num_images: int, + image_height: int, + image_width: int, + mask: Optional[Tensor] = None, + device: Union[torch.device, str] = "cpu", + ) -> Int[Tensor, "batch_size 3"]: + """ + Equivalent to PixelSampler, but with the following differences when `grouping_enabled` is True: + - `batch_size` is expected to be a multiple of `num_rays_per_image`. + - Indices are grouped by image, with `num_rays_per_image` rays per image. + [ + [image_0, x_0, y_0], [image_0, x_1, y_1], ..., [image_0, x_n, y_n], + [image_1, x_0, y_0], [image_1, x_1, y_1], ..., [image_1, x_n, y_n], + ... + ] + """ + if isinstance(mask, Tensor): + raise NotImplementedError( + "GarfieldPixelSampler does not support masks yet." + ) + + indices = super().sample_method( + batch_size, + num_images, + image_height, + image_width, + mask, + device, + ) + + if self.num_rays_per_image == 1: + return indices + + sub_bs = batch_size // (self.num_rays_per_image) + if (sub_bs * self.num_rays_per_image) != batch_size: + raise ValueError( + f"Batch size {batch_size} is not a multiple of num_rays_per_image {self.num_rays_per_image}." 
+ ) + + image_indices = torch.randint(low=0, high=num_images, size=(sub_bs,)) + indices[:, 0] = image_indices.repeat_interleave(self.num_rays_per_image) + + return indices diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/img_group_model.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/img_group_model.py new file mode 100644 index 0000000000..d527e7d0cd --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/img_group_model.py @@ -0,0 +1,102 @@ +""" +Quick wrapper for Segment Anything Model +""" + +from dataclasses import dataclass, field +from typing import Type, Union, Literal + +import torch +import numpy as np +from transformers import pipeline + +from PIL import Image + +from nerfstudio.configs import base_config as cfg + + +@dataclass +class ImgGroupModelConfig(cfg.InstantiateConfig): + _target: Type = field(default_factory=lambda: ImgGroupModel) + """target class to instantiate""" + model_type: Literal["sam_fb", "sam_hf", "maskformer"] = "sam_fb" + """ + Currently supports: + - "sam_fb": Original SAM model (from facebook github) + - "sam_hf": SAM model from huggingface + - "maskformer": MaskFormer model from huggingface (experimental) + """ + + sam_model_type: str = "" + sam_model_ckpt: str = "" + sam_kwargs: dict = field(default_factory=lambda: {}) + "Arguments for SAM model (fb)." + + # # Settings used for the paper: + # model_type="sam_fb", + # sam_model_type="vit_h", + # sam_model_ckpt="models/sam_vit_h_4b8939.pth", + # sam_kwargs={ + # "points_per_side": 32, # 32 in original + # "pred_iou_thresh": 0.90, + # "stability_score_thresh": 0.90, + # }, + + device: Union[torch.device, str] = ("cpu",) + + +class ImgGroupModel: + """ + Wrapper for 2D image segmentation models (e.g. MaskFormer, SAM) + Original paper uses SAM, but we can use any model that outputs masks. + The code currently assumes that every image has at least one group/mask. + """ + def __init__(self, config: ImgGroupModelConfig, **kwargs): + self.config = config + self.kwargs = kwargs + self.device = self.config.device = self.kwargs["device"] + self.model = None + + # also, assert that model_type doesn't have a "/" in it! Will mess with h5df. + assert "/" not in self.config.model_type, "model_type cannot have a '/' in it!" + + def __call__(self, img: np.ndarray): + # takes in range 0-255... 
HxWx3 + # For using huggingface transformer's SAM model + if self.config.model_type == "sam_hf": + if self.model is None: + self.model = pipeline("mask-generation", model="facebook/sam-vit-huge", device=self.device) + img = Image.fromarray(img) + masks = self.model(img, points_per_side=32, pred_iou_thresh=0.90, stability_score_thresh=0.90) + masks = masks['masks'] + masks = sorted(masks, key=lambda x: x.sum()) + return masks + + elif self.config.model_type == "sam_fb": + # For using the original SAM model + if self.model is None: + from segment_anything import SamAutomaticMaskGenerator, sam_model_registry + registry = sam_model_registry[self.config.sam_model_type] + model = registry(checkpoint=self.config.sam_model_ckpt) + model = model.to(device=self.config.device) + self.model = SamAutomaticMaskGenerator( + model=model, **self.config.sam_kwargs + ) + masks = self.model.generate(img) + masks = [m['segmentation'] for m in masks] # already as bool + masks = sorted(masks, key=lambda x: x.sum()) + return masks + + elif self.config.model_type == "maskformer": + # For using another model (e.g., MaskFormer) + if self.model is None: + self.model = pipeline(model="facebook/maskformer-swin-large-coco", device=self.device) + img = Image.fromarray(img) + masks = self.model(img) + masks = [ + (np.array(m['mask']) != 0) + for m in masks + ] + masks = sorted(masks, key=lambda x: x.sum()) + return masks + + raise NotImplementedError(f"Model type {self.config.model_type} not implemented") diff --git a/fvdb/projects/panoptic_segmentation/garfield/pyproject.toml b/fvdb/projects/panoptic_segmentation/garfield/pyproject.toml new file mode 100644 index 0000000000..a79e770627 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "garfield" +version = "0.1.0" + +dependencies=[ + "nerfstudio>=1.0.0", + "transformers", + "gsplat", + "trimesh", + "viser", + "torch", + "scikit-learn", + "torchtyping", + # "cuml", +] + +[tool.setuptools.packages.find] +include = ["garfield"] + +[project.entry-points.'nerfstudio.method_configs'] +garfield = 'garfield.garfield_config:garfield_method' +garfield_gauss = 'garfield.garfield_config:garfield_gauss_method' From d2d4e13971826ea3911307c5d8ae5073ddeed8d1 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Fri, 20 Dec 2024 18:35:16 +1300 Subject: [PATCH 36/59] Add a garfield CUDA 12.1 environment Signed-off-by: Jonathan Swartz --- .../garfield/garfield_environment.yml | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml b/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml new file mode 100644 index 0000000000..651d195945 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml @@ -0,0 +1,57 @@ +name: fvdb_garfield +channels: + - pytorch + - nvidia/label/cuda-12.1.0 + - rapidsai + - conda-forge + - nodefaults +dependencies: + - python=3.10 + - pytorch::pytorch=2.4.0 + - pytorch::pytorch-cuda=12.1 + - pytorch::pytorch-mutex=*=cuda + - cuda-toolkit + - cuda-compiler + - cuda-nvcc=12.1 + - cuda-cccl=12.1 + - cuda-libraries-static + # specifically need these 12.1.1 versions of cudart + # because of awkward overwriting with conda-forge versions that get picked up + - nvidia/label/cuda-12.1.1::cuda-cudart-static + - nvidia/label/cuda-12.1.1::cuda-cudart + - nvidia/label/cuda-12.1.1::cuda-cudart-dev + 
- gcc_linux-64=11 + - gxx_linux-64=11 + - cxx-compiler + - pip + - git + - gitpython + - ipython + - tqdm + - numpy<2 + - tyro + ## nerfstudio + - tensorboard + - torchvision + - open3d>=0.16.0 + - transformers + - trimesh + - ninja + - fastai::opencv-python-headless + - imageio + - torchmetrics + - protobuf + ## garfield + - scikit-learn + - rapidsai::cuml + - rapidsai::libcumlprims + - pip: + ## nerfstudio + # NOTE: have to build tiny-cuda-nn and nerfacc from source for CUDA 12 + - git+https://github.com/nerfstudio-project/nerfacc.git@v0.5.2 + - git+https://github.com/swahtz/tiny-cuda-nn/@cuda_libdir_fix#subdirectory=bindings/torch + - gsplat + - viser + - nerfstudio>=1.0.0 + ## garfield + - torchtyping From d560caea5cea299ab783ecd925315268cd50bf4f Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Mon, 23 Dec 2024 18:59:43 +1300 Subject: [PATCH 37/59] Fixing garfield black formatting; adding garfield to SPDX ignore-paths Signed-off-by: Jonathan Swartz --- .github/workflows/fvdb_codestyle.yml | 1 + .../garfield/garfield/garfield_config.py | 63 ++---- .../garfield/garfield/garfield_datamanager.py | 52 ++--- .../garfield/garfield/garfield_field.py | 12 +- .../garfield/garfield_gaussian_pipeline.py | 210 ++++++++++-------- .../garfield/garfield/garfield_interaction.py | 50 ++--- .../garfield/garfield/garfield_model.py | 85 +++---- .../garfield/garfield/garfield_pipeline.py | 60 ++--- .../garfield/garfield_pixel_sampler.py | 15 +- .../garfield/garfield/img_group_model.py | 36 ++- 10 files changed, 249 insertions(+), 335 deletions(-) diff --git a/.github/workflows/fvdb_codestyle.yml b/.github/workflows/fvdb_codestyle.yml index 525b67c98b..c5a5e7af2e 100644 --- a/.github/workflows/fvdb_codestyle.yml +++ b/.github/workflows/fvdb_codestyle.yml @@ -78,3 +78,4 @@ jobs: openvdb/openvdb/math/Half.h openvdb_wolfram/OpenVDBLink openvdb_ax/openvdb_ax/grammar/generated + fvdb/projects/panoptic_segmentation/garfield/garfield diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_config.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_config.py index 7cd1c2f9b2..09d748559b 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_config.py +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_config.py @@ -1,27 +1,25 @@ +from garfield.garfield_datamanager import GarfieldDataManagerConfig +from garfield.garfield_field import GarfieldFieldConfig +from garfield.garfield_gaussian_pipeline import GarfieldGaussianPipelineConfig +from garfield.garfield_model import GarfieldModelConfig +from garfield.garfield_pipeline import GarfieldPipelineConfig +from garfield.garfield_pixel_sampler import GarfieldPixelSamplerConfig +from garfield.img_group_model import ImgGroupModelConfig from nerfstudio.configs.base_config import ViewerConfig -from nerfstudio.data.dataparsers.nerfstudio_dataparser import NerfstudioDataParserConfig -from nerfstudio.engine.optimizers import AdamOptimizerConfig -from nerfstudio.engine.schedulers import ExponentialDecaySchedulerConfig -from nerfstudio.plugins.types import MethodSpecification -from nerfstudio.pipelines.base_pipeline import VanillaPipelineConfig from nerfstudio.data.datamanagers.base_datamanager import VanillaDataManagerConfig -from nerfstudio.engine.trainer import TrainerConfig # For Gaussian Splatting from nerfstudio.data.datamanagers.full_images_datamanager import ( FullImageDatamanagerConfig, ) -from nerfstudio.models.splatfacto import SplatfactoModelConfig from 
nerfstudio.data.dataparsers.colmap_dataparser import ColmapDataParserConfig - -from garfield.garfield_pipeline import GarfieldPipelineConfig -from garfield.garfield_datamanager import GarfieldDataManagerConfig -from garfield.garfield_pixel_sampler import GarfieldPixelSamplerConfig -from garfield.garfield_model import GarfieldModelConfig -from garfield.garfield_field import GarfieldFieldConfig -from garfield.img_group_model import ImgGroupModelConfig -from garfield.garfield_gaussian_pipeline import GarfieldGaussianPipelineConfig - +from nerfstudio.data.dataparsers.nerfstudio_dataparser import NerfstudioDataParserConfig +from nerfstudio.engine.optimizers import AdamOptimizerConfig +from nerfstudio.engine.schedulers import ExponentialDecaySchedulerConfig +from nerfstudio.engine.trainer import TrainerConfig +from nerfstudio.models.splatfacto import SplatfactoModelConfig +from nerfstudio.pipelines.base_pipeline import VanillaPipelineConfig +from nerfstudio.plugins.types import MethodSpecification garfield_method = MethodSpecification( config=TrainerConfig( @@ -41,17 +39,13 @@ num_rays_per_image=256, # 4096/256 = 16 images per batch ), img_group_model=ImgGroupModelConfig( - model_type="sam_hf", + model_type="sam_hf", # Can choose out of "sam_fb", "sam_hf", "maskformer" - # Used sam_fb for the paper, see `img_group_model.py`. + # Used sam_fb for the paper, see `img_group_model.py`. device="cuda", ), ), - model=GarfieldModelConfig( - instance_field=GarfieldFieldConfig( - n_instance_dims=256 # 256 in original - ) - ), + model=GarfieldModelConfig(instance_field=GarfieldFieldConfig(n_instance_dims=256)), # 256 in original ), optimizers={ "proposal_networks": { @@ -60,24 +54,16 @@ }, "fields": { "optimizer": AdamOptimizerConfig(lr=1e-2, eps=1e-15), - "scheduler": ExponentialDecaySchedulerConfig( - lr_final=1e-3, max_steps=30000 - ), + "scheduler": ExponentialDecaySchedulerConfig(lr_final=1e-3, max_steps=30000), }, "garfield": { - "optimizer": AdamOptimizerConfig( - lr=1e-4, eps=1e-15, weight_decay=1e-6, max_norm=1.0 - ), + "optimizer": AdamOptimizerConfig(lr=1e-4, eps=1e-15, weight_decay=1e-6, max_norm=1.0), # TODO the warmup_steps == pipeline.start_grouping_step, but would be good to not hardcode it - "scheduler": ExponentialDecaySchedulerConfig( - lr_final=1e-5, max_steps=10000, warmup_steps=2000 - ), + "scheduler": ExponentialDecaySchedulerConfig(lr_final=1e-5, max_steps=10000, warmup_steps=2000), }, "camera_opt": { "optimizer": AdamOptimizerConfig(lr=1e-3, eps=1e-15), - "scheduler": ExponentialDecaySchedulerConfig( - lr_final=1e-4, max_steps=5000 - ), + "scheduler": ExponentialDecaySchedulerConfig(lr_final=1e-4, max_steps=5000), }, }, viewer=ViewerConfig(num_rays_per_chunk=1 << 15), @@ -92,11 +78,10 @@ steps_per_eval_image=100, steps_per_eval_batch=100, steps_per_save=2000, - steps_per_eval_all_images=100000, + steps_per_eval_all_images=100000, max_num_iterations=30000, mixed_precision=False, - gradient_accumulation_steps = {'camera_opt': 100,'color':10,'shs':10}, - + gradient_accumulation_steps={"camera_opt": 100, "color": 10, "shs": 10}, pipeline=GarfieldGaussianPipelineConfig( datamanager=FullImageDatamanagerConfig( dataparser=NerfstudioDataParserConfig(load_3D_points=True), @@ -140,4 +125,4 @@ vis="viewer", ), description="anythingnerf with gauss", -) \ No newline at end of file +) diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_datamanager.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_datamanager.py index 807bf47392..c527b1f42b 100644 --- 
a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_datamanager.py +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_datamanager.py @@ -7,29 +7,27 @@ from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union -from typing_extensions import TypeVar import torch from nerfstudio.cameras.rays import RayBundle from nerfstudio.data.datasets.base_dataset import InputDataset from rich.progress import Console +from typing_extensions import TypeVar CONSOLE = Console(width=120) -import h5py import os import os.path as osp - +import h5py import numpy as np +from garfield.garfield_pixel_sampler import GarfieldPixelSampler +from garfield.img_group_model import ImgGroupModel, ImgGroupModelConfig from nerfstudio.data.datamanagers.base_datamanager import ( VanillaDataManager, VanillaDataManagerConfig, ) -from garfield.img_group_model import ImgGroupModelConfig, ImgGroupModel -from garfield.garfield_pixel_sampler import GarfieldPixelSampler - @dataclass class GarfieldDataManagerConfig(VanillaDataManagerConfig): @@ -96,9 +94,7 @@ def load_sam_data(self) -> bool: num_entries = len(sam_data["pixel_level_keys"].keys()) for i in range(num_entries): - pixel_level_keys_list.append( - torch.from_numpy(sam_data["pixel_level_keys"][str(i)][...]) - ) + pixel_level_keys_list.append(torch.from_numpy(sam_data["pixel_level_keys"][str(i)][...])) self.pixel_level_keys = torch.nested.nested_tensor(pixel_level_keys_list) del pixel_level_keys_list @@ -140,9 +136,7 @@ def create_pixel_mask_array(masks: torch.Tensor): """ max_masks = masks.sum(dim=0).max().item() image_shape = masks.shape[1:] - pixel_mask_array = torch.full( - (max_masks, image_shape[0], image_shape[1]), -1, dtype=torch.int - ).to(masks.device) + pixel_mask_array = torch.full((max_masks, image_shape[0], image_shape[1]), -1, dtype=torch.int).to(masks.device) for m, mask in enumerate(masks): mask_clone = mask.clone() @@ -182,13 +176,9 @@ def helper_return_no_masks(): # Fail gracefully when no masks are found. # Create dummy data (all -1s), which will be ignored later. # See: `get_loss_dict_group` in `garfield_model.py` - pixel_level_keys = torch.full( - (image_shape[0], image_shape[1], 1), -1, dtype=torch.int - ) + pixel_level_keys = torch.full((image_shape[0], image_shape[1], 1), -1, dtype=torch.int) scale = torch.Tensor([0.0]).view(-1, 1) - mask_cdf = torch.full( - (image_shape[0], image_shape[1], 1), 1, dtype=torch.float - ) + mask_cdf = torch.full((image_shape[0], image_shape[1], 1), 1, dtype=torch.float) return (pixel_level_keys, scale, mask_cdf) # Calculate SAM masks @@ -235,9 +225,7 @@ def helper_return_no_masks(): # Calculate "pixel level keys", which is a 2D array of shape (H, W, max_masks) # Each pixel has a list of group indices that it belongs to, in order of increasing scale. - pixel_level_keys = self.create_pixel_mask_array( - sam_mask - ).long() # (H, W, max_masks) + pixel_level_keys = self.create_pixel_mask_array(sam_mask).long() # (H, W, max_masks) # Calculate group sampling CDF, to bias sampling towards smaller groups # Be careful to not include -1s in the CDF (padding, or unlabeled pixels) @@ -247,15 +235,11 @@ def helper_return_no_masks(): mask_inds, counts = mask_inds[mask_sorted], counts[mask_sorted] counts[0] = 0 # don't include -1 probs = counts / counts.sum() # [-1, 0, ...] 
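 # A note on the CDF being assembled here: `probs` is the per-group frequency
 # over this image; gathering it at each pixel's scale-sorted group list,
 # taking logs, and renormalizing per pixel gives rarer (smaller) groups more
 # weight, and the cumulative sum becomes the per-pixel CDF inverted later via
 # (u > cdf[x, y]).sum(-1) with a single uniform sample u per ray.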
- mask_probs = torch.gather(probs, 0, pixel_level_keys.reshape(-1) + 1).view( - pixel_level_keys.shape - ) + mask_probs = torch.gather(probs, 0, pixel_level_keys.reshape(-1) + 1).view(pixel_level_keys.shape) mask_log_probs = torch.log(mask_probs) never_masked = mask_log_probs.isinf() mask_log_probs[never_masked] = 0.0 - mask_log_probs = mask_log_probs / ( - mask_log_probs.sum(dim=-1, keepdim=True) + 1e-6 - ) + mask_log_probs = mask_log_probs / (mask_log_probs.sum(dim=-1, keepdim=True) + 1e-6) mask_cdf = torch.cumsum(mask_log_probs, dim=-1) mask_cdf[never_masked] = 1.0 @@ -289,12 +273,9 @@ def next_group(self, ray_bundle: RayBundle, batch: Dict[str, Any]): img_idx = img_ind[i] # Use `random_vec` to choose a group for each pixel. - per_pixel_index = self.pixel_level_keys[img_idx][ - x_ind[i : i + npximg], y_ind[i : i + npximg] - ] + per_pixel_index = self.pixel_level_keys[img_idx][x_ind[i : i + npximg], y_ind[i : i + npximg]] random_index = torch.sum( - random_vec_sampling.view(-1, 1) - > self.group_cdf[img_idx][x_ind[i : i + npximg], y_ind[i : i + npximg]], + random_vec_sampling.view(-1, 1) > self.group_cdf[img_idx][x_ind[i : i + npximg], y_ind[i : i + npximg]], dim=-1, ) @@ -304,9 +285,7 @@ def next_group(self, ray_bundle: RayBundle, batch: Dict[str, Any]): if per_pixel_index.shape[-1] == 1: per_pixel_mask = per_pixel_index.squeeze() else: - per_pixel_mask = torch.gather( - per_pixel_index, 1, random_index.unsqueeze(-1) - ).squeeze() + per_pixel_mask = torch.gather(per_pixel_index, 1, random_index.unsqueeze(-1)).squeeze() per_pixel_mask_ = torch.gather( per_pixel_index, 1, @@ -318,8 +297,7 @@ def next_group(self, ray_bundle: RayBundle, batch: Dict[str, Any]): # interval scale supervision curr_scale = self.scale_3d[img_idx][per_pixel_mask] curr_scale[random_index == 0] = ( - self.scale_3d[img_idx][per_pixel_mask][random_index == 0] - * random_vec_densify[random_index == 0] + self.scale_3d[img_idx][per_pixel_mask][random_index == 0] * random_vec_densify[random_index == 0] ) for j in range(1, self.group_cdf[img_idx].shape[-1]): if (random_index == j).sum() == 0: diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_field.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_field.py index 71f7e7e433..1c4f36b647 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_field.py +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_field.py @@ -1,6 +1,6 @@ -from enum import Enum -from typing import Dict, List, Optional, Tuple, Callable, Any, Type from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Callable, Dict, List, Optional, Tuple, Type import numpy as np import torch @@ -66,9 +66,7 @@ def __init__( # This is a trick to make the hashgrid encoding work with the TCNN library. self.enc_list = torch.nn.ModuleList( [ - self._get_encoding( - hashgrid_cfg["resolution_range"][i], hashgrid_cfg["level"][i] - ) + self._get_encoding(hashgrid_cfg["resolution_range"][i], hashgrid_cfg["level"][i]) for i in range(len(hashgrid_cfg["level"])) ] ) @@ -90,9 +88,7 @@ def __init__( self.quantile_transformer = None # for scale normalization @staticmethod - def _get_encoding( - res_range: Tuple[int, int], levels: int, indim=3, hash_size=19 - ) -> tcnn.Encoding: + def _get_encoding(res_range: Tuple[int, int], levels: int, indim=3, hash_size=19) -> tcnn.Encoding: """ Helper function to create a HashGrid encoding. 
""" diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_gaussian_pipeline.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_gaussian_pipeline.py index c4bcc21772..c1497da5c8 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_gaussian_pipeline.py +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_gaussian_pipeline.py @@ -1,35 +1,30 @@ +import time import typing from dataclasses import dataclass, field -from typing import Literal, Type, Mapping, Any, Optional, List, Dict -from torchtyping import TensorType from pathlib import Path +from typing import Any, Dict, List, Literal, Mapping, Optional, Type + +import cv2 +import open3d as o3d +import torch +import tqdm import trimesh import viser import viser.transforms as vtf -import open3d as o3d -import cv2 -import time - -import torch +from cuml.cluster.hdbscan import HDBSCAN +from garfield.garfield_datamanager import GarfieldDataManager, GarfieldDataManagerConfig +from garfield.garfield_model import GarfieldModel, GarfieldModelConfig +from garfield.garfield_pipeline import GarfieldPipeline, GarfieldPipelineConfig +from nerfstudio.models.splatfacto import RGB2SH, SplatfactoModel from nerfstudio.pipelines.base_pipeline import VanillaPipeline, VanillaPipelineConfig -from torch.cuda.amp.grad_scaler import GradScaler -from nerfstudio.viewer.viewer_elements import * from nerfstudio.viewer.viewer import VISER_NERFSTUDIO_SCALE_RATIO -from nerfstudio.models.splatfacto import SplatfactoModel - -from cuml.cluster.hdbscan import HDBSCAN -from nerfstudio.models.splatfacto import RGB2SH - -import tqdm - -from sklearn.preprocessing import QuantileTransformer -from sklearn.neighbors import NearestNeighbors - +from nerfstudio.viewer.viewer_elements import * from scipy.spatial.transform import Rotation as Rot +from sklearn.neighbors import NearestNeighbors +from sklearn.preprocessing import QuantileTransformer +from torch.cuda.amp.grad_scaler import GradScaler +from torchtyping import TensorType -from garfield.garfield_datamanager import GarfieldDataManagerConfig, GarfieldDataManager -from garfield.garfield_model import GarfieldModel, GarfieldModelConfig -from garfield.garfield_pipeline import GarfieldPipelineConfig, GarfieldPipeline def quat_to_rotmat(quat): assert quat.shape[-1] == 4, quat.shape @@ -50,6 +45,7 @@ def quat_to_rotmat(quat): ) return mat.reshape(quat.shape[:-1] + (3, 3)) + def generate_random_colors(N=5000) -> torch.Tensor: """Generate random colors for visualization""" hs = np.random.uniform(0, 1, size=(N, 1)) @@ -64,6 +60,7 @@ def generate_random_colors(N=5000) -> torch.Tensor: @dataclass class GarfieldGaussianPipelineConfig(VanillaPipelineConfig): """Gaussian Splatting, but also loading GARField grouping field from ckpt.""" + _target: Type = field(default_factory=lambda: GarfieldGaussianPipeline) garfield_ckpt: Optional[Path] = None # Need to specify this @@ -77,6 +74,7 @@ class GarfieldGaussianPipeline(VanillaPipeline): Note that the pipeline training must be stopped before you can interact with the scene!! 
""" + model: SplatfactoModel garfield_pipeline: List[GarfieldPipeline] # To avoid importing Viewer* from nerf pipeline state_stack: List[Dict[str, TensorType]] # To revert to previous state @@ -100,9 +98,8 @@ def __init__( print("Loading instance feature model...") assert config.garfield_ckpt is not None, "Need to specify garfield checkpoint" from nerfstudio.utils.eval_utils import eval_setup - _, garfield_pipeline, _, _ = eval_setup( - config.garfield_ckpt, test_mode="inference" - ) + + _, garfield_pipeline, _, _ = eval_setup(config.garfield_ckpt, test_mode="inference") self.garfield_pipeline = [garfield_pipeline] self.state_stack = [] @@ -114,7 +111,7 @@ def __init__( "Interaction Method", default_value="Interactive", options=["Interactive", "Clustering"], - cb_hook=self._update_interaction_method + cb_hook=self._update_interaction_method, ) self.click_gaussian = ViewerButton(name="Click", cb_hook=self._click_gaussian) @@ -122,31 +119,53 @@ def __init__( self.click_handle = None self.crop_to_click = ViewerButton(name="Crop to Click", cb_hook=self._crop_to_click, disabled=True) - self.crop_to_group_level = ViewerSlider(name="Group Level", min_value=0, max_value=29, step=1, default_value=0, cb_hook=self._update_crop_vis, disabled=True) + self.crop_to_group_level = ViewerSlider( + name="Group Level", + min_value=0, + max_value=29, + step=1, + default_value=0, + cb_hook=self._update_crop_vis, + disabled=True, + ) self.crop_group_list = [] self.move_current_crop = ViewerButton(name="Drag Current Crop", cb_hook=self._drag_current_crop, disabled=True) self.crop_transform_handle = None - self.cluster_scene = ViewerButton(name="Cluster Scene", cb_hook=self._cluster_scene, disabled=False, visible=False) - self.cluster_scene_scale = ViewerSlider(name="Cluster Scale", min_value=0.0, max_value=2.0, step=0.01, default_value=0.0, disabled=False, visible=False) - self.cluster_scene_shuffle_colors = ViewerButton(name="Reshuffle Cluster Colors", cb_hook=self._reshuffle_cluster_colors, disabled=False, visible=False) + self.cluster_scene = ViewerButton( + name="Cluster Scene", cb_hook=self._cluster_scene, disabled=False, visible=False + ) + self.cluster_scene_scale = ViewerSlider( + name="Cluster Scale", + min_value=0.0, + max_value=2.0, + step=0.01, + default_value=0.0, + disabled=False, + visible=False, + ) + self.cluster_scene_shuffle_colors = ViewerButton( + name="Reshuffle Cluster Colors", cb_hook=self._reshuffle_cluster_colors, disabled=False, visible=False + ) self.cluster_labels = None self.reset_state = ViewerButton(name="Reset State", cb_hook=self._reset_state, disabled=True) - self.z_export_options = ViewerCheckbox(name="Export Options", default_value=False, cb_hook=self._update_export_options) + self.z_export_options = ViewerCheckbox( + name="Export Options", default_value=False, cb_hook=self._update_export_options + ) self.z_export_options_visible_gaussians = ViewerButton( - name="Export Visible Gaussians", - visible=False, - cb_hook=self._export_visible_gaussians - ) + name="Export Visible Gaussians", visible=False, cb_hook=self._export_visible_gaussians + ) self.z_export_options_camera_path_filename = ViewerText("Camera Path Filename", "", visible=False) - self.z_export_options_camera_path_render = ViewerButton("Render Current Pipeline", cb_hook=self.render_from_path, visible=False) + self.z_export_options_camera_path_render = ViewerButton( + "Render Current Pipeline", cb_hook=self.render_from_path, visible=False + ) def _update_interaction_method(self, dropdown: ViewerDropdown): """Update 
the UI based on the interaction method""" - hide_in_interactive = (not (dropdown.value == "Interactive")) # i.e., hide if in interactive mode + hide_in_interactive = not (dropdown.value == "Interactive") # i.e., hide if in interactive mode self.cluster_scene.set_hidden((not hide_in_interactive)) self.cluster_scene_scale.set_hidden((not hide_in_interactive)) @@ -194,12 +213,14 @@ def _reset_state(self, button: ViewerButton): def _queue_state(self): """Save current state to stack""" import copy - self.state_stack.append(copy.deepcopy({k:v.detach() for k,v in self.model.gauss_params.items()})) + + self.state_stack.append(copy.deepcopy({k: v.detach() for k, v in self.model.gauss_params.items()})) self.reset_state.set_disabled(False) def _click_gaussian(self, button: ViewerButton): """Start listening for click-based 3D point specification. Refer to garfield_interaction.py for more details.""" + def del_handle_on_rayclick(click: ViewerClick): self._on_rayclick(click) self.click_gaussian.set_disabled(False) @@ -248,7 +269,7 @@ def _crop_to_click(self, button: ViewerButton): assert self.click_location is not None, "Need to specify click location" self._queue_state() # Save current state - curr_means = self.model.gauss_params['means'].detach() + curr_means = self.model.gauss_params["means"].detach() self.model.eval() # The only way to reset is to reset the state using the reset button. @@ -286,7 +307,7 @@ def _crop_to_click(self, button: ViewerButton): keeps = torch.where(affinity < 0.5)[0].cpu() keep_points = points.select_by_index(keeps.tolist()) # indices of gaussians - # Here, we desire the gaussian groups to be grouped tightly together spatially. + # Here, we desire the gaussian groups to be grouped tightly together spatially. # We use DBSCAN to group the gaussians together, and choose the cluster that contains the click point. # Note that there may be spuriously high affinity between points that are spatially far apart, # possibly due two different groups being considered together at an odd angle / far viewpoint. 
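For readers following this crop-to-click logic, a minimal stand-alone sketch of the spatial grouping step may help: run DBSCAN over the kept high-affinity points and keep only the cluster nearest the click. It assumes `keep_points` (an open3d point cloud) and `click_location` (a 3-vector) as in the surrounding code, and mirrors the eps/min_points values used in the next hunk; `cluster_containing_click` is an illustrative name, not part of the patch.

    import numpy as np
    import open3d as o3d

    def cluster_containing_click(keep_points, click_location, eps=0.02, min_points=5):
        # Label every point with a DBSCAN cluster id (-1 marks noise).
        labels = np.asarray(keep_points.cluster_dbscan(eps=eps, min_points=min_points))
        pts = np.asarray(keep_points.points)
        # Keep the cluster whose member lies nearest to the clicked location.
        nearest = np.linalg.norm(pts - np.asarray(click_location).reshape(1, 3), axis=1).argmin()
        return np.where(labels == labels[nearest])[0]  # indices into keep_points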
@@ -309,9 +330,9 @@ def _crop_to_click(self, button: ViewerButton): curr_points_ds_selected[curr_points_ds_ids] = True _clusters = np.asarray(curr_points_ds.cluster_dbscan(eps=0.02, min_points=5)) - nn_model = NearestNeighbors( - n_neighbors=1, algorithm="auto", metric="euclidean" - ).fit(np.asarray(curr_points_ds.points)) + nn_model = NearestNeighbors(n_neighbors=1, algorithm="auto", metric="euclidean").fit( + np.asarray(curr_points_ds.points) + ) _, indices = nn_model.kneighbors(np.asarray(keep_points.points)[~curr_points_ds_selected]) @@ -356,7 +377,7 @@ def _crop_to_click(self, button: ViewerButton): self.click_location = None self.click_handle.remove() self.click_handle = None - + self.crop_group_list = keep_list self.crop_to_group_level.set_disabled(False) self.crop_to_group_level.value = 29 @@ -369,7 +390,7 @@ def _update_crop_vis(self, number: ViewerSlider): return if len(self.state_stack) == 0: return - + # Clamp the number to be within the range of possible crops if number.value > len(self.crop_group_list) - 1: number.value = len(self.crop_group_list) - 1 @@ -387,11 +408,11 @@ def _drag_current_crop(self, button: ViewerButton): """Add a transform control to the current scene, and update the model accordingly.""" self.crop_to_group_level.set_disabled(True) # Disable user from changing crop self.move_current_crop.set_disabled(True) # Disable user from creating another drag handle - - scene_centroid = self.model.gauss_params['means'].detach().mean(dim=0) + + scene_centroid = self.model.gauss_params["means"].detach().mean(dim=0) self.crop_transform_handle = self.viewer_control.viser_server.add_transform_controls( name=f"/scene_transform", - position=(VISER_NERFSTUDIO_SCALE_RATIO*scene_centroid).cpu().numpy(), + position=(VISER_NERFSTUDIO_SCALE_RATIO * scene_centroid).cpu().numpy(), ) # Visualize the whole scene -- the points corresponding to the crop will be controlled by the transform handle. 
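The drag callback reformatted in the next hunk rigidly re-poses the cropped gaussians from the viser handle pose. As a sketch, an equivalent quaternion-only formulation, assuming wxyz quaternions and the quat_to_rotmat helper defined near the top of this file (`drag_crop` and `quat_mul` are illustrative names, not the patched implementation):

    import torch

    def quat_mul(q, p):
        # Hamilton product of wxyz quaternions, broadcasting over leading dims.
        w1, v1 = q[..., :1], q[..., 1:]
        w2, v2 = p[..., :1], p[..., 1:]
        w = w1 * w2 - (v1 * v2).sum(-1, keepdim=True)
        v = w1 * v2 + w2 * v1 + torch.cross(v1, v2, dim=-1)
        return torch.cat([w, v], dim=-1)

    def drag_crop(means, quats_wxyz, handle_quat_wxyz, handle_pos):
        # Rotate the crop about its centroid, then translate it to the handle.
        centroid = means.mean(dim=0)
        R = quat_to_rotmat(handle_quat_wxyz)
        new_means = handle_pos + (means - centroid) @ R.T
        # Composing quaternions directly avoids the rotmat/scipy/xyzw round trip.
        new_quats = quat_mul(handle_quat_wxyz.expand_as(quats_wxyz), quats_wxyz)
        return new_means, new_quats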
@@ -400,8 +421,8 @@ def _drag_current_crop(self, button: ViewerButton): for name in self.model.gauss_params.keys(): self.model.gauss_params[name] = prev_state[name].clone() - curr_means = self.model.gauss_params['means'].clone().detach() - curr_rotmats = quat_to_rotmat(self.model.gauss_params['quats'][crop_inds].detach()) + curr_means = self.model.gauss_params["means"].clone().detach() + curr_rotmats = quat_to_rotmat(self.model.gauss_params["quats"][crop_inds].detach()) @self.crop_transform_handle.on_update def _(_): @@ -409,19 +430,22 @@ def _(_): handle_position = handle_position / VISER_NERFSTUDIO_SCALE_RATIO handle_rotmat = quat_to_rotmat(torch.tensor(self.crop_transform_handle.wxyz).to(self.device).float()) - means = self.model.gauss_params['means'].detach() - quats = self.model.gauss_params['quats'].detach() + means = self.model.gauss_params["means"].detach() + quats = self.model.gauss_params["quats"].detach() - means[crop_inds] = handle_position.float() + torch.matmul( - handle_rotmat, (curr_means[crop_inds] - curr_means[crop_inds].mean(dim=0)).T - ).T - quats[crop_inds] = torch.Tensor(Rot.from_matrix( - torch.matmul(handle_rotmat.float(), curr_rotmats.float()).cpu().numpy() - ).as_quat()).to(self.device) # this is in xyzw format + means[crop_inds] = ( + handle_position.float() + + torch.matmul(handle_rotmat, (curr_means[crop_inds] - curr_means[crop_inds].mean(dim=0)).T).T + ) + quats[crop_inds] = torch.Tensor( + Rot.from_matrix(torch.matmul(handle_rotmat.float(), curr_rotmats.float()).cpu().numpy()).as_quat() + ).to( + self.device + ) # this is in xyzw format quats[crop_inds] = quats[crop_inds][:, [3, 0, 1, 2]] # convert to wxyz format - self.model.gauss_params['means'] = torch.nn.Parameter(means.float()) - self.model.gauss_params['quats'] = torch.nn.Parameter(quats.float()) + self.model.gauss_params["means"] = torch.nn.Parameter(means.float()) + self.model.gauss_params["quats"] = torch.nn.Parameter(quats.float()) self.viewer_control.viewer._trigger_rerender() # trigger viewer rerender @@ -435,16 +459,16 @@ def _reshuffle_cluster_colors(self, button: ViewerButton): labels = self.cluster_labels - features_dc = self.model.gauss_params['features_dc'].detach() - features_rest = self.model.gauss_params['features_rest'].detach() + features_dc = self.model.gauss_params["features_dc"].detach() + features_rest = self.model.gauss_params["features_rest"].detach() for c_id in range(0, labels.max().int().item() + 1): # set the colors of the gaussians accordingly using colormap from matplotlib cluster_mask = np.where(labels == c_id) - features_dc[cluster_mask] = RGB2SH(colormap[c_id, :3].to(self.model.gauss_params['features_dc'])) + features_dc[cluster_mask] = RGB2SH(colormap[c_id, :3].to(self.model.gauss_params["features_dc"])) features_rest[cluster_mask] = 0 - self.model.gauss_params['features_dc'] = torch.nn.Parameter(self.model.gauss_params['features_dc']) - self.model.gauss_params['features_rest'] = torch.nn.Parameter(self.model.gauss_params['features_rest']) + self.model.gauss_params["features_dc"] = torch.nn.Parameter(self.model.gauss_params["features_dc"]) + self.model.gauss_params["features_rest"] = torch.nn.Parameter(self.model.gauss_params["features_rest"]) self.cluster_scene_shuffle_colors.set_disabled(False) def _cluster_scene(self, button: ViewerButton): @@ -456,15 +480,15 @@ def _cluster_scene(self, button: ViewerButton): scale = self.cluster_scene_scale.value grouping_model = self.garfield_pipeline[0].model - - positions = self.model.gauss_params['means'].detach() + + positions = 
self.model.gauss_params["means"].detach() group_feats = grouping_model.get_grouping_at_points(positions, scale).cpu().numpy() # (N, 256) positions = positions.cpu().numpy() start = time.time() # Cluster the gaussians using HDBSCAN. - # We will first cluster the downsampled gaussians, then + # We will first cluster the downsampled gaussians, then # assign the full gaussians to the spatially closest downsampled gaussian. vec_o3d = o3d.utility.Vector3dVector(positions) @@ -473,13 +497,11 @@ def _cluster_scene(self, button: ViewerButton): max_bound = np.clip(pc_o3d.get_max_bound(), -1, 1) # downsample size to be a percent of the bounding box extent downsample_size = 0.01 * scale - pc, _, ids = pc_o3d.voxel_down_sample_and_trace( - max(downsample_size, 0.0001), min_bound, max_bound - ) + pc, _, ids = pc_o3d.voxel_down_sample_and_trace(max(downsample_size, 0.0001), min_bound, max_bound) if len(ids) > 1e6: print(f"Too many points ({len(ids)}) to cluster... aborting.") - print( "Consider using interactive select to reduce points before clustering.") - print( "Are you sure you want to cluster? Press y to continue, else return.") + print("Consider using interactive select to reduce points before clustering.") + print("Are you sure you want to cluster? Press y to continue, else return.") # wait for input to continue, if yes then continue, else return if input() != "y": self.cluster_scene.set_disabled(False) @@ -509,9 +531,7 @@ def _cluster_scene(self, button: ViewerButton): positions_np = positions[non_clustered] if positions_np.shape[0] > 0: # i.e., if there were points removed during downsampling k = 1 - nn_model = NearestNeighbors( - n_neighbors=k, algorithm="auto", metric="euclidean" - ).fit(positions_downsampled) + nn_model = NearestNeighbors(n_neighbors=k, algorithm="auto", metric="euclidean").fit(positions_downsampled) _, indices = nn_model.kneighbors(positions_np) clusterer.labels_[non_clustered] = labels[indices[:, 0]] @@ -519,14 +539,12 @@ def _cluster_scene(self, button: ViewerButton): print(f"done. Took {time.time()-start} seconds. 
Found {labels.max() + 1} clusters.") noise_mask = labels == -1 - if noise_mask.sum() != 0 and (labels>=0).sum() > 0: + if noise_mask.sum() != 0 and (labels >= 0).sum() > 0: # if there is noise, but not all of it is noise, relabel the noise - valid_mask = labels >=0 + valid_mask = labels >= 0 valid_positions = positions[valid_mask] k = 1 - nn_model = NearestNeighbors( - n_neighbors=k, algorithm="auto", metric="euclidean" - ).fit(valid_positions) + nn_model = NearestNeighbors(n_neighbors=k, algorithm="auto", metric="euclidean").fit(valid_positions) noise_positions = positions[noise_mask] _, indices = nn_model.kneighbors(noise_positions) # for now just pick the closest cluster @@ -538,21 +556,21 @@ def _cluster_scene(self, button: ViewerButton): colormap = self.colormap - opacities = self.model.gauss_params['opacities'].detach() + opacities = self.model.gauss_params["opacities"].detach() opacities[labels < 0] = -100 # hide unclustered gaussians - self.model.gauss_params['opacities'] = torch.nn.Parameter(opacities.float()) + self.model.gauss_params["opacities"] = torch.nn.Parameter(opacities.float()) self.cluster_labels = torch.Tensor(labels) - features_dc = self.model.gauss_params['features_dc'].detach() - features_rest = self.model.gauss_params['features_rest'].detach() + features_dc = self.model.gauss_params["features_dc"].detach() + features_rest = self.model.gauss_params["features_rest"].detach() for c_id in range(0, labels.max() + 1): # set the colors of the gaussians accordingly using colormap from matplotlib cluster_mask = np.where(labels == c_id) - features_dc[cluster_mask] = RGB2SH(colormap[c_id, :3].to(self.model.gauss_params['features_dc'])) + features_dc[cluster_mask] = RGB2SH(colormap[c_id, :3].to(self.model.gauss_params["features_dc"])) features_rest[cluster_mask] = 0 - self.model.gauss_params['features_dc'] = torch.nn.Parameter(self.model.gauss_params['features_dc']) - self.model.gauss_params['features_rest'] = torch.nn.Parameter(self.model.gauss_params['features_rest']) + self.model.gauss_params["features_dc"] = torch.nn.Parameter(self.model.gauss_params["features_dc"]) + self.model.gauss_params["features_rest"] = torch.nn.Parameter(self.model.gauss_params["features_rest"]) self.cluster_scene.set_disabled(False) self.viewer_control.viewer._trigger_rerender() # trigger viewer rerender @@ -565,8 +583,9 @@ def _export_visible_gaussians(self, button: ViewerButton): # Copied from exporter.py from collections import OrderedDict + map_to_tensors = OrderedDict() - model=self.model + model = self.model with torch.no_grad(): positions = model.means.cpu().numpy() @@ -619,17 +638,18 @@ def _export_visible_gaussians(self, button: ViewerButton): map_to_tensors[k] = map_to_tensors[k][select] count = np.sum(select) from nerfstudio.scripts.exporter import ExportGaussianSplat - ExportGaussianSplat.write_ply(str(filename), count, map_to_tensors) + ExportGaussianSplat.write_ply(str(filename), count, map_to_tensors) def render_from_path(self, button: ViewerButton): - from nerfstudio.cameras.camera_paths import get_path_from_json import json + + from nerfstudio.cameras.camera_paths import get_path_from_json from nerfstudio.scripts.render import _render_trajectory_video assert self.z_export_options_camera_path_filename.value != "" camera_path_filename = Path(self.z_export_options_camera_path_filename.value) - + with open(camera_path_filename, "r", encoding="utf-8") as f: camera_path = json.load(f) seconds = camera_path["seconds"] @@ -639,9 +659,9 @@ def render_from_path(self, button: ViewerButton): 
_render_trajectory_video( self, camera_path, - output_filename=Path('render.mp4'), - rendered_output_names=['rgb'], - rendered_resolution_scaling_factor=1.0 , + output_filename=Path("render.mp4"), + rendered_output_names=["rgb"], + rendered_resolution_scaling_factor=1.0, seconds=seconds, output_format="video", ) diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_interaction.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_interaction.py index 6094e5aa52..503ca02daa 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_interaction.py +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_interaction.py @@ -1,41 +1,33 @@ """Helper functions for interacting/visualization with GARField model.""" + from typing import List, Optional, Tuple, Union -import viser -import trimesh -import torch.nn as nn +import torch.nn as nn +import trimesh +import viser +from garfield.garfield_model import GarfieldModel from nerfstudio.cameras.rays import RayBundle from nerfstudio.field_components.field_heads import FieldHeadNames from nerfstudio.model_components.losses import scale_gradients_by_distance_squared - -from nerfstudio.viewer.viewer_elements import * from nerfstudio.viewer.viewer import VISER_NERFSTUDIO_SCALE_RATIO +from nerfstudio.viewer.viewer_elements import * -from garfield.garfield_model import GarfieldModel class GarfieldClickScene(nn.Module): """UI for clicking on a scene (visualized as spheres). This needs to be a nn.Module to allow the viewer to register callbacks. """ + _click_handle: viser.GlbHandle _box_handle: viser.GlbHandle selected_location: np.ndarray scale_handle: ViewerSlider # For getting the scale to query GARField model_handle: List[GarfieldModel] # Store as list to avoid circular children - def __init__( - self, - device: torch.device, - scale_handle: ViewerSlider, - model_handle: List[GarfieldModel] - ): + def __init__(self, device: torch.device, scale_handle: ViewerSlider, model_handle: List[GarfieldModel]): super().__init__() - self.add_click_button: ViewerButton = ViewerButton( - name="Click", cb_hook=self._add_click_cb - ) - self.del_click_button: ViewerButton = ViewerButton( - name="Reset Click", cb_hook=self._del_click_cb - ) + self.add_click_button: ViewerButton = ViewerButton(name="Click", cb_hook=self._add_click_cb) + self.del_click_button: ViewerButton = ViewerButton(name="Reset Click", cb_hook=self._del_click_cb) self.viewer_control: ViewerControl = ViewerControl() self.scale_handle = scale_handle @@ -49,12 +41,14 @@ def __init__( def _add_click_cb(self, button: ViewerButton): """Button press registers a click event, which will add a sphere. - Refer more to nerfstudio docs for more details. 
""" + Refer more to nerfstudio docs for more details.""" self.add_click_button.set_disabled(True) + def del_handle_on_rayclick(click: ViewerClick): self._on_rayclick(click) self.add_click_button.set_disabled(False) self.viewer_control.unregister_click_cb(del_handle_on_rayclick) + self.viewer_control.register_click_cb(del_handle_on_rayclick) def _on_rayclick(self, click: ViewerClick): @@ -91,9 +85,7 @@ def _on_rayclick(self, click: ViewerClick): sphere_mesh: trimesh.Trimesh = trimesh.creation.icosphere(radius=0.1) sphere_mesh.vertices += click_position sphere_mesh.visual.vertex_colors = (1.0, 0.0, 0.0, 1.0) # type: ignore - sphere_mesh_handle = self.viewer_control.viser_server.add_mesh_trimesh( - name=f"/hit_pos", mesh=sphere_mesh - ) + sphere_mesh_handle = self.viewer_control.viser_server.add_mesh_trimesh(name=f"/hit_pos", mesh=sphere_mesh) self._click_handle = sphere_mesh_handle self.selected_location = np.array(origin + direction * distance) self._update_scale_vis(self.scale_handle) @@ -114,13 +106,15 @@ def _update_scale_vis(self, slider: ViewerSlider): self._box_handle.remove() self._box_handle = None if self.selected_location is not None: - box_mesh = trimesh.creation.icosphere(radius=VISER_NERFSTUDIO_SCALE_RATIO*max(0.001, slider.value)/2, subdivision=0) + box_mesh = trimesh.creation.icosphere( + radius=VISER_NERFSTUDIO_SCALE_RATIO * max(0.001, slider.value) / 2, subdivision=0 + ) self._box_handle = self.viewer_control.viser_server.add_mesh_simple( - name=f"/hit_pos_box", + name=f"/hit_pos_box", vertices=box_mesh.vertices, faces=box_mesh.faces, position=(self.selected_location * VISER_NERFSTUDIO_SCALE_RATIO).flatten(), - wireframe=True + wireframe=True, ) def get_outputs(self, outputs: dict): @@ -130,7 +124,7 @@ def get_outputs(self, outputs: dict): location = self.selected_location instance_scale = self.scale_handle.value - + # mimic the fields call grouping_field = self.model_handle[0].grouping_field positions = torch.tensor(location).view(1, 3).to(self.device) @@ -141,6 +135,4 @@ def get_outputs(self, outputs: dict): x = x / x.norm(dim=-1, keepdim=True) instance_pass = grouping_field.get_mlp(x, torch.tensor([instance_scale]).to(self.device).view(1, 1)) - return { - "instance_interact": torch.norm(outputs['instance'] - instance_pass.float(), p=2, dim=-1) - } \ No newline at end of file + return {"instance_interact": torch.norm(outputs["instance"] - instance_pass.float(), p=2, dim=-1)} diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_model.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_model.py index 82ac67cda6..2e991dec7f 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_model.py +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_model.py @@ -1,26 +1,21 @@ from collections import defaultdict from dataclasses import dataclass, field -from typing import Any, Dict, List, Mapping, Tuple, Type, Literal +from typing import Any, Dict, List, Literal, Mapping, Tuple, Type -import torch.nn as nn -from torch.nn import Parameter -import trimesh import numpy as np -from torchtyping import TensorType - import torch +import torch.nn as nn import torch.nn.functional as F +import trimesh +from garfield.garfield_field import GarfieldField, GarfieldFieldConfig from nerfstudio.cameras.rays import RayBundle, RaySamples from nerfstudio.field_components.field_heads import FieldHeadNames +from nerfstudio.model_components.losses import scale_gradients_by_distance_squared from nerfstudio.models.nerfacto import 
NerfactoModel, NerfactoModelConfig -from nerfstudio.viewer.viewer_elements import * from nerfstudio.viewer.viewer import VISER_NERFSTUDIO_SCALE_RATIO -from nerfstudio.model_components.losses import scale_gradients_by_distance_squared - -from garfield.garfield_field import ( - GarfieldField, - GarfieldFieldConfig, -) +from nerfstudio.viewer.viewer_elements import * +from torch.nn import Parameter +from torchtyping import TensorType class FeatureRenderer(nn.Module): @@ -63,18 +58,17 @@ def populate_modules(self): self.renderer_feat = FeatureRenderer() self.config.instance_field.use_single_scale = self.config.use_single_scale self.grouping_field = self.config.instance_field.setup() - + # Add a slider to the viewer to control the scale of the grouping field. self.scale_slider = ViewerSlider("Scale", 0.0, 0.0, 2.0, 0.001) - # Store reference to click interface for GARField. + # Store reference to click interface for GARField. # Note the List[GarfieldModel] is to avoid circular children. from garfield.garfield_interaction import GarfieldClickScene + self.click_scene: GarfieldClickScene = GarfieldClickScene( - device=("cuda" if torch.cuda.is_available() else "cpu"), - scale_handle=self.scale_slider, - model_handle=[self] - ) + device=("cuda" if torch.cuda.is_available() else "cpu"), scale_handle=self.scale_slider, model_handle=[self] + ) def get_outputs(self, ray_bundle: RayBundle) -> Dict[str, TensorType]: outputs = super().get_outputs(ray_bundle) @@ -84,14 +78,16 @@ def get_outputs(self, ray_bundle: RayBundle) -> Dict[str, TensorType]: return outputs # Recalculate ray samples and weights - # ... only if the model is in eval mode, where it should be no_grad(). + # ... only if the model is in eval mode, where it should be no_grad(). # If in training mode, `outputs` should already have calculated ray samples and weights. # Without this if-block, camera optimizer? gradients? seem to get messed up. ray_samples: RaySamples if self.training: ray_samples, weights = outputs["ray_samples_list"][-1], outputs["weights_list"][-1] else: - ray_samples, weights_list, ray_samples_list = self.proposal_sampler(ray_bundle, density_fns=self.density_fns) + ray_samples, weights_list, ray_samples_list = self.proposal_sampler( + ray_bundle, density_fns=self.density_fns + ) field_outputs = self.field.forward(ray_samples, compute_normals=self.config.predict_normals) if self.config.use_gradient_scaling: field_outputs = scale_gradients_by_distance_squared(field_outputs, ray_samples) @@ -100,17 +96,11 @@ def get_outputs(self, ray_bundle: RayBundle) -> Dict[str, TensorType]: # Choose the top k samples with the highest weights, to be used for grouping. # This is to decrease # of samples queried for grouping, while sampling close to the scene density. def gather_fn(tens): - return torch.gather( - tens, -2, best_ids.expand(*best_ids.shape[:-1], tens.shape[-1]) - ) + return torch.gather(tens, -2, best_ids.expand(*best_ids.shape[:-1], tens.shape[-1])) dataclass_fn = lambda dc: dc._apply_fn_to_fields(gather_fn, dataclass_fn) - grouping_weights, best_ids = torch.topk( - weights, self.config.num_feat_samples, dim=-2, sorted=False - ) - grouping_samples: RaySamples = ray_samples._apply_fn_to_fields( - gather_fn, dataclass_fn - ) + grouping_weights, best_ids = torch.topk(weights, self.config.num_feat_samples, dim=-2, sorted=False) + grouping_samples: RaySamples = ray_samples._apply_fn_to_fields(gather_fn, dataclass_fn) # Define the scale for each sample. If the scale is not provided, use the selected scale. 
# "scale" is included in ray_bundle.metadata only from training batches, but @@ -123,17 +113,12 @@ def gather_fn(tens): instance_scales = scales.view(grouping_samples.shape[0], 1) else: slider_value = self.scale_slider.value - instance_scales = ( - torch.ones(grouping_samples.shape[0], 1, device=self.device) - * slider_value - ) + instance_scales = torch.ones(grouping_samples.shape[0], 1, device=self.device) * slider_value # Calculate features for the scale-conditioned grouping field. # Hash values need to be included in the outputs for the loss calculation. hash = self.grouping_field.get_hash(grouping_samples) - hash_rendered = self.renderer_feat( - embeds=hash, weights=grouping_weights.detach().half() - ) + hash_rendered = self.renderer_feat(embeds=hash, weights=grouping_weights.detach().half()) if self.training: outputs["instance_hash"] = hash_rendered # normalized! outputs["instance"] = self.grouping_field.get_mlp(hash_rendered, instance_scales).float() @@ -188,9 +173,9 @@ def get_loss_dict_group(self, outputs, batch, metrics_dict=None): torch.eye(num_chunks, device=self.device, dtype=bool), torch.ones((chunk_size, chunk_size), device=self.device, dtype=bool), ) # block-diagonal matrix, to consider only pairs within the same image - + # Only consider upper triangle to avoid double-counting - block_mask = torch.triu(block_mask, diagonal=0) + block_mask = torch.triu(block_mask, diagonal=0) # Only consider pairs where both points are valid (-1 means not in mask / invalid) block_mask = block_mask * (labels1_expanded != -1) * (labels2_expanded != -1) @@ -210,34 +195,22 @@ def get_loss_dict_group(self, outputs, batch, metrics_dict=None): # Note that `use_single_scale` (for ablation only) causes grouping_field to ignore the scale input. instance = self.grouping_field.get_mlp(hash_rendered, scale) mask = torch.where(mask_full_positive * block_mask * (~diag_mask)) - instance_loss_1 = torch.norm( - instance[mask[0]] - instance[mask[1]], p=2, dim=-1 - ).nansum() + instance_loss_1 = torch.norm(instance[mask[0]] - instance[mask[1]], p=2, dim=-1).nansum() total_loss += instance_loss_1 # 2. If ", then also supervise them to be similar at s > s_A if self.config.use_hierarchy_losses and (not self.config.use_single_scale): - scale_diff = torch.max( - torch.zeros_like(scale), (self.config.max_grouping_scale - scale) - ) - larger_scale = scale + scale_diff * torch.rand( - size=(1,), device=scale.device - ) + scale_diff = torch.max(torch.zeros_like(scale), (self.config.max_grouping_scale - scale)) + larger_scale = scale + scale_diff * torch.rand(size=(1,), device=scale.device) instance = self.grouping_field.get_mlp(hash_rendered, larger_scale) mask = torch.where(mask_full_positive * block_mask * (~diag_mask)) - instance_loss_2 = torch.norm( - instance[mask[0]] - instance[mask[1]], p=2, dim=-1 - ).nansum() + instance_loss_2 = torch.norm(instance[mask[0]] - instance[mask[1]], p=2, dim=-1).nansum() total_loss += instance_loss_2 # 4. Also supervising A, B to be dissimilar at scales s_A, s_B respectively seems to help. 
instance = self.grouping_field.get_mlp(hash_rendered, scale) mask = torch.where(mask_full_negative * block_mask) - instance_loss_4 = ( - F.relu( - margin - torch.norm(instance[mask[0]] - instance[mask[1]], p=2, dim=-1) - ) - ).nansum() + instance_loss_4 = (F.relu(margin - torch.norm(instance[mask[0]] - instance[mask[1]], p=2, dim=-1))).nansum() total_loss += instance_loss_4 loss_dict["instance_loss"] = total_loss / torch.sum(block_mask).float() diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pipeline.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pipeline.py index bd5eceb3e2..4e793c1831 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pipeline.py +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pipeline.py @@ -1,16 +1,14 @@ import typing from dataclasses import dataclass, field -from typing import Literal, Type, Mapping, Any +from typing import Any, Literal, Mapping, Type import torch -from nerfstudio.pipelines.base_pipeline import VanillaPipeline, VanillaPipelineConfig -from torch.cuda.amp.grad_scaler import GradScaler - import tqdm - -from sklearn.preprocessing import QuantileTransformer -from garfield.garfield_datamanager import GarfieldDataManagerConfig, GarfieldDataManager +from garfield.garfield_datamanager import GarfieldDataManager, GarfieldDataManagerConfig from garfield.garfield_model import GarfieldModel, GarfieldModelConfig +from nerfstudio.pipelines.base_pipeline import VanillaPipeline, VanillaPipelineConfig +from sklearn.preprocessing import QuantileTransformer +from torch.cuda.amp.grad_scaler import GradScaler @dataclass @@ -64,9 +62,7 @@ def get_train_loss_dict(self, step: int): # Initialize grouping statistics. This will be automatically loaded from a checkpoint next time. 
scale_stats = self.datamanager.scale_3d_statistics self.grouping_stats = torch.nn.Parameter(scale_stats) - self.model.grouping_field.quantile_transformer = ( - self._get_quantile_func(scale_stats) - ) + self.model.grouping_field.quantile_transformer = self._get_quantile_func(scale_stats) # Set the number of rays per image to the number of rays per image for grouping pixel_sampler = self.datamanager.train_pixel_sampler pixel_sampler.num_rays_per_image = pixel_sampler.config.num_rays_per_image @@ -76,16 +72,12 @@ def get_train_loss_dict(self, step: int): # also set the grouping info in the batch; in-place operation self.datamanager.next_group(ray_bundle, batch) - model_outputs = self._model( - ray_bundle - ) # train distributed data parallel model if world_size > 1 + model_outputs = self._model(ray_bundle) # train distributed data parallel model if world_size > 1 metrics_dict = self.model.get_metrics_dict(model_outputs, batch) loss_dict = self.model.get_loss_dict(model_outputs, batch, metrics_dict) if step >= self.config.start_grouping_step: - loss_dict.update( - self.model.get_loss_dict_group(model_outputs, batch, metrics_dict) - ) + loss_dict.update(self.model.get_loss_dict_group(model_outputs, batch, metrics_dict)) return model_outputs, loss_dict, metrics_dict @@ -101,13 +93,9 @@ def populate_grouping_info(self): scales_3d_list, pixel_level_keys_list, group_cdf_list = [], [], [] train_cameras = self.datamanager.train_dataset.cameras for i in tqdm.trange(len(train_cameras), desc="Calculating 3D masks"): - camera_ray_bundle = train_cameras.generate_rays(camera_indices=i).to( - self.device - ) + camera_ray_bundle = train_cameras.generate_rays(camera_indices=i).to(self.device) with torch.no_grad(): - outputs = self.model.get_outputs_for_camera_ray_bundle( - camera_ray_bundle - ) + outputs = self.model.get_outputs_for_camera_ray_bundle(camera_ray_bundle) # Get RGB (for SAM mask generation), depth and 3D point locations (for 3D scale calculation) rgb = self.datamanager.train_dataset[i]["image"] @@ -118,9 +106,7 @@ def populate_grouping_info(self): pixel_level_keys, scale_3d, group_cdf, - ) = self.datamanager._calculate_3d_groups( - rgb, depth, points, max_scale=self.config.max_grouping_scale - ) + ) = self.datamanager._calculate_3d_groups(rgb, depth, points, max_scale=self.config.max_grouping_scale) pixel_level_keys_list.append(pixel_level_keys) scales_3d_list.append(scale_3d) @@ -128,20 +114,14 @@ def populate_grouping_info(self): # Save grouping data, and set it in the datamanager for current training. # This will be cached, so we don't need to calculate it again. - self.datamanager.save_sam_data( - pixel_level_keys_list, scales_3d_list, group_cdf_list - ) - self.datamanager.pixel_level_keys = torch.nested.nested_tensor( - pixel_level_keys_list - ) + self.datamanager.save_sam_data(pixel_level_keys_list, scales_3d_list, group_cdf_list) + self.datamanager.pixel_level_keys = torch.nested.nested_tensor(pixel_level_keys_list) self.datamanager.scale_3d = torch.nested.nested_tensor(scales_3d_list) self.datamanager.group_cdf = torch.nested.nested_tensor(group_cdf_list) # Initialize grouping statistics. This will be automatically loaded from a checkpoint next time. 
self.grouping_stats = torch.nn.Parameter(torch.cat(scales_3d_list)) - self.model.grouping_field.quantile_transformer = self._get_quantile_func( - torch.cat(scales_3d_list) - ) + self.model.grouping_field.quantile_transformer = self._get_quantile_func(torch.cat(scales_3d_list)) # Turn model back to train mode self.model.train() @@ -153,13 +133,9 @@ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True): """ # Load 3D group scale statistics grouping_stats = state_dict["grouping_stats"] - self.grouping_stats = torch.nn.Parameter(torch.zeros_like(grouping_stats)).to( - self.device - ) + self.grouping_stats = torch.nn.Parameter(torch.zeros_like(grouping_stats)).to(self.device) # Calculate quantile transformer - self.model.grouping_field.quantile_transformer = self._get_quantile_func( - grouping_stats - ) + self.model.grouping_field.quantile_transformer = self._get_quantile_func(grouping_stats) return super().load_state_dict(state_dict, strict) @@ -179,8 +155,6 @@ def _get_quantile_func(self, scales: torch.Tensor, distribution="normal"): def quantile_transformer_func(scales): # This function acts as a wrapper for QuantileTransformer. # QuantileTransformer expects a numpy array, while we have a torch tensor. - return torch.Tensor( - quantile_transformer.transform(scales.cpu().numpy()) - ).to(scales.device) + return torch.Tensor(quantile_transformer.transform(scales.cpu().numpy())).to(scales.device) return quantile_transformer_func diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pixel_sampler.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pixel_sampler.py index 69deed66d8..bfd1f60701 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pixel_sampler.py +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/garfield_pixel_sampler.py @@ -9,12 +9,9 @@ import torch from jaxtyping import Int -from torch import Tensor -from nerfstudio.data.pixel_samplers import ( - PixelSampler, - PixelSamplerConfig, -) +from nerfstudio.data.pixel_samplers import PixelSampler, PixelSamplerConfig from rich.progress import Console +from torch import Tensor CONSOLE = Console(width=120) @@ -27,7 +24,9 @@ class GarfieldPixelSamplerConfig(PixelSamplerConfig): class GarfieldPixelSampler(PixelSampler): def __init__(self, config: GarfieldPixelSamplerConfig, **kwargs): - self.num_rays_per_image = 1 # Start with 1 (i.e., no indices grouped by image. Will be updated later in pipeline) + self.num_rays_per_image = ( + 1 # Start with 1 (i.e., no indices grouped by image. Will be updated later in pipeline) + ) super().__init__(config, **kwargs) def sample_method( @@ -50,9 +49,7 @@ def sample_method( ] """ if isinstance(mask, Tensor): - raise NotImplementedError( - "GarfieldPixelSampler does not support masks yet." 
- ) + raise NotImplementedError("GarfieldPixelSampler does not support masks yet.") indices = super().sample_method( batch_size, diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield/img_group_model.py b/fvdb/projects/panoptic_segmentation/garfield/garfield/img_group_model.py index d527e7d0cd..1642c90393 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/garfield/img_group_model.py +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield/img_group_model.py @@ -3,15 +3,13 @@ """ from dataclasses import dataclass, field -from typing import Type, Union, Literal +from typing import Literal, Type, Union -import torch import numpy as np -from transformers import pipeline - -from PIL import Image - +import torch from nerfstudio.configs import base_config as cfg +from PIL import Image +from transformers import pipeline @dataclass @@ -32,7 +30,7 @@ class ImgGroupModelConfig(cfg.InstantiateConfig): "Arguments for SAM model (fb)." # # Settings used for the paper: - # model_type="sam_fb", + # model_type="sam_fb", # sam_model_type="vit_h", # sam_model_ckpt="models/sam_vit_h_4b8939.pth", # sam_kwargs={ @@ -50,6 +48,7 @@ class ImgGroupModel: Original paper uses SAM, but we can use any model that outputs masks. The code currently assumes that every image has at least one group/mask. """ + def __init__(self, config: ImgGroupModelConfig, **kwargs): self.config = config self.kwargs = kwargs @@ -67,35 +66,34 @@ def __call__(self, img: np.ndarray): self.model = pipeline("mask-generation", model="facebook/sam-vit-huge", device=self.device) img = Image.fromarray(img) masks = self.model(img, points_per_side=32, pred_iou_thresh=0.90, stability_score_thresh=0.90) - masks = masks['masks'] + masks = masks["masks"] masks = sorted(masks, key=lambda x: x.sum()) return masks - + elif self.config.model_type == "sam_fb": # For using the original SAM model if self.model is None: - from segment_anything import SamAutomaticMaskGenerator, sam_model_registry + from segment_anything import ( + SamAutomaticMaskGenerator, + sam_model_registry, + ) + registry = sam_model_registry[self.config.sam_model_type] model = registry(checkpoint=self.config.sam_model_ckpt) model = model.to(device=self.config.device) - self.model = SamAutomaticMaskGenerator( - model=model, **self.config.sam_kwargs - ) + self.model = SamAutomaticMaskGenerator(model=model, **self.config.sam_kwargs) masks = self.model.generate(img) - masks = [m['segmentation'] for m in masks] # already as bool + masks = [m["segmentation"] for m in masks] # already as bool masks = sorted(masks, key=lambda x: x.sum()) return masks - + elif self.config.model_type == "maskformer": # For using another model (e.g., MaskFormer) if self.model is None: self.model = pipeline(model="facebook/maskformer-swin-large-coco", device=self.device) img = Image.fromarray(img) masks = self.model(img) - masks = [ - (np.array(m['mask']) != 0) - for m in masks - ] + masks = [(np.array(m["mask"]) != 0) for m in masks] masks = sorted(masks, key=lambda x: x.sum()) return masks From 8be8734a12174e40c9a1333b1e54e628d56e8dbb Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Tue, 24 Dec 2024 00:09:06 +1300 Subject: [PATCH 38/59] Fix trailing whitespace Signed-off-by: Jonathan Swartz --- .../panoptic_segmentation/garfield/README.md | 14 +++++++------- .../garfield/garfield_environment.yml | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fvdb/projects/panoptic_segmentation/garfield/README.md b/fvdb/projects/panoptic_segmentation/garfield/README.md index 
b42aef587a..37e1dec05d 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/README.md +++ b/fvdb/projects/panoptic_segmentation/garfield/README.md @@ -2,7 +2,7 @@ This is the official implementation for [GARField](https://www.garfield.studio). -Tested on Python 3.10, cuda 12.0, using conda. +Tested on Python 3.10, cuda 12.0, using conda.
@@ -28,9 +28,9 @@ The best way to install it is through conda: `conda install -c rapidsai -c conda , or with pip: `pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12==24.2.* cuml-cu12==24.2.*`. -Important: I used [`libmamba`](https://www.anaconda.com/blog/a-faster-conda-for-a-growing-community) for conda. I have been told multiple times that the conda solver is very slow / gets stuck, but this seems to be key. +Important: I used [`libmamba`](https://www.anaconda.com/blog/a-faster-conda-for-a-growing-community) for conda. I have been told multiple times that the conda solver is very slow / gets stuck, but this seems to be key. -If you get `ClobberError`, try `conda clean --all` -- see [here](https://stackoverflow.com/questions/51217876/conda-update-anaconda-fails-clobbererror). It seems that `pip` installed packages from `nerfstudio` may conflict with the `conda` install here. +If you get `ClobberError`, try `conda clean --all` -- see [here](https://stackoverflow.com/questions/51217876/conda-update-anaconda-fails-clobbererror). It seems that `pip` installed packages from `nerfstudio` may conflict with the `conda` install here. 3. Install GARField! ``` @@ -65,14 +65,14 @@ Afterwards, you can start interacting with the affinity field. https://github.com/chungmin99/garfield/assets/10284938/e193d7e8-da7c-4176-b7c5-a7ec75513c16 2. Affinity visualization between 3D point and scene: use "Click" button to - select the point, and select `instance_interact` as the output type. + select the point, and select `instance_interact` as the output type. You might need to drag the viewer window slightly to see this output type. Again, interact with the `scale` slider! -Here, with `invert` True and output unnormalized, red color means high affinity (i.e., features at click point and rendered point are close to each other). Blue means low affinity. +Here, with `invert` True and output unnormalized, red color means high affinity (i.e., features at click point and rendered point are close to each other). Blue means low affinity. https://github.com/chungmin99/garfield/assets/10284938/6edbdad6-d356-4b32-b44e-0df8ec1dca16 -Also, note: the results can change a lot between 2k to 30k steps. +Also, note: the results can change a lot between 2k to 30k steps. Once the model is trained to completion, you can use the outputted config file for `garfield-gauss`. 
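[Editor's note, not part of the patch: the affinity visualisation described in the hunk above boils down to comparing the grouping feature at the clicked point with the feature rendered at every other pixel. A minimal sketch of that readout, assuming a per-pixel feature map at a fixed scale; `feats` and `click` are illustrative names, not GARField's API.]

```python
import torch

def affinity_to_click(feats: torch.Tensor, click: torch.Tensor) -> torch.Tensor:
    # Negated euclidean distance between each pixel's feature and the clicked
    # feature: larger values mean closer features, i.e. higher affinity (red).
    return -torch.linalg.norm(feats - click, dim=-1)

feats = torch.randn(4, 4, 8)            # toy 4x4 image of 8-dim features
click = feats[2, 3]                      # feature at the "clicked" pixel
print(affinity_to_click(feats, click))   # 4x4 affinity map, maximal at (2, 3)
```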
@@ -96,7 +96,7 @@ https://github.com/chungmin99/garfield/assets/10284938/82ea7145-d8d1-485d-bab2-f https://github.com/chungmin99/garfield/assets/10284938/541fe037-925c-418f-929d-a9397f8d57d3 - + ## Citation If you use this work or find it helpful, please consider citing: (bibtex) diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml b/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml index 651d195945..77303baed6 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml @@ -15,7 +15,7 @@ dependencies: - cuda-nvcc=12.1 - cuda-cccl=12.1 - cuda-libraries-static - # specifically need these 12.1.1 versions of cudart + # specifically need these 12.1.1 versions of cudart # because of awkward overwriting with conda-forge versions that get picked up - nvidia/label/cuda-12.1.1::cuda-cudart-static - nvidia/label/cuda-12.1.1::cuda-cudart From 6abaae22dda70e6903cc7fa65e07020d11706a81 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Fri, 3 Jan 2025 17:10:16 +1300 Subject: [PATCH 39/59] Add download_example_data script for downloading example colmap data from LERF project used by GARField paper. Updated README Signed-off-by: Jonathan Swartz --- .../panoptic_segmentation/garfield/.gitignore | 1 + .../panoptic_segmentation/garfield/README.md | 121 ++++-------------- .../garfield/download_example_data.py | 51 ++++++++ .../garfield/garfield_environment.yml | 1 + 4 files changed, 75 insertions(+), 99 deletions(-) create mode 100755 fvdb/projects/panoptic_segmentation/garfield/download_example_data.py diff --git a/fvdb/projects/panoptic_segmentation/garfield/.gitignore b/fvdb/projects/panoptic_segmentation/garfield/.gitignore index 48702ef2fc..5c3efa347e 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/.gitignore +++ b/fvdb/projects/panoptic_segmentation/garfield/.gitignore @@ -2,3 +2,4 @@ __pycache__/ *.egg-info/ outputs/ +/data diff --git a/fvdb/projects/panoptic_segmentation/garfield/README.md b/fvdb/projects/panoptic_segmentation/garfield/README.md index 37e1dec05d..e2805a6c6e 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/README.md +++ b/fvdb/projects/panoptic_segmentation/garfield/README.md @@ -1,110 +1,33 @@ # GARField: Group Anything with Radiance Fields -This is the official implementation for [GARField](https://www.garfield.studio). - -Tested on Python 3.10, cuda 12.0, using conda. - -
- -
+This code is based on the official implementation for [GARField](https://github.com/chungmin99/garfield). ## Installation -1. Install nerfstudio from source, and its dependencies. This project requires the latest version of nerfstudio -(more specifically, the new viewer based on viser). -``` -# install dependencies -pip3 install torch torchvision torchaudio -conda install -c "nvidia/label/cuda-12.0.0" cuda-toolkit -pip install ninja git+https://github.com/NVlabs/tiny-cuda-nn/#subdirectory=bindings/torch - -# install nerfstudio! -git clone git@github.com:nerfstudio-project/nerfstudio.git -cd nerfstudio -pip install -e . -``` - -2. To use GARField with Gaussian Splatting, [`cuml`](https://docs.rapids.ai/install) is required (for global clustering). -The best way to install it is through conda: `conda install -c rapidsai -c conda-forge -c nvidia cuml` - -, or with pip: `pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12==24.2.* cuml-cu12==24.2.*`. - -Important: I used [`libmamba`](https://www.anaconda.com/blog/a-faster-conda-for-a-growing-community) for conda. I have been told multiple times that the conda solver is very slow / gets stuck, but this seems to be key. - -If you get `ClobberError`, try `conda clean --all` -- see [here](https://stackoverflow.com/questions/51217876/conda-update-anaconda-fails-clobbererror). It seems that `pip` installed packages from `nerfstudio` may conflict with the `conda` install here. -3. Install GARField! -``` -git clone git@github.com:chungmin99/garfield.git -pip install -e . -``` +1. Create the `fvdb_garfield` environment with conda. This will install or build all the necessary dependencies. + ```bash + conda env create -f ./garfield_environment.yml + ``` -This installs both `garfield` (NeRF geometry), and `garfield-gauss` (Gaussian geometry). -Note that `garfield-gauss` requires reference to a fully trained `garfield` checkpoint, -as it relies on the affinity field from `garfield`. See the main paper for more details. - -4. (Optional) If you wish to use a different version of the SAM model (by default, the Hugging Face Transformer's SAM model facebook/sam-vit-huge is used), please install the 'segment_anything' package. - -``` -pip install git+https://github.com/facebookresearch/segment-anything.git -``` +2. Activate the `fvdb_garfield` environment and install the `garfield` package. + ```bash + conda activate fvdb_garfield + pip install -e . + ``` ## Running GARField -Note: using colmap-based image data makes it more convenient to run both `garfield` and `garfield-gauss` on the same dataset. Although `splatfacto` (Gaussian Splatting in nerfstudio) is supported with `NerfstudioDataParser`, and also supports random point initialization with non-colmap datasets, the NeRF and GS geometries will align better with colmap since 1) we will start from colmap points and 2) camera optimization is minimized. - -You can use it like any other third-party nerfstudio project. -``` -ns-train garfield --data /your/data/here -``` -Note that GARField will pause to generate groups using Segment-Anything at around 2000 steps -(set by default, this can be set in GarfieldPipeline). -Afterwards, you can start interacting with the affinity field. -1. PCA visualization of affinity field: select `instance` as the output type, - and change the value of `scale` slider. - -https://github.com/chungmin99/garfield/assets/10284938/e193d7e8-da7c-4176-b7c5-a7ec75513c16 - -2. 
Affinity visualization between 3D point and scene: use "Click" button to
-   select the point, and select `instance_interact` as the output type.
-   You might need to drag the viewer window slightly to see this output type.
-   Again, interact with the `scale` slider!
-Here, with `invert` True and output unnormalized, red color means high affinity (i.e., features at click point and rendered point are close to each other). Blue means low affinity.
-
-https://github.com/chungmin99/garfield/assets/10284938/6edbdad6-d356-4b32-b44e-0df8ec1dca16
-
-Also, note: the results can change a lot between 2k to 30k steps.
-
-Once the model is trained to completion, you can use the outputted config file for `garfield-gauss`.
-
-## Running GARField with Gaussian Splatting geometry!
-Although GARField's affinity field is optimized using NeRF geometry, it can be
-used to group and cluster gaussians in 3D!
-```
-ns-train garfield-gauss --data /your/data/here --pipeline.garfield-ckpt outputs/your/data/garfield/.../config.yml
-```
-
-There are two main ways to interact with the scene -- make sure to pause training first!
-1. Interactive selection: click anywhere in the scene, and use "Crop to Click" button to retrieve different groups (scale=group level*0.05). Use "Drag Current Crop" to move it around!
-
-
-https://github.com/chungmin99/garfield/assets/10284938/82ea7145-d8d1-485d-bab2-f6e8b0ebd632
-
-
-2. Global clustering: cluster the currently visible gaussians (either globally or just for the crop), at the scale specified by "Cluster Scale".
-
-
-https://github.com/chungmin99/garfield/assets/10284938/541fe037-925c-418f-929d-a9397f8d57d3
-
-
-
-## Citation
-If you use this work or find it helpful, please consider citing: (bibtex)
-
-```
-@inproceedings{garfield2024,
-    author = {Kim, Chung Min* and Wu, Mingxuan* and Kerr, Justin* and Tancik, Matthew and Goldberg, Ken and Kanazawa, Angjoo},
-    title = {GARField: Group Anything with Radiance Fields},
-    booktitle = {arXiv},
-    year = {2024},
-}
-```
+1. Download example colmap data
+   ```bash
+   ./download_example_data.py
+   ```
+
+2. Run GARField on the example data
+   ```bash
+   ns-train garfield --data ./data/figurines
+   ```
+
+3. 
(Optional) Run GARField with Gaussian Splatting geometry
+   ```bash
+   ns-train garfield-gauss --data ./data/figurines --pipeline.garfield-ckpt outputs/figurines/garfield/[datetimestamp]/config.yml
+   ```
diff --git a/fvdb/projects/panoptic_segmentation/garfield/download_example_data.py b/fvdb/projects/panoptic_segmentation/garfield/download_example_data.py
new file mode 100755
index 0000000000..b77bf15068
--- /dev/null
+++ b/fvdb/projects/panoptic_segmentation/garfield/download_example_data.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# Copyright Contributors to the OpenVDB Project
+# SPDX-License-Identifier: Apache-2.0
+#
+"""Script to download test GARFIELD dataset"""
+
+import os
+import zipfile
+from dataclasses import dataclass
+from pathlib import Path
+
+import gdown
+import tyro
+
+
+@dataclass
+class DownloadData:
+    dataset: str = "figurines"
+    save_dir: Path = Path(os.getcwd() + "/data")
+
+    def main(self):
+        self.save_dir.mkdir(parents=True, exist_ok=True)
+        dataset_savedir: Path = self.save_dir / self.dataset
+
+        if not dataset_savedir.exists():
+            downloaded_filename = Path(self.dataset_download(self.dataset))
+            if not downloaded_filename.exists():
+                raise FileNotFoundError(f"Downloaded file {downloaded_filename} does not exist, something went wrong")
+            self.unzip_download(downloaded_filename)
+
+    def dataset_download(self, dataset) -> str:
+        lerf_datasets_url = "https://drive.google.com/drive/folders/119bheSoSrgekkgQdSGa6E1jkNKC84HWL"
+        files = gdown.download_folder(url=lerf_datasets_url, skip_download=True, quiet=True)
+        download_target_file = self.save_dir / (self.dataset + ".zip")
+        for file in files:
+            if file.path == self.dataset + ".zip":
+                downloaded_filename = gdown.download(id=file.id, output=str(download_target_file))
+                break
+        if not download_target_file.exists():
+            raise FileNotFoundError(f"Dataset {self.dataset} not found in the Google Drive folder {lerf_datasets_url}")
+        return downloaded_filename
+
+    def unzip_download(self, downloaded_filename: Path):
+        with zipfile.ZipFile(downloaded_filename, "r") as zip_ref:
+            zip_ref.extractall(downloaded_filename.parent)
+        downloaded_filename.unlink()
+        print(f"{self.dataset} downloaded and unzipped successfully.")
+
+
+if __name__ == "__main__":
+    tyro.cli(DownloadData).main()
diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml b/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml
index 77303baed6..28e4c52e40 100644
--- a/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml
+++ b/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml
@@ -30,6 +30,7 @@ dependencies:
   - tqdm
   - numpy<2
   - tyro
+  - gdown
 ## nerfstudio
   - tensorboard
   - torchvision

From 99630eb403b4eaf027430dad946c714f7b672e73 Mon Sep 17 00:00:00 2001
From: Jonathan Swartz
Date: Wed, 8 Jan 2025 16:15:02 +1300
Subject: [PATCH 40/59] Make 3dgs download_example_data executable

Signed-off-by: Jonathan Swartz
---
 fvdb/examples/3dgs/download_example_data.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 fvdb/examples/3dgs/download_example_data.py

diff --git a/fvdb/examples/3dgs/download_example_data.py b/fvdb/examples/3dgs/download_example_data.py
old mode 100644
new mode 100755

From b4ba1cbeab52589cac6949872d7f20714d02257c Mon Sep 17 00:00:00 2001
From: Jonathan Swartz
Date: Wed, 8 Jan 2025 16:15:40 +1300
Subject: [PATCH 41/59] Fix colmap dataset distortion parameters; wrong number of params for simple radial distortion

Signed-off-by: Jonathan Swartz
---
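[Editor's note, placed below the "---" fold so it is not part of the commit message: the hunk that follows pads SIMPLE_RADIAL's lone k1 out to four coefficients. A plausible reading, sketched under the assumption that these parameters feed an OpenCV-style undistortion step: cv2 rejects distortion vectors that are not length 4, 5, 8, 12 or 14, so a single-element [k1] must be zero-padded to (k1, k2, p1, p2), exactly as the RADIAL branch already does.]

```python
# Hypothetical repro, not taken from the repository: undistorting with a
# k1-only SIMPLE_RADIAL model. Assumes opencv-python and numpy are installed.
import cv2
import numpy as np

K = np.array([[500.0, 0.0, 320.0],
              [0.0, 500.0, 240.0],
              [0.0, 0.0, 1.0]])
pts = np.array([[[100.0, 120.0]]], dtype=np.float32)  # one pixel coordinate

# dist = np.array([-0.1], dtype=np.float32)  # length-1 vector: cv2 raises an error
dist = np.array([-0.1, 0.0, 0.0, 0.0], dtype=np.float32)  # zero-padded (k1, k2, p1, p2)
print(cv2.undistortPoints(pts, K, dist, P=K))
```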
 fvdb/fvdb/utils/data/colmap_dataset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fvdb/fvdb/utils/data/colmap_dataset.py b/fvdb/fvdb/utils/data/colmap_dataset.py
index 4bce06a547..097e371222 100644
--- a/fvdb/fvdb/utils/data/colmap_dataset.py
+++ b/fvdb/fvdb/utils/data/colmap_dataset.py
@@ -5,6 +5,7 @@
 from typing import Any, Dict, List, Optional
 
 import cv2
+import cv2.data
 import imageio.v2 as imageio
 import numpy as np
 import torch
@@ -215,7 +216,7 @@ def __init__(
             params = np.empty(0, dtype=np.float32)
             camtype = "perspective"
         if type_ == 2 or type_ == "SIMPLE_RADIAL":
-            params = np.array([cam.k1], dtype=np.float32)
+            params = np.array([cam.k1, 0.0, 0.0, 0.0], dtype=np.float32)
             camtype = "perspective"
         elif type_ == 3 or type_ == "RADIAL":
             params = np.array([cam.k1, cam.k2, 0.0, 0.0], dtype=np.float32)

From b4ba1cbeab52589cac6949872d7f20714d02257c Mon Sep 17 00:00:00 2001
From: Jonathan Swartz
Date: Wed, 8 Jan 2025 16:16:26 +1300
Subject: [PATCH 42/59] Garfield download_example_data now also downloads colmap data. dozer_nerfgun_waldo changed to the default dataset

Signed-off-by: Jonathan Swartz
---
 .../garfield/download_example_data.py         | 30 +++++++++++++++----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/fvdb/projects/panoptic_segmentation/garfield/download_example_data.py b/fvdb/projects/panoptic_segmentation/garfield/download_example_data.py
index b77bf15068..ae2af80bb1 100755
--- a/fvdb/projects/panoptic_segmentation/garfield/download_example_data.py
+++ b/fvdb/projects/panoptic_segmentation/garfield/download_example_data.py
@@ -15,7 +15,7 @@
 
 @dataclass
 class DownloadData:
-    dataset: str = "figurines"
+    dataset: str = "dozer_nerfgun_waldo"
     save_dir: Path = Path(os.getcwd() + "/data")
 
     def main(self):
@@ -23,12 +23,34 @@ def main(self):
         dataset_savedir: Path = self.save_dir / self.dataset
 
         if not dataset_savedir.exists():
-            downloaded_filename = Path(self.dataset_download(self.dataset))
+            # Download and unzip the dataset
+            print(f"Downloading the {self.dataset} LERF dataset...")
+            downloaded_filename = Path(self.dataset_download())
             if not downloaded_filename.exists():
                 raise FileNotFoundError(f"Downloaded file {downloaded_filename} does not exist, something went wrong")
             self.unzip_download(downloaded_filename)
+            print(f"{self.dataset} dataset downloaded and unzipped successfully.")
 
-    def dataset_download(self, dataset) -> str:
+        # Download the colmap data for 3dgs
+        print(f"Downloading the {self.dataset} colmap data...")
+        colmap_data = Path(self.colmap_data_download())
+        self.unzip_download(colmap_data)
+        print(f"{self.dataset} COLMAP data downloaded and unzipped successfully.")
+
+    def colmap_data_download(self)->Path:
+        colmap_datasets_url = "https://drive.google.com/drive/folders/1EQiZTPWk25yfRFA7UOEwd6hWlosIcOvB"
+        files = gdown.download_folder(url=colmap_datasets_url, skip_download=True, quiet=True)
+
+        download_target_file = self.save_dir / (self.dataset + ".zip")
+        for file in files:
+            if file.path == self.dataset + ".zip":
+                downloaded_filename = gdown.download(id=file.id, output=str(download_target_file))
+                break
+        if not download_target_file.exists():
+            raise FileNotFoundError(f"COLMAP data {self.dataset} not found in the Google Drive folder {colmap_datasets_url}")
+        return downloaded_filename
+
+    def dataset_download(self) -> str:
         lerf_datasets_url = "https://drive.google.com/drive/folders/119bheSoSrgekkgQdSGa6E1jkNKC84HWL"
         files = gdown.download_folder(url=lerf_datasets_url, skip_download=True, quiet=True)
         download_target_file = 
self.save_dir / (self.dataset + ".zip")
@@ -44,8 +66,6 @@ def unzip_download(self, downloaded_filename: Path):
         with zipfile.ZipFile(downloaded_filename, "r") as zip_ref:
             zip_ref.extractall(downloaded_filename.parent)
         downloaded_filename.unlink()
-        print(f"{self.dataset} downloaded and unzipped successfully.")
-
 
 if __name__ == "__main__":
     tyro.cli(DownloadData).main()

From 527bf5eae8a6686dc2259cf920f1f1092cde6aca Mon Sep 17 00:00:00 2001
From: Jonathan Swartz
Date: Wed, 8 Jan 2025 16:36:31 +1300
Subject: [PATCH 43/59] Updated README

Signed-off-by: Jonathan Swartz
---
 .../panoptic_segmentation/garfield/README.md  | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/fvdb/projects/panoptic_segmentation/garfield/README.md b/fvdb/projects/panoptic_segmentation/garfield/README.md
index e2805a6c6e..7663ab8cdd 100644
--- a/fvdb/projects/panoptic_segmentation/garfield/README.md
+++ b/fvdb/projects/panoptic_segmentation/garfield/README.md
@@ -4,7 +4,7 @@ This code is based on the official implementation for [GARField](https://github.
 
 ## Installation
 
-1. Create the `fvdb_garfield` environment with conda. This will install or build all the necessary dependencies.
+1. Create the `fvdb_garfield` environment with conda. This will install or build all the necessary dependencies. This may take a while because some dependencies must be built from source against CUDA 12.1.
    ```bash
    conda env create -f ./garfield_environment.yml
    ```
@@ -17,17 +17,22 @@
 
 ## Running GARField
 
-1. Download example colmap data
+1. Download example image, camera info and COLMAP data
    ```bash
    ./download_example_data.py
    ```
 
-2. Run GARField on the example data
+2. Run the original GARField implementation on the example data
    ```bash
-   ns-train garfield --data ./data/figurines
+   ns-train garfield --data ./data/dozer_nerfgun_waldo
    ```
 
 3. (Optional) Run GARField with Gaussian Splatting geometry
    ```bash
-   ns-train garfield-gauss --data ./data/figurines --pipeline.garfield-ckpt outputs/figurines/garfield/[datetimestamp]/config.yml
+   ns-train garfield-gauss --data ./data/dozer_nerfgun_waldo --pipeline.garfield-ckpt outputs/dozer_nerfgun_waldo/garfield/[datetimestamp]/config.yml
    ```
 
+4. 
Run fVDB 3D Gaussian Splatting on the example data
+   ```bash
+   python [fVDB_root]/examples/3dgs/train_colmap.py --data ./data/dozer_nerfgun_waldo
+   ```

From 7fdb6283ff8ab65cb60e7d2805d873de005c140a Mon Sep 17 00:00:00 2001
From: Jonathan Swartz <2375296+swahtz@users.noreply.github.com>
Date: Fri, 10 Jan 2025 15:09:48 +1300
Subject: [PATCH 44/59] black formatting fix for garfield download_example_data (#119)

* black formatting fix for garfield download_example_data

* Removing cv2 import that is only valid for pip opencv package

---------

Signed-off-by: Jonathan Swartz
---
 fvdb/fvdb/utils/data/colmap_dataset.py        | 1 -
 .../garfield/download_example_data.py         | 7 +++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fvdb/fvdb/utils/data/colmap_dataset.py b/fvdb/fvdb/utils/data/colmap_dataset.py
index 097e371222..3a083fdf81 100644
--- a/fvdb/fvdb/utils/data/colmap_dataset.py
+++ b/fvdb/fvdb/utils/data/colmap_dataset.py
@@ -5,7 +5,6 @@
 from typing import Any, Dict, List, Optional
 
 import cv2
-import cv2.data
 import imageio.v2 as imageio
 import numpy as np
 import torch
diff --git a/fvdb/projects/panoptic_segmentation/garfield/download_example_data.py b/fvdb/projects/panoptic_segmentation/garfield/download_example_data.py
index ae2af80bb1..1c5e75561a 100755
--- a/fvdb/projects/panoptic_segmentation/garfield/download_example_data.py
+++ b/fvdb/projects/panoptic_segmentation/garfield/download_example_data.py
@@ -37,7 +37,7 @@ def main(self):
         self.unzip_download(colmap_data)
         print(f"{self.dataset} COLMAP data downloaded and unzipped successfully.")
 
-    def colmap_data_download(self)->Path:
+    def colmap_data_download(self) -> Path:
         colmap_datasets_url = "https://drive.google.com/drive/folders/1EQiZTPWk25yfRFA7UOEwd6hWlosIcOvB"
         files = gdown.download_folder(url=colmap_datasets_url, skip_download=True, quiet=True)
 
@@ -47,7 +47,9 @@ def colmap_data_download(self):
             if file.path == self.dataset + ".zip":
                 downloaded_filename = gdown.download(id=file.id, output=str(download_target_file))
                 break
         if not download_target_file.exists():
-            raise FileNotFoundError(f"COLMAP data {self.dataset} not found in the Google Drive folder {colmap_datasets_url}")
+            raise FileNotFoundError(
+                f"COLMAP data {self.dataset} not found in the Google Drive folder {colmap_datasets_url}"
+            )
         return downloaded_filename
 
@@ -67,5 +69,6 @@ def unzip_download(self, downloaded_filename: Path):
         zip_ref.extractall(downloaded_filename.parent)
         downloaded_filename.unlink()
 
+
 if __name__ == "__main__":
     tyro.cli(DownloadData).main()

From 36f3e0921ba39d0a667a6a52de3ef8ba1b52e1ea Mon Sep 17 00:00:00 2001
From: Jonathan Swartz <2375296+swahtz@users.noreply.github.com>
Date: Fri, 10 Jan 2025 17:15:25 +1300
Subject: [PATCH 45/59] CI: Update feature/fvdb from develop (#118)

Updating feature/fvdb branch from current develop branch to pick up the
'enforce-all-checks' action as well as the disabled Windows tests setup.
--------- Signed-off-by: Ken Museth Signed-off-by: Jonathan Swartz Co-authored-by: Ken Museth <1495380+kmuseth@users.noreply.github.com> --- .github/workflows/build.yml | 82 +- .github/workflows/nanovdb.yml | 88 +- .github/workflows/status_checks.yml | 16 + .github/workflows/weekly.yml | 850 +++++++++--------- .../nanovdb/tools/cuda/SignedFloodFill.cuh | 22 +- nanovdb/nanovdb/unittest/TestNanoVDB.cu | 8 +- nanovdb/nanovdb/util/cuda/Util.h | 3 +- 7 files changed, 545 insertions(+), 524 deletions(-) create mode 100644 .github/workflows/status_checks.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5fdeafb99e..fe91055487 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -109,47 +109,47 @@ jobs: if: matrix.config.build == 'Release' run: ccache --evict-older-than 1d - windows: - # Windows CI. Tests a dynamic build with MD. - if: | - github.event_name != 'workflow_dispatch' || - github.event.inputs.type == 'all' || - github.event.inputs.type == 'win' - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }} - name: windows - env: - VCPKG_DEFAULT_TRIPLET: x64-windows - strategy: - fail-fast: false - steps: - - uses: actions/checkout@v3 - - name: path - shell: pwsh - run: | - # note: system path must be modified in a previous step to it's use - echo "$Env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "${{github.workspace}}\build\openvdb\openvdb\Release" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: install - shell: powershell - run: .\ci\install_windows.ps1 - - name: build - run: > - ./ci/build.sh -v - --config='Release' - --components='core,bin,view,render,python,test' - --cargs=\' - -A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF - -DMSVC_COMPRESS_PDB=ON - -DUSE_EXR=ON - -DUSE_PNG=ON - -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET} - -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\" - \' - - name: size - # Print the build directy size (monitor if we're hitting runner limits) - run: du -h build - - name: test - run: cd build && ctest -V -C Release + # windows: + # # Windows CI. Tests a dynamic build with MD. 
+ # if: | + # github.event_name != 'workflow_dispatch' || + # github.event.inputs.type == 'all' || + # github.event.inputs.type == 'win' + # runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }} + # name: windows + # env: + # VCPKG_DEFAULT_TRIPLET: x64-windows + # strategy: + # fail-fast: false + # steps: + # - uses: actions/checkout@v3 + # - name: path + # shell: pwsh + # run: | + # # note: system path must be modified in a previous step to it's use + # echo "$Env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + # echo "${{github.workspace}}\build\openvdb\openvdb\Release" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + # - name: install + # shell: powershell + # run: .\ci\install_windows.ps1 + # - name: build + # run: > + # ./ci/build.sh -v + # --config='Release' + # --components='core,bin,view,render,python,test' + # --cargs=\' + # -A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF + # -DMSVC_COMPRESS_PDB=ON + # -DUSE_EXR=ON + # -DUSE_PNG=ON + # -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET} + # -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\" + # \' + # - name: size + # # Print the build directy size (monitor if we're hitting runner limits) + # run: du -h build + # - name: test + # run: cd build && ctest -V -C Release macos: if: | diff --git a/.github/workflows/nanovdb.yml b/.github/workflows/nanovdb.yml index f3581cbc7d..c3ccb3829d 100644 --- a/.github/workflows/nanovdb.yml +++ b/.github/workflows/nanovdb.yml @@ -77,50 +77,50 @@ jobs: - name: test run: cd build && sudo ctest -V -E ".*cuda.*" - windows-nanovdb: - if: | - github.event_name != 'workflow_dispatch' || - github.event.inputs.type == 'all' || - github.event.inputs.type == 'win' - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }} - env: - VCPKG_DEFAULT_TRIPLET: 'x64-windows' - visual_studio: "Visual Studio 17 2022" - cuda: "12.4.0" - strategy: - fail-fast: false - steps: - - uses: actions/checkout@v3 - - name: path - run: | - # note: system path must be modified in a previous step to it's use - echo "$Env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "${{github.workspace}}\build\openvdb\openvdb\Release" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: install_cuda - shell: powershell - run: .\ci\install_windows_cuda.ps1 - - name: install - shell: powershell - run: .\ci\install_windows.ps1 - - name: build - shell: bash - run: > - ./ci/build.sh -v - --config=Release - --components=core,nano,nanotest,nanoexam,nanobench,nanotool - --cargs=\' - -A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF - -DMSVC_COMPRESS_PDB=ON - -DUSE_EXPLICIT_INSTANTIATION=OFF - -DNANOVDB_USE_CUDA=ON - -DCMAKE_CUDA_ARCHITECTURES="80" - -DNANOVDB_USE_OPENVDB=ON - -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET} - -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\" - \' - - name: test - shell: bash - run: cd build && ctest -V -E ".*cuda.*" + # windows-nanovdb: + # if: | + # github.event_name != 'workflow_dispatch' || + # github.event.inputs.type == 'all' || + # github.event.inputs.type == 'win' + # runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }} + # env: + # 
VCPKG_DEFAULT_TRIPLET: 'x64-windows' + # visual_studio: "Visual Studio 17 2022" + # cuda: "12.4.0" + # strategy: + # fail-fast: false + # steps: + # - uses: actions/checkout@v3 + # - name: path + # run: | + # # note: system path must be modified in a previous step to it's use + # echo "$Env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + # echo "${{github.workspace}}\build\openvdb\openvdb\Release" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + # - name: install_cuda + # shell: powershell + # run: .\ci\install_windows_cuda.ps1 + # - name: install + # shell: powershell + # run: .\ci\install_windows.ps1 + # - name: build + # shell: bash + # run: > + # ./ci/build.sh -v + # --config=Release + # --components=core,nano,nanotest,nanoexam,nanobench,nanotool + # --cargs=\' + # -A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF + # -DMSVC_COMPRESS_PDB=ON + # -DUSE_EXPLICIT_INSTANTIATION=OFF + # -DNANOVDB_USE_CUDA=ON + # -DCMAKE_CUDA_ARCHITECTURES="80" + # -DNANOVDB_USE_OPENVDB=ON + # -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET} + # -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\" + # \' + # - name: test + # shell: bash + # run: cd build && ctest -V -E ".*cuda.*" macos-nanovdb: if: | diff --git a/.github/workflows/status_checks.yml b/.github/workflows/status_checks.yml new file mode 100644 index 0000000000..c6d3560d00 --- /dev/null +++ b/.github/workflows/status_checks.yml @@ -0,0 +1,16 @@ +name: Wait for Status Checks +on: + pull_request: + push: + branches: + - "pull-request/[0-9]+" +jobs: + enforce-all-checks: + runs-on: ubuntu-latest + permissions: + checks: read + steps: + - name: GitHub Checks + uses: poseidon/wait-for-status-checks@v0.6.0 + with: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index d5ea701aef..2275e843d5 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -30,110 +30,110 @@ jobs: ################################## Houdini ################################## ############################################################################# - # Check that valid github secrets have been set for the ability to - # download Houdini and cache it. The secrets are used in download_houdini.py - checksecret: - name: Verify Houdini Secrets - runs-on: ubuntu-latest - outputs: - HOUDINI_SECRETS: ${{ steps.check.outputs.HOUDINI_SECRETS }} - steps: - - id: check - env: - HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }} - HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }} - run: echo "HOUDINI_SECRETS=${{ env.HOUDINI_CLIENT_ID != '' && env.HOUDINI_SECRET_KEY != '' }}" >> $GITHUB_OUTPUT - - name: Skip Next Jobs - if: steps.check.outputs.HOUDINI_SECRETS != 'true' - run: echo "HOUDINI_CLIENT_ID and HOUDINI_SECRET_KEY GitHub Action Secrets needs to be set to install Houdini builds" - # Explicitly error on the ASWF repo, we expect this secret to always exist - - name: Error ASWF - if: steps.check.outputs.HOUDINI_SECRETS != 'true' && github.repository_owner == 'AcademySoftwareFoundation' - run: exit 1 + # # Check that valid github secrets have been set for the ability to + # # download Houdini and cache it. 
The secrets are used in download_houdini.py + # checksecret: + # name: Verify Houdini Secrets + # runs-on: ubuntu-latest + # outputs: + # HOUDINI_SECRETS: ${{ steps.check.outputs.HOUDINI_SECRETS }} + # steps: + # - id: check + # env: + # HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }} + # HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }} + # run: echo "HOUDINI_SECRETS=${{ env.HOUDINI_CLIENT_ID != '' && env.HOUDINI_SECRET_KEY != '' }}" >> $GITHUB_OUTPUT + # - name: Skip Next Jobs + # if: steps.check.outputs.HOUDINI_SECRETS != 'true' + # run: echo "HOUDINI_CLIENT_ID and HOUDINI_SECRET_KEY GitHub Action Secrets needs to be set to install Houdini builds" + # # Explicitly error on the ASWF repo, we expect this secret to always exist + # - name: Error ASWF + # if: steps.check.outputs.HOUDINI_SECRETS != 'true' && github.repository_owner == 'AcademySoftwareFoundation' + # run: exit 1 - # download the latest production version of Houdini X, strip out headers, - # libraries and binaries required for building OpenVDB and put it into - # the GitHub Actions cache - linux_houdini: - needs: [checksecret] - if: | - (needs.checksecret.outputs.HOUDINI_SECRETS == 'true') && - (github.event_name != 'workflow_dispatch' || - github.event.inputs.type == 'all' || - github.event.inputs.type == 'houdini') - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} - name: linux-houdini:${{ matrix.config.hou_hash }} - env: - CXX: clang++ - HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }} - HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }} - strategy: - matrix: - config: - - { houdini_version: '20.0', platform: 'linux_x86_64_gcc11.2', hou_hash: '20_0-newabi' } - - { houdini_version: '20.5', platform: 'linux_x86_64_gcc11.2', hou_hash: '20_5' } - fail-fast: false - container: - image: aswf/ci-base:2024 - steps: - - uses: actions/checkout@v3 - # We bumped from the 2021 CI image to 2023 here to fix some OpenSSL issues - # with the Houdini download script. In so doing we broke some of the caching - # between this job and the jobs in houdini.yml which _don't_ use the 2023 - # image yet. The issue is that the cache action will use zstd if it's - # available to zip the cache and this causes it to be inserted with a unique - # hash which images without zstd (i.e. the 2021/2022 images don't have - # access to). For now, uninstall zstd here instead of installing it - # everywhere and ask the LF to add zstd to the older base images. 
- - name: remove zstd - run: yum -y remove zstd - - name: timestamp - id: timestamp - run: echo "timestamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_OUTPUT - - name: download_houdini - run: ./ci/download_houdini.sh ${{ matrix.config.houdini_version }} ${{ matrix.config.platform }} --prod - - name: install_houdini - run: | - mkdir $HOME/houdini_install - cp hou/hou.tar.gz $HOME/houdini_install/hou.tar.gz - cd $HOME/houdini_install && tar -xzf hou.tar.gz && cd - - - name: write_houdini_cache - uses: actions/cache/save@v3 - with: - path: hou - key: vdb-v5-houdini${{ matrix.config.hou_hash }}-${{ steps.timestamp.outputs.timestamp }} + # # download the latest production version of Houdini X, strip out headers, + # # libraries and binaries required for building OpenVDB and put it into + # # the GitHub Actions cache + # linux_houdini: + # needs: [checksecret] + # if: | + # (needs.checksecret.outputs.HOUDINI_SECRETS == 'true') && + # (github.event_name != 'workflow_dispatch' || + # github.event.inputs.type == 'all' || + # github.event.inputs.type == 'houdini') + # runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + # name: linux-houdini:${{ matrix.config.hou_hash }} + # env: + # CXX: clang++ + # HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }} + # HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }} + # strategy: + # matrix: + # config: + # - { houdini_version: '20.0', platform: 'linux_x86_64_gcc11.2', hou_hash: '20_0-newabi' } + # - { houdini_version: '20.5', platform: 'linux_x86_64_gcc11.2', hou_hash: '20_5' } + # fail-fast: false + # container: + # image: aswf/ci-base:2024 + # steps: + # - uses: actions/checkout@v3 + # # We bumped from the 2021 CI image to 2023 here to fix some OpenSSL issues + # # with the Houdini download script. In so doing we broke some of the caching + # # between this job and the jobs in houdini.yml which _don't_ use the 2023 + # # image yet. The issue is that the cache action will use zstd if it's + # # available to zip the cache and this causes it to be inserted with a unique + # # hash which images without zstd (i.e. the 2021/2022 images don't have + # # access to). For now, uninstall zstd here instead of installing it + # # everywhere and ask the LF to add zstd to the older base images. + # - name: remove zstd + # run: yum -y remove zstd + # - name: timestamp + # id: timestamp + # run: echo "timestamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_OUTPUT + # - name: download_houdini + # run: ./ci/download_houdini.sh ${{ matrix.config.houdini_version }} ${{ matrix.config.platform }} --prod + # - name: install_houdini + # run: | + # mkdir $HOME/houdini_install + # cp hou/hou.tar.gz $HOME/houdini_install/hou.tar.gz + # cd $HOME/houdini_install && tar -xzf hou.tar.gz && cd - + # - name: write_houdini_cache + # uses: actions/cache/save@v3 + # with: + # path: hou + # key: vdb-v5-houdini${{ matrix.config.hou_hash }}-${{ steps.timestamp.outputs.timestamp }} - macos_houdini: - needs: [checksecret] - if: | - (needs.checksecret.outputs.HOUDINI_SECRETS == 'true') && - (github.event_name != 'workflow_dispatch' || - github.event.inputs.type == 'all' || - github.event.inputs.type == 'houdini') - # Note that macos-14 (current macos-latest) switches to M1. We could instead test - # the arm build here instead of the x86 one. 
- runs-on: macos-latest - name: macos-houdini-20 - env: - HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }} - HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }} - steps: - - uses: actions/checkout@v3 - - name: timestamp - id: timestamp - run: echo "timestamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_OUTPUT - - name: download_houdini - run: ./ci/download_houdini.sh 20.0 macosx_arm64_clang14.0_13 --prod - - name: install_houdini - run: | - mkdir $HOME/houdini_install - cp hou/hou.tar.gz $HOME/houdini_install/hou.tar.gz - cd $HOME/houdini_install && tar -xzf hou.tar.gz && cd - - - name: write_houdini_cache - uses: actions/cache/save@v3 - with: - path: hou - key: vdb-v5-houdini-macos-${{ steps.timestamp.outputs.timestamp }} + # macos_houdini: + # needs: [checksecret] + # if: | + # (needs.checksecret.outputs.HOUDINI_SECRETS == 'true') && + # (github.event_name != 'workflow_dispatch' || + # github.event.inputs.type == 'all' || + # github.event.inputs.type == 'houdini') + # # Note that macos-14 (current macos-latest) switches to M1. We could instead test + # # the arm build here instead of the x86 one. + # runs-on: macos-latest + # name: macos-houdini-20 + # env: + # HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }} + # HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }} + # steps: + # - uses: actions/checkout@v3 + # - name: timestamp + # id: timestamp + # run: echo "timestamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_OUTPUT + # - name: download_houdini + # run: ./ci/download_houdini.sh 20.0 macosx_arm64_clang14.0_13 --prod + # - name: install_houdini + # run: | + # mkdir $HOME/houdini_install + # cp hou/hou.tar.gz $HOME/houdini_install/hou.tar.gz + # cd $HOME/houdini_install && tar -xzf hou.tar.gz && cd - + # - name: write_houdini_cache + # uses: actions/cache/save@v3 + # with: + # path: hou + # key: vdb-v5-houdini-macos-${{ steps.timestamp.outputs.timestamp }} ############################################################################# ########################### Core Library Extras ############################# @@ -141,84 +141,84 @@ jobs: # Extra configuration tests for the OpenVDB Core library. These test a # variety of options with newer compilers. 
- linux-extra: - if: | - github.event_name != 'workflow_dispatch' || - github.event.inputs.type == 'all' || - github.event.inputs.type == 'extra' - runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} - name: linux-extra:${{ matrix.config.name }} - container: - image: aswf/ci-openvdb:2024 - env: - CXX: clang++ - strategy: - matrix: - config: - - { name: 'all', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=ON -DUSE_ZLIB=ON -DUSE_EXR=ON -DUSE_PNG=ON' } - - { name: 'lite', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=OFF -DUSE_ZLIB=OFF -DUSE_EXR=OFF -DUSE_PNG=OFF -DOPENVDB_USE_DELAYED_LOADING=OFF' } - - { name: 'half', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=OFF -DUSE_IMATH_HALF=ON' } - - { name: 'sse', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=SSE42' } - - { name: 'avx', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=AVX' } - - { name: 'numpy', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_NUMPY=ON -DOPENVDB_PYTHON_WRAP_ALL_GRID_TYPES=ON' } - - { name: 'asan', build: 'asan', components: 'core,test', cmake: '-DNANOVDB_USE_OPENVDB=ON -DOPENVDB_AX_STATIC=OFF -DOPENVDB_CORE_STATIC=OFF -DUSE_BLOSC=OFF' } # We never called blosc_destroy(), so disable blosc to silence these errors - - { name: 'ubsan', build: 'ubsan', components: 'core,test', cmake: '-DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations" ' } - - { name: 'c++20', build: 'Release', components: 'core,test', cmake: '-DCMAKE_CXX_STANDARD=20' } - - { name: 'conf', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON' } - fail-fast: false - steps: - - uses: actions/checkout@v3 - - name: nanobind - #if: contains(container.image, '2023') == false - run: ./ci/install_nanobind.sh 2.0.0 - - name: build - run: > - ./ci/build.sh -v - --build-type=${{ matrix.config.build }} - --components="${{ matrix.config.components }}" - --cargs=\"-DOPENVDB_CXX_STRICT=ON ${{ matrix.config.cmake }}\" - - name: test - run: cd build && ctest -V + # linux-extra: + # if: | + # github.event_name != 'workflow_dispatch' || + # github.event.inputs.type == 'all' || + # github.event.inputs.type == 'extra' + # runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + # name: linux-extra:${{ matrix.config.name }} + # container: + # image: aswf/ci-openvdb:2024 + # env: + # CXX: clang++ + # strategy: + # matrix: + # config: + # - { name: 'all', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=ON -DUSE_ZLIB=ON -DUSE_EXR=ON -DUSE_PNG=ON' } + # - { name: 'lite', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=OFF -DUSE_ZLIB=OFF -DUSE_EXR=OFF -DUSE_PNG=OFF -DOPENVDB_USE_DELAYED_LOADING=OFF' } + # - { name: 'half', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=OFF -DUSE_IMATH_HALF=ON' } + # - { name: 'sse', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=SSE42' } + # - { name: 'avx', build: 'Release', components: 'core,python,bin,view,render,test', 
cmake: '-DOPENVDB_SIMD=AVX' } + # - { name: 'numpy', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_NUMPY=ON -DOPENVDB_PYTHON_WRAP_ALL_GRID_TYPES=ON' } + # - { name: 'asan', build: 'asan', components: 'core,test', cmake: '-DNANOVDB_USE_OPENVDB=ON -DOPENVDB_AX_STATIC=OFF -DOPENVDB_CORE_STATIC=OFF -DUSE_BLOSC=OFF' } # We never called blosc_destroy(), so disable blosc to silence these errors + # - { name: 'ubsan', build: 'ubsan', components: 'core,test', cmake: '-DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations" ' } + # - { name: 'c++20', build: 'Release', components: 'core,test', cmake: '-DCMAKE_CXX_STANDARD=20' } + # - { name: 'conf', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON' } + # fail-fast: false + # steps: + # - uses: actions/checkout@v3 + # - name: nanobind + # #if: contains(container.image, '2023') == false + # run: ./ci/install_nanobind.sh 2.0.0 + # - name: build + # run: > + # ./ci/build.sh -v + # --build-type=${{ matrix.config.build }} + # --components="${{ matrix.config.components }}" + # --cargs=\"-DOPENVDB_CXX_STRICT=ON ${{ matrix.config.cmake }}\" + # - name: test + # run: cd build && ctest -V - # Test latest dependencies, latest compilers and options - latest: - if: | - github.event_name != 'workflow_dispatch' || - github.event.inputs.type == 'all' || - github.event.inputs.type == 'latest' - runs-on: ${{ matrix.config.runson }} - env: - CXX: ${{ matrix.config.cxx }} - strategy: - matrix: - config: - - { runson: ubuntu-latest, cxx: g++, cmake: '' } - # Disable the clang job for now. See https://github.com/actions/runner-images/issues/8659 - # - { runson: ubuntu-latest, cxx: clang++, cmake: '' } - # @todo gcc on macos - - { runson: macos-latest, cxx: '', cmake: '-DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm@15/bin/clang++ -DLLVM_DIR=/opt/homebrew/opt/llvm@15/lib/cmake/llvm' } - fail-fast: false - steps: - - uses: actions/checkout@v3 - - name: install_deps - run: | - if [ "$RUNNER_OS" == "Linux" ]; then - sudo apt-get -q install -y libboost-dev libboost-iostreams-dev libtbb-dev libblosc-dev llvm-dev libgtest-dev libcppunit-dev - ./ci/install_nanobind.sh 2.0.0 - elif [ "$RUNNER_OS" == "macOS" ]; then - ./ci/install_macos.sh 15 - ./ci/install_tbb_macos.sh - else - echo "$RUNNER_OS not supported"; exit 1 - fi - - name: build - run: > - ./ci/build.sh -v - --build-type=Release - --components=\"core,axcore,python,bin,render,test,axbin\" - --cargs=\"-DCMAKE_CXX_STANDARD=20 -DOPENVDB_USE_DELAYED_LOADING=OFF -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install ${{ matrix.config.cmake }}\" - - name: test - run: cd build && ctest -V + # # Test latest dependencies, latest compilers and options + # latest: + # if: | + # github.event_name != 'workflow_dispatch' || + # github.event.inputs.type == 'all' || + # github.event.inputs.type == 'latest' + # runs-on: ${{ matrix.config.runson }} + # env: + # CXX: ${{ matrix.config.cxx }} + # strategy: + # matrix: + # config: + # - { runson: ubuntu-latest, cxx: g++, cmake: '' } + # # Disable the clang job for now. 
See https://github.com/actions/runner-images/issues/8659 + # # - { runson: ubuntu-latest, cxx: clang++, cmake: '' } + # # @todo gcc on macos + # - { runson: macos-latest, cxx: '', cmake: '-DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm@15/bin/clang++ -DLLVM_DIR=/opt/homebrew/opt/llvm@15/lib/cmake/llvm' } + # fail-fast: false + # steps: + # - uses: actions/checkout@v3 + # - name: install_deps + # run: | + # if [ "$RUNNER_OS" == "Linux" ]; then + # sudo apt-get -q install -y libboost-dev libboost-iostreams-dev libtbb-dev libblosc-dev llvm-dev libgtest-dev libcppunit-dev + # ./ci/install_nanobind.sh 2.0.0 + # elif [ "$RUNNER_OS" == "macOS" ]; then + # ./ci/install_macos.sh 15 + # ./ci/install_tbb_macos.sh + # else + # echo "$RUNNER_OS not supported"; exit 1 + # fi + # - name: build + # run: > + # ./ci/build.sh -v + # --build-type=Release + # --components=\"core,axcore,python,bin,render,test,axbin\" + # --cargs=\"-DCMAKE_CXX_STANDARD=20 -DOPENVDB_USE_DELAYED_LOADING=OFF -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install ${{ matrix.config.cmake }}\" + # - name: test + # run: cd build && ctest -V windows: # Windows CI. Tests static and dynamic builds with MT and MD respectively. @@ -279,152 +279,152 @@ jobs: ############################ AX Library Extras ############################## ############################################################################# - linux-ax: - if: | - github.event_name != 'workflow_dispatch' || - github.event.inputs.type == 'all' || - github.event.inputs.type == 'ax' - runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} - name: > - linux-ax:${{ matrix.config.image }}-cxx:${{ matrix.config.cxx }}-${{ matrix.config.build }} - container: - image: aswf/ci-openvdb:${{ matrix.config.image }} - env: - CXX: ${{ matrix.config.cxx }} - strategy: - matrix: - config: - # Unified - - { image: '2023-clang15', cxx: 'clang++', build: 'Release', components: 'core,bin,axcore,axbin,axtest', cmake: '' } - - { image: '2023-clang15', cxx: 'g++', build: 'Release', components: 'core,bin,axcore,axbin,axtest', cmake: '' } - fail-fast: false - steps: - - uses: actions/checkout@v3 - - name: nanobind - #f: contains(matrix.config.image, '2023') == false - run: ./ci/install_nanobind.sh 2.0.0 - - name: build - run: > - ./ci/build.sh -v - --build-type=${{ matrix.config.build }} - --components=${{ matrix.config.components }} - --cargs=\" - ${{ matrix.config.cmake }} - -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON - -DUSE_EXPLICIT_INSTANTIATION=OFF - -DOPENVDB_CXX_STRICT=ON - \" - - name: clean - if: matrix.config.components == 'core' - run: rm -rf build - - name: build - if: matrix.config.components == 'core' - run: > - ./ci/build.sh -v - --build-type=${{ matrix.config.build }} - --components="bin,axcore,axbin,axtest,python" - --cargs=\" - ${{ matrix.config.cmake }} - -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON - -DUSE_EXPLICIT_INSTANTIATION=OFF - -DOPENVDB_CXX_STRICT=ON - \" - - name: test - run: cd build && ctest -V - - name: test_doxygen_examples - run: ./ci/extract_test_examples.sh + # linux-ax: + # if: | + # github.event_name != 'workflow_dispatch' || + # github.event.inputs.type == 'all' || + # github.event.inputs.type == 'ax' + # runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + # name: > + # linux-ax:${{ 
matrix.config.image }}-cxx:${{ matrix.config.cxx }}-${{ matrix.config.build }} + # container: + # image: aswf/ci-openvdb:${{ matrix.config.image }} + # env: + # CXX: ${{ matrix.config.cxx }} + # strategy: + # matrix: + # config: + # # Unified + # - { image: '2023-clang15', cxx: 'clang++', build: 'Release', components: 'core,bin,axcore,axbin,axtest', cmake: '' } + # - { image: '2023-clang15', cxx: 'g++', build: 'Release', components: 'core,bin,axcore,axbin,axtest', cmake: '' } + # fail-fast: false + # steps: + # - uses: actions/checkout@v3 + # - name: nanobind + # #f: contains(matrix.config.image, '2023') == false + # run: ./ci/install_nanobind.sh 2.0.0 + # - name: build + # run: > + # ./ci/build.sh -v + # --build-type=${{ matrix.config.build }} + # --components=${{ matrix.config.components }} + # --cargs=\" + # ${{ matrix.config.cmake }} + # -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON + # -DUSE_EXPLICIT_INSTANTIATION=OFF + # -DOPENVDB_CXX_STRICT=ON + # \" + # - name: clean + # if: matrix.config.components == 'core' + # run: rm -rf build + # - name: build + # if: matrix.config.components == 'core' + # run: > + # ./ci/build.sh -v + # --build-type=${{ matrix.config.build }} + # --components="bin,axcore,axbin,axtest,python" + # --cargs=\" + # ${{ matrix.config.cmake }} + # -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON + # -DUSE_EXPLICIT_INSTANTIATION=OFF + # -DOPENVDB_CXX_STRICT=ON + # \" + # - name: test + # run: cd build && ctest -V + # - name: test_doxygen_examples + # run: ./ci/extract_test_examples.sh - macos-ax: - if: | - github.event_name != 'workflow_dispatch' || - github.event.inputs.type == 'all' || - github.event.inputs.type == 'ax' - runs-on: macos-13 - name: macos-cxx:${{ matrix.config.cxx }}-llvm:${{ matrix.config.llvm }}-${{ matrix.config.build }} - env: - CXX: ${{ matrix.config.cxx }} - strategy: - matrix: - config: - - { cxx: 'clang++', build: 'Release', llvm: '15' } - fail-fast: false - steps: - - uses: actions/checkout@v3 - - name: install_deps - run: | - ./ci/install_macos.sh ${{ matrix.config.llvm }} - ./ci/install_tbb_macos.sh - - name: build - run: > - ./ci/build.sh -v - --build-type=${{ matrix.config.build }} - --components="core,python,bin,axcore,axbin,axtest" - --cargs=\" - -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON - -DUSE_EXPLICIT_INSTANTIATION=OFF - -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install - -DLLVM_DIR=/opt/homebrew/opt/llvm@${{ matrix.config.llvm }}/lib/cmake/llvm - \" - - name: test - run: cd build && ctest -V - - name: test_doxygen_examples - run: ./ci/extract_test_examples.sh + # macos-ax: + # if: | + # github.event_name != 'workflow_dispatch' || + # github.event.inputs.type == 'all' || + # github.event.inputs.type == 'ax' + # runs-on: macos-13 + # name: macos-cxx:${{ matrix.config.cxx }}-llvm:${{ matrix.config.llvm }}-${{ matrix.config.build }} + # env: + # CXX: ${{ matrix.config.cxx }} + # strategy: + # matrix: + # config: + # - { cxx: 'clang++', build: 'Release', llvm: '15' } + # fail-fast: false + # steps: + # - uses: actions/checkout@v3 + # - name: install_deps + # run: | + # ./ci/install_macos.sh ${{ matrix.config.llvm }} + # ./ci/install_tbb_macos.sh + # - name: build + # run: > + # ./ci/build.sh -v + # --build-type=${{ matrix.config.build }} + # --components="core,python,bin,axcore,axbin,axtest" + # --cargs=\" + # -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON + # -DUSE_EXPLICIT_INSTANTIATION=OFF + # -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install + # -DLLVM_DIR=/opt/homebrew/opt/llvm@${{ matrix.config.llvm }}/lib/cmake/llvm + # \" + # - name: test + # run: cd build && 
ctest -V + # - name: test_doxygen_examples + # run: ./ci/extract_test_examples.sh - windows-ax: - if: | - github.event_name != 'workflow_dispatch' || - github.event.inputs.type == 'all' || - github.event.inputs.type == 'ax' - runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }} - name: windows-vc:${{ matrix.config.vc }}-type:${{ matrix.config.build }} - env: - VCPKG_DEFAULT_TRIPLET: ${{ matrix.config.vc }} - # Export this with '' avoid bash treating \ as escape - VDB_INSTALL_PREFIX: '${{ github.workspace }}\\install' - strategy: - matrix: - config: - # static build of blosc from vcpkg does not build internal sources. - # USE_STATIC_DEPENDENCIES is required for IlmBase/OpenEXR defines and - # Boost as both shared and static libs are installed. - # @todo We don't currently run the axtests with shared builds of ax - # due to symbol issues using LLVM as a static lib (which is the only - # option on Windows). - - { vc: 'x64-windows', crt: 'MD', components: 'core,bin,axcore,axbin,python', build: 'Release', cmake: '-DOPENVDB_CORE_STATIC=OFF -DOPENVDB_AX_STATIC=OFF' } - - { vc: 'x64-windows-static', crt: 'MT', components: 'core,bin,axcore,axbin,axtest', build: 'Release', cmake: '-DOPENVDB_CORE_SHARED=OFF -DOPENVDB_AX_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON' } - - { vc: 'x64-windows-static', crt: 'MTd', components: 'core,bin,axcore,axbin,axtest', build: 'Debug', cmake: '-DOPENVDB_CORE_SHARED=OFF -DOPENVDB_AX_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON' } - fail-fast: false - steps: - - uses: actions/checkout@v3 - - name: llvm - run: ./ci/install_llvm_windows.sh ${{ matrix.config.crt }} - - name: install - shell: powershell - run: .\ci\install_windows.ps1 - - name: build - run: > - ./ci/build.sh -v - --config=${{ matrix.config.build }} - --components="${{ matrix.config.components }}" - --cargs=\' - -A x64 -G \"Visual Studio 17 2022\" - -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET} - -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\" - -DMSVC_COMPRESS_PDB=ON - -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON - -DUSE_EXPLICIT_INSTANTIATION=OFF - -DLLVM_DIR=\"${HOME}\\llvm_install\\lib\\cmake\\llvm\" - -DCMAKE_INSTALL_PREFIX=\"${VDB_INSTALL_PREFIX}\" - ${{ matrix.config.cmake }} - \' - - name: runtime_path - shell: pwsh - run: | - # note: system path must be modified in a previous step to it's use - echo "$Env:VCPKG_INSTALLATION_ROOT\installed\${{ matrix.config.vc }}\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "$Env:VDB_INSTALL_PREFIX\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: test - run: cd build && ctest -V -C ${{ matrix.config.build }} + # windows-ax: + # if: | + # github.event_name != 'workflow_dispatch' || + # github.event.inputs.type == 'all' || + # github.event.inputs.type == 'ax' + # runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }} + # name: windows-vc:${{ matrix.config.vc }}-type:${{ matrix.config.build }} + # env: + # VCPKG_DEFAULT_TRIPLET: ${{ matrix.config.vc }} + # # Export this with '' avoid bash treating \ as escape + # VDB_INSTALL_PREFIX: '${{ github.workspace }}\\install' + # strategy: + # matrix: + # config: + # # static build of blosc from vcpkg does not build internal sources. 
+ # # USE_STATIC_DEPENDENCIES is required for IlmBase/OpenEXR defines and + # # Boost as both shared and static libs are installed. + # # @todo We don't currently run the axtests with shared builds of ax + # # due to symbol issues using LLVM as a static lib (which is the only + # # option on Windows). + # - { vc: 'x64-windows', crt: 'MD', components: 'core,bin,axcore,axbin,python', build: 'Release', cmake: '-DOPENVDB_CORE_STATIC=OFF -DOPENVDB_AX_STATIC=OFF' } + # - { vc: 'x64-windows-static', crt: 'MT', components: 'core,bin,axcore,axbin,axtest', build: 'Release', cmake: '-DOPENVDB_CORE_SHARED=OFF -DOPENVDB_AX_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON' } + # - { vc: 'x64-windows-static', crt: 'MTd', components: 'core,bin,axcore,axbin,axtest', build: 'Debug', cmake: '-DOPENVDB_CORE_SHARED=OFF -DOPENVDB_AX_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON' } + # fail-fast: false + # steps: + # - uses: actions/checkout@v3 + # - name: llvm + # run: ./ci/install_llvm_windows.sh ${{ matrix.config.crt }} + # - name: install + # shell: powershell + # run: .\ci\install_windows.ps1 + # - name: build + # run: > + # ./ci/build.sh -v + # --config=${{ matrix.config.build }} + # --components="${{ matrix.config.components }}" + # --cargs=\' + # -A x64 -G \"Visual Studio 17 2022\" + # -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET} + # -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\" + # -DMSVC_COMPRESS_PDB=ON + # -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON + # -DUSE_EXPLICIT_INSTANTIATION=OFF + # -DLLVM_DIR=\"${HOME}\\llvm_install\\lib\\cmake\\llvm\" + # -DCMAKE_INSTALL_PREFIX=\"${VDB_INSTALL_PREFIX}\" + # ${{ matrix.config.cmake }} + # \' + # - name: runtime_path + # shell: pwsh + # run: | + # # note: system path must be modified in a previous step to it's use + # echo "$Env:VCPKG_INSTALLATION_ROOT\installed\${{ matrix.config.vc }}\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + # echo "$Env:VDB_INSTALL_PREFIX\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + # - name: test + # run: cd build && ctest -V -C ${{ matrix.config.build }} ############################################################################# ################################## Blosc #################################### @@ -488,110 +488,110 @@ jobs: ################################## Blosc #################################### ############################################################################# - linux-blosc: - if: | - github.event_name != 'workflow_dispatch' || - github.event.inputs.type == 'all' || - github.event.inputs.type == 'blosc' - runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} - name: linux-blosc:${{ matrix.blosc }} - container: - image: aswf/ci-base:2023 - strategy: - matrix: - blosc: ['1.18.0','1.19.0','1.20.0','1.21.0'] - fail-fast: false - steps: - - uses: actions/checkout@v3 - - name: install_blosc - run: sudo ./ci/install_blosc.sh ${{ matrix.blosc }} - - name: build - run: > - sudo ./ci/build.sh -v - --build-type=Release - --components=\"core,test\" - - name: test - run: cd build && sudo ctest -V + # linux-blosc: + # if: | + # github.event_name != 'workflow_dispatch' || + # github.event.inputs.type == 'all' || + # github.event.inputs.type == 'blosc' + # runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || 
(github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + # name: linux-blosc:${{ matrix.blosc }} + # container: + # image: aswf/ci-base:2023 + # strategy: + # matrix: + # blosc: ['1.18.0','1.19.0','1.20.0','1.21.0'] + # fail-fast: false + # steps: + # - uses: actions/checkout@v3 + # - name: install_blosc + # run: sudo ./ci/install_blosc.sh ${{ matrix.blosc }} + # - name: build + # run: > + # sudo ./ci/build.sh -v + # --build-type=Release + # --components=\"core,test\" + # - name: test + # run: cd build && sudo ctest -V ############################################################################# ################################## ABI ###################################### ############################################################################# - linux-abi-checker: - if: | - github.event_name == 'workflow_dispatch' && - (github.event.inputs.type == 'all' || - github.event.inputs.type == 'abi') - runs-on: ubuntu-22.04 - env: - # The 'abicheck' build type sets these, but older versions of the library - # may not have this build type. See OpenVDBCXX.cmake - CXXFLAGS: "-gdwarf-4 -g3 -ggdb -Og" - steps: - - name: Enable Node 16 - run: | - echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - fetch-tags: true - # Compute the latest major version - that is used as our baseline - # note: For CI forks, make sure you have your tags synced - - name: get_major_version - run: | - LATEST_VERSION_TAG=$(git tag --merged | sort --version-sort | tail -n1) - echo "Computed latest VDB tag: ${LATEST_VERSION_TAG}" - VDB_MAJOR_VERSION=$(echo ${LATEST_VERSION_TAG} | cut -f1 -d '.' | tr -d -c 0-9) - echo "Using major version: ${VDB_MAJOR_VERSION}" - echo "VDB_MAJOR_VERSION=${VDB_MAJOR_VERSION}" >> "$GITHUB_ENV" - - name: install_deps - run: sudo apt-get -q install -y libboost-iostreams-dev libtbb-dev libblosc-dev elfutils - - name: install_abi_checker - run: sudo apt-get -q install -y abi-dumper abi-compliance-checker - - name: build_new - run: > - ./ci/build.sh -v - --build-dir=build_new - --build-type=abicheck - --target=openvdb_shared - --components=\"core\" - --cargs=\'-DUSE_EXPLICIT_INSTANTIATION=OFF -DDISABLE_DEPENDENCY_VERSION_CHECKS=ON\' - - name: checkout_baseline - run: git checkout v${VDB_MAJOR_VERSION}.0.0 - - name: build_old - run: > - ./ci/build.sh -v - --build-dir=build_old - --build-type=abicheck - --target=openvdb_shared - --components=\"core\" - --cargs=\'-DUSE_EXPLICIT_INSTANTIATION=OFF -DDISABLE_DEPENDENCY_VERSION_CHECKS=ON\' - - name: abi_dump - run: | - abi-dumper build_new/openvdb/openvdb/libopenvdb.so -o ABI-NEW.dump -lver 1 - abi-dumper build_old/openvdb/openvdb/libopenvdb.so -o ABI-OLD.dump -lver 2 - # Replace the version namespace in the latest ABI dump with the baseline - # version we're comparing against. We should probably instead build the - # latest with the baseline version number but no CMake/defines allow us to - # do this. 
- - name: replace_symbols - run: sed -i -E 's/openvdb([^v]*)v[0-9]*_[0-9]/openvdb\1v'${VDB_MAJOR_VERSION}'_0/g' ABI-NEW.dump - - name: abi_check - # -strict treats warnings as errors - # -extended checks all member data - # we check everything _not_ in openvdb::**::internal namespace - run: > - abi-compliance-checker -l OPENVDB - -old ABI-OLD.dump - -new ABI-NEW.dump - -skip-internal-symbols "\d(openvdb.*internal)" - -skip-internal-types "(openvdb.*internal)::" - -strict - -extended - - name: upload_report - uses: actions/upload-artifact@v4 - if: always() - with: - name: abi_report - path: ./compat_reports/OPENVDB/2_to_1/compat_report.html - retention-days: 5 + # linux-abi-checker: + # if: | + # github.event_name == 'workflow_dispatch' && + # (github.event.inputs.type == 'all' || + # github.event.inputs.type == 'abi') + # runs-on: ubuntu-22.04 + # env: + # # The 'abicheck' build type sets these, but older versions of the library + # # may not have this build type. See OpenVDBCXX.cmake + # CXXFLAGS: "-gdwarf-4 -g3 -ggdb -Og" + # steps: + # - name: Enable Node 16 + # run: | + # echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV + # - uses: actions/checkout@v3 + # with: + # fetch-depth: 0 + # fetch-tags: true + # # Compute the latest major version - that is used as our baseline + # # note: For CI forks, make sure you have your tags synced + # - name: get_major_version + # run: | + # LATEST_VERSION_TAG=$(git tag --merged | sort --version-sort | tail -n1) + # echo "Computed latest VDB tag: ${LATEST_VERSION_TAG}" + # VDB_MAJOR_VERSION=$(echo ${LATEST_VERSION_TAG} | cut -f1 -d '.' | tr -d -c 0-9) + # echo "Using major version: ${VDB_MAJOR_VERSION}" + # echo "VDB_MAJOR_VERSION=${VDB_MAJOR_VERSION}" >> "$GITHUB_ENV" + # - name: install_deps + # run: sudo apt-get -q install -y libboost-iostreams-dev libtbb-dev libblosc-dev elfutils + # - name: install_abi_checker + # run: sudo apt-get -q install -y abi-dumper abi-compliance-checker + # - name: build_new + # run: > + # ./ci/build.sh -v + # --build-dir=build_new + # --build-type=abicheck + # --target=openvdb_shared + # --components=\"core\" + # --cargs=\'-DUSE_EXPLICIT_INSTANTIATION=OFF -DDISABLE_DEPENDENCY_VERSION_CHECKS=ON\' + # - name: checkout_baseline + # run: git checkout v${VDB_MAJOR_VERSION}.0.0 + # - name: build_old + # run: > + # ./ci/build.sh -v + # --build-dir=build_old + # --build-type=abicheck + # --target=openvdb_shared + # --components=\"core\" + # --cargs=\'-DUSE_EXPLICIT_INSTANTIATION=OFF -DDISABLE_DEPENDENCY_VERSION_CHECKS=ON\' + # - name: abi_dump + # run: | + # abi-dumper build_new/openvdb/openvdb/libopenvdb.so -o ABI-NEW.dump -lver 1 + # abi-dumper build_old/openvdb/openvdb/libopenvdb.so -o ABI-OLD.dump -lver 2 + # # Replace the version namespace in the latest ABI dump with the baseline + # # version we're comparing against. We should probably instead build the + # # latest with the baseline version number but no CMake/defines allow us to + # # do this. 
+ # - name: replace_symbols + # run: sed -i -E 's/openvdb([^v]*)v[0-9]*_[0-9]/openvdb\1v'${VDB_MAJOR_VERSION}'_0/g' ABI-NEW.dump + # - name: abi_check + # # -strict treats warnings as errors + # # -extended checks all member data + # # we check everything _not_ in openvdb::**::internal namespace + # run: > + # abi-compliance-checker -l OPENVDB + # -old ABI-OLD.dump + # -new ABI-NEW.dump + # -skip-internal-symbols "\d(openvdb.*internal)" + # -skip-internal-types "(openvdb.*internal)::" + # -strict + # -extended + # - name: upload_report + # uses: actions/upload-artifact@v4 + # if: always() + # with: + # name: abi_report + # path: ./compat_reports/OPENVDB/2_to_1/compat_report.html + # retention-days: 5 diff --git a/nanovdb/nanovdb/tools/cuda/SignedFloodFill.cuh b/nanovdb/nanovdb/tools/cuda/SignedFloodFill.cuh index af37422455..0a07117ea8 100644 --- a/nanovdb/nanovdb/tools/cuda/SignedFloodFill.cuh +++ b/nanovdb/nanovdb/tools/cuda/SignedFloodFill.cuh @@ -10,7 +10,7 @@ \brief Performs signed flood-fill operation on the hierarchical tree structure on the device - \todo This tools needs to handle the (extremely) rare case when root node + \todo This tools needs to handle the (extremely) rare case when the root node needs to be modified during the signed flood fill operation. This happens when the root-table needs to be expanded with tile values (of size 4096^3) that are completely inside the implicit surface. @@ -66,7 +66,7 @@ private: namespace kernels {// kernels namespace template -__global__ void processRootKernel(NanoTree *tree) +__global__ void processRoot(NanoTree *tree) { // auto &root = tree->root(); /* @@ -91,12 +91,12 @@ __global__ void processRootKernel(NanoTree *tree) } */ //root.setBackground(mOutside, /*updateChildNodes=*/false); -}// processRootKernel +}// processRoot //================================================================================================ template -__global__ void processNodeKernel(NanoTree *tree, size_t count) +__global__ void processNode(NanoTree *tree, size_t count) { using NodeT = typename NanoNode::type; const int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -115,12 +115,12 @@ __global__ void processNodeKernel(NanoTree *tree, size_t count) value = -value; } node.setValue(nValue, value); -}// processNodeKernel +}// processNode //================================================================================================ template -__global__ void processLeafKernel(NanoTree *tree, size_t count) +__global__ void processLeaf(NanoTree *tree, size_t count) { using LeafT = NanoLeaf; const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -133,7 +133,7 @@ __global__ void processLeafKernel(NanoTree *tree, size_t count) auto n = mask.template findNext(nVoxel); if (n == LeafT::SIZE && (n = mask.template findPrev(nVoxel)) == LeafT::SIZE) n = 0u; buffer[nVoxel] = buffer[n]<0 ? 
-tree->background() : tree->background(); -}// processLeafKernel +}// processLeaf //================================================================================================ @@ -166,19 +166,19 @@ void SignedFloodFill::operator()(NanoGrid *d_grid) auto *tree = reinterpret_cast*>(d_grid + 1); if (mVerbose) mTimer.start("\nProcess leaf nodes"); - kernels::processLeafKernel<<>>(tree, count[0]<<9); + kernels::processLeaf<<>>(tree, count[0]<<9); cudaCheckError(); if (mVerbose) mTimer.restart("Process lower internal nodes"); - kernels::processNodeKernel<<>>(tree, count[1]<<12); + kernels::processNode<<>>(tree, count[1]<<12); cudaCheckError(); if (mVerbose) mTimer.restart("Process upper internal nodes"); - kernels::processNodeKernel<<>>(tree, count[2]<<15); + kernels::processNode<<>>(tree, count[2]<<15); cudaCheckError(); //if (mVerbose) mTimer.restart("Process root node"); - //kernels::processRootKernel<<<1, 1, 0, mStream>>>(tree); + //kernels::processRoot<<<1, 1, 0, mStream>>>(tree); if (mVerbose) mTimer.stop(); cudaCheckError(); }// SignedFloodFill::operator() diff --git a/nanovdb/nanovdb/unittest/TestNanoVDB.cu b/nanovdb/nanovdb/unittest/TestNanoVDB.cu index 6896c6ca09..1b1a88f23d 100644 --- a/nanovdb/nanovdb/unittest/TestNanoVDB.cu +++ b/nanovdb/nanovdb/unittest/TestNanoVDB.cu @@ -164,7 +164,7 @@ __global__ void testKernel(int device) { int dev; cudaError_t err = cudaGetDevice(&dev); - if (err != cudaSuccess) printf("kernel cuda error: %d\n", (int)err); + //if (err != cudaSuccess) printf("kernel cuda error: %d\n", (int)err); if (dev != device) printf("Error: expected device ID = %i but was called with %i\n", dev, device); } @@ -178,7 +178,11 @@ TEST(TestNanoVDBCUDA, DeviceStreamMap) EXPECT_EQ(count, nanovdb::util::cuda::deviceCount()); EXPECT_EQ(current, nanovdb::util::cuda::currentDevice()); float *ptr = new float; - EXPECT_EQ(cudaInvalidDeviceId, nanovdb::util::cuda::ptrToDevice(ptr)); + const int deviceID = nanovdb::util::cuda::ptrToDevice(ptr); + EXPECT_TRUE(cudaInvalidDeviceId == deviceID || cudaCpuDeviceId == deviceID); + EXPECT_GT(0, deviceID); + //EXPECT_EQ(cudaInvalidDeviceId, deviceID); + //EXPECT_EQ(cudaCpuDeviceId, deviceID); delete ptr; cudaCheck(cudaMalloc((void**)&ptr, sizeof(float))); EXPECT_EQ(current, nanovdb::util::cuda::ptrToDevice(ptr)); diff --git a/nanovdb/nanovdb/util/cuda/Util.h b/nanovdb/nanovdb/util/cuda/Util.h index 8233ca276b..236211bf50 100644 --- a/nanovdb/nanovdb/util/cuda/Util.h +++ b/nanovdb/nanovdb/util/cuda/Util.h @@ -110,10 +110,11 @@ inline cudaError_t freeAsync(void* d_ptr, cudaStream_t stream){return cudaFreeAs #endif /// @brief Returns the device ID associated with the specified pointer +/// @note If @c ptr points to host memory (only) the return ID is either cudaInvalidDeviceId = -2 or cudaCpuDeviceId = -1 inline int ptrToDevice(void *ptr) { cudaPointerAttributes ptrAtt; - cudaPointerGetAttributes(&ptrAtt, ptr); + cudaCheck(cudaPointerGetAttributes(&ptrAtt, ptr)); return ptrAtt.device; } From 1acb6477ed0ebcc675c82cb8a4bf8c295bf4357a Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Sat, 11 Jan 2025 06:32:36 +1100 Subject: [PATCH 46/59] Make setup.py handle case where cudnn directory is created but empty (#50) Co-authored-by: Jonathan Swartz <2375296+swahtz@users.noreply.github.com> --- fvdb/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fvdb/setup.py b/fvdb/setup.py index 37d756f6d5..bd7c3db295 100644 --- a/fvdb/setup.py +++ b/fvdb/setup.py @@ -236,7 +236,7 @@ def 
download_and_install_cudnn() -> Tuple[List[str], List[str]]: if cudnn_hash != cudnn_hash_output: raise RuntimeError("Hash of cudnn.tar.xz does not match") - if not folder_filepath.exists(): + if (not folder_filepath.exists()) or (len(os.listdir(folder_filepath)) == 0): logging.info("Extracting cudnn…") with tarfile.open(tar_filepath, "r:xz") as tar: tar.extractall(folder_filepath) From 95166cf5cc50dac91d4783302aed6dacaef7c863 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz <2375296+swahtz@users.noreply.github.com> Date: Sat, 11 Jan 2025 09:03:53 +1300 Subject: [PATCH 47/59] Fix docs copyright notice to be OpenVDB contributors (#124) Signed-off-by: Jonathan Swartz --- fvdb/docs/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fvdb/docs/conf.py b/fvdb/docs/conf.py index e30dca314b..cccb3c609a 100644 --- a/fvdb/docs/conf.py +++ b/fvdb/docs/conf.py @@ -16,8 +16,8 @@ # -- Project information ----------------------------------------------------- project = "fVDB" -copyright = "2023, NVIDIA Corporation" -author = "NVIDIA Corporation" +copyright = "Contributors to the OpenVDB Project" +author = "Contributors to the OpenVDB Project" # -- General configuration --------------------------------------------------- From dc887b6509316d8f04bad0463ecf75094b54a6f7 Mon Sep 17 00:00:00 2001 From: Francis Williams Date: Mon, 13 Jan 2025 16:16:57 -0500 Subject: [PATCH 48/59] vscode path settings (#126) --- fvdb/.vscode/c_cpp_properties.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fvdb/.vscode/c_cpp_properties.json b/fvdb/.vscode/c_cpp_properties.json index 733b60b871..622c2e3311 100644 --- a/fvdb/.vscode/c_cpp_properties.json +++ b/fvdb/.vscode/c_cpp_properties.json @@ -6,7 +6,7 @@ "${env:CONDA_PREFIX}/envs/fvdb/include", "${env:CONDA_PREFIX}/envs/fvdb/include/cuda", "${workspaceFolder}/src", - "${workspaceFolder}/external/openvdb/nanovdb", + "${workspaceFolder}/../nanovdb", "${workspaceFolder}/external/glm", "${workspaceFolder}/external/cudnn_fe/include", "${workspaceFolder}/external/cutlass/include", From 8f3eaa471ac29524db26bbd7801089c35b7585dd Mon Sep 17 00:00:00 2001 From: Francis Williams Date: Mon, 13 Jan 2025 22:01:13 -0500 Subject: [PATCH 49/59] Fw/3dgs lifestyle (#127) * fix spelling mistake and add log * quality of life helper --- fvdb/examples/3dgs/train_colmap.py | 9 ++--- fvdb/examples/3dgs/train_segmentation.py | 2 +- fvdb/fvdb/nn/gaussian_splatting.py | 46 +++++++++++++++++++++--- 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/fvdb/examples/3dgs/train_colmap.py b/fvdb/examples/3dgs/train_colmap.py index 44c7cf9279..2291b34269 100644 --- a/fvdb/examples/3dgs/train_colmap.py +++ b/fvdb/examples/3dgs/train_colmap.py @@ -117,7 +117,8 @@ def save_checkpoint(self, step): "ellapsed_time": time.time() - self.train_start_time, "num_gaussians": self.model.num_gaussians, } - self.logger.info(f"Save checkpoint at step: {step}. Stats: {stats}") + checkpoint_path = f"{self.checkpoint_dir}/ckpt_{step:04d}.pt" + self.logger.info(f"Save checkpoint at step {step} to path {checkpoint_path}. 
Stats: {stats}.") with open( f"{self.stats_dir}/train_step{step:04d}.json", "w", @@ -306,7 +307,7 @@ def train(self, start_step: int = 0): image_w=image_width, image_h=image_height, extrinsics_mat=world_to_cam_mats, - intrincs_mat=intrinsics_mats, + intrinsics_mats=intrinsics_mats, rasterize_mode="classic", sh_degree=sh_degree_to_use, image_crop=crop, @@ -399,7 +400,7 @@ def eval(self, step: int, stage: str = "val"): image_w=width, image_h=height, extrinsics_mat=world_to_cam_mats, - intrincs_mat=intrinsics_mats, + intrinsics_mats=intrinsics_mats, rasterize_mode="classic", sh_degree=cfg.sh_degree, render_depth=False, @@ -465,7 +466,7 @@ def _viewer_render_fn(self, camera_state: CameraState, img_wh: Tuple[int, int]): image_w=W, image_h=H, extrinsics_mat=c2w[None], - intrincs_mat=K[None], + intrinsics_mats=K[None], sh_degree=self.cfg.sh_degree, radius_clip=3.0, render_depth=False, diff --git a/fvdb/examples/3dgs/train_segmentation.py b/fvdb/examples/3dgs/train_segmentation.py index 3650a3b1b4..c19083032b 100644 --- a/fvdb/examples/3dgs/train_segmentation.py +++ b/fvdb/examples/3dgs/train_segmentation.py @@ -117,7 +117,7 @@ def train(self, dataset): # Forward pass feats, alphas, info = self.gs_model( - image_w=img_w, image_h=img_h, intrincs_mat=intrinsics, extrinsics_mat=world_to_cam + image_w=img_w, image_h=img_h, intrinsics_mats=intrinsics, extrinsics_mat=world_to_cam ) # TODO (Francis): Don't use Pytorch caching allocator which causes massive fragmentation diff --git a/fvdb/fvdb/nn/gaussian_splatting.py b/fvdb/fvdb/nn/gaussian_splatting.py index e39862d602..9cb47d3d03 100644 --- a/fvdb/fvdb/nn/gaussian_splatting.py +++ b/fvdb/fvdb/nn/gaussian_splatting.py @@ -173,6 +173,42 @@ def _rgb_to_sh(rgb: torch.Tensor) -> torch.Tensor: def clear_cache(self): self._info_cache = {} + def render_rgb_and_depth( + self, + image_w: int, + image_h: int, + extrinsics_mat: torch.Tensor, + intrinsics_mat: torch.Tensor, + near_plane: float = 0.01, + far_plane: float = 1e10, + sh_degree: int = -1, + eps_2d: float = 0.3, + radius_clip: float = 0.0, + tile_size: int = 16, + rasterize_mode: Literal["classic", "antialiased"] = "classic", + ): + rgbd, alphas, _ = self( + image_w=image_w, + image_h=image_h, + extrinsics_mat=extrinsics_mat, + intrinsics_mat=intrinsics_mat, + near_plane=near_plane, + far_plane=far_plane, + sh_degree=sh_degree, + eps_2d=eps_2d, + radius_clip=radius_clip, + tile_size=tile_size, + image_crop=False, + render_depth=True, + rasterize_mode=rasterize_mode, + cache_info=False, + depth_only=False, + ) + rgb = rgbd[..., :3] # [B, H, W, 1] + depth = rgbd[..., 3:4] / alphas.clamp(min=1e-10) # [B, H, W, 1] + + return rgb, depth + def render_depth_points( self, image_w: int, @@ -191,7 +227,7 @@ def render_depth_points( image_w=image_w, image_h=image_h, extrinsics_mat=extrinsics_mat, - intrincs_mat=intrinsics_mat, + intrinsics_mat=intrinsics_mat, near_plane=near_plane, far_plane=far_plane, sh_degree=sh_degree, @@ -223,7 +259,7 @@ def forward( image_w: int, image_h: int, extrinsics_mat: torch.Tensor, - intrincs_mat: torch.Tensor, + intrinsics_mat: torch.Tensor, near_plane: float = 0.01, far_plane: float = 1e10, sh_degree: int = -1, @@ -267,7 +303,7 @@ def forward( opacities=opacities, sh_coeffs=sh, viewmats=extrinsics_mat, - Ks=intrincs_mat, + Ks=intrinsics_mat, image_width=image_w, image_height=image_h, eps2d=eps_2d, @@ -305,7 +341,7 @@ def forward( scales=scales, opacities=opacities, viewmats=extrinsics_mat, - Ks=intrincs_mat, + Ks=intrinsics_mat, image_width=image_w, image_height=image_h, 
near_plane=near_plane, @@ -324,7 +360,7 @@ def forward( opacities=opacities, sh_coeffs=sh, viewmats=extrinsics_mat, - Ks=intrincs_mat, + Ks=intrinsics_mat, image_width=image_w, image_height=image_h, eps2d=eps_2d, From 64f47dec279080c104191bdecae5aba79f5037fe Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Wed, 15 Jan 2025 08:34:27 +1100 Subject: [PATCH 50/59] Add basic CMake infrastructure for benchmarks (#75) * Add basic cmake and an example gbenchmark * Modify setup.py to exclude benchmarks directory * gitignore benchmarks build dir * Make CMake work with conda-forge Torch, and make simple benchmark use Torch. * Add working C++11 ABI Torch conda environment * Modularize the cmake. * Whitespace cleanup * empty Signed-off-by: Jonathan Swartz * Add comment about excluding benchmarks directory * Modularize python, torch and fvdb dependencies, and add better error checking * Empty commit to test signing * Empty commit to test signing * Use `find_library` to find libfvdb.so --------- Signed-off-by: Jonathan Swartz Co-authored-by: Jonathan Swartz Co-authored-by: Francis Williams --- .gitignore | 1 + fvdb/env/torch_cxx11.yml | 25 ++++++++ fvdb/setup.py | 45 ++++++++++---- fvdb/src/benchmarks/CMakeLists.txt | 71 +++++++++++++++++++++++ fvdb/src/benchmarks/simple/simple.cpp | 22 +++++++ fvdb/src/cmake/get_cpm.cmake | 11 ++++ fvdb/src/cmake/get_fvdb.cmake | 21 +++++++ fvdb/src/cmake/get_google_benchmark.cmake | 10 ++++ fvdb/src/cmake/get_nvtx.cmake | 9 +++ fvdb/src/cmake/get_torch.cmake | 32 ++++++++++ 10 files changed, 236 insertions(+), 11 deletions(-) create mode 100644 fvdb/env/torch_cxx11.yml create mode 100644 fvdb/src/benchmarks/CMakeLists.txt create mode 100644 fvdb/src/benchmarks/simple/simple.cpp create mode 100644 fvdb/src/cmake/get_cpm.cmake create mode 100644 fvdb/src/cmake/get_fvdb.cmake create mode 100644 fvdb/src/cmake/get_google_benchmark.cmake create mode 100644 fvdb/src/cmake/get_nvtx.cmake create mode 100644 fvdb/src/cmake/get_torch.cmake diff --git a/.gitignore b/.gitignore index 8cde74749e..9e309e06a2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /build/* +**/src/benchmarks/build/* .DS_Store diff --git a/fvdb/env/torch_cxx11.yml b/fvdb/env/torch_cxx11.yml new file mode 100644 index 0000000000..3ed06ba099 --- /dev/null +++ b/fvdb/env/torch_cxx11.yml @@ -0,0 +1,25 @@ +name: torch_cxx11 +channels: + - conda-forge +dependencies: + - python=3.10 + - pytorch-gpu=2.5[build=cuda120*] + - ca-certificates + - openssl + - cuda-version=12.0 + - gitpython + - tqdm + - numpy + - make + - cmake + - ninja + - cxx-compiler + - gxx_linux-64=11 + - gcc_linux-64=11 + - requests + - cuda-toolkit=12.0 + - cuda-compiler=12.0 + - cuda-nvcc=12.0 + - cuda-cccl=12.0 + - cuda-libraries-static=12.0 + - cuda-cudart-static=12.0 diff --git a/fvdb/setup.py b/fvdb/setup.py index bd7c3db295..5d71418a94 100644 --- a/fvdb/setup.py +++ b/fvdb/setup.py @@ -88,12 +88,18 @@ def build_cmake_project(base_path, cmake_args, parallel_jobs: int = 1): os.makedirs(cmake_build_dir, exist_ok=True) os.makedirs(cmake_install_dir, exist_ok=True) subprocess.check_call( - ["cmake", base_path, f"-DCMAKE_INSTALL_PREFIX={cmake_install_dir}", "-DCMAKE_INSTALL_LIBDIR=lib"] + [ + "cmake", + base_path, + f"-DCMAKE_INSTALL_PREFIX={cmake_install_dir}", + "-DCMAKE_INSTALL_LIBDIR=lib", + ] + cmake_args, cwd=cmake_build_dir, ) subprocess.check_call( - ["cmake", "--build", ".", "--target", "install", f"-j{parallel_jobs}"], cwd=cmake_build_dir + ["cmake", "--build", ".", "--target", 
"install", f"-j{parallel_jobs}"], + cwd=cmake_build_dir, ) return cmake_install_dir @@ -135,7 +141,9 @@ def run(self) -> None: self.download_external_dep(name="openvdb", git_url=openvdb_url, git_tag="feature/nanovdb_v32.7") _, cutlass_repo = self.download_external_dep( - name="cutlass", git_url="https://github.com/NVIDIA/cutlass.git", git_tag="v3.4.0" + name="cutlass", + git_url="https://github.com/NVIDIA/cutlass.git", + git_tag="v3.4.0", ) try: # NOTE: In python <=3.8, __file__ will be a relative path and >3.8 it is an absolute path @@ -144,11 +152,15 @@ def run(self) -> None: logging.info(f"Failed to apply cutlass patch: {str(e)}, continuing without patching") self.download_external_dep( - name="cudnn_fe", git_url="https://github.com/NVIDIA/cudnn-frontend", git_tag="v1.3.0" + name="cudnn_fe", + git_url="https://github.com/NVIDIA/cudnn-frontend", + git_tag="v1.3.0", ) blosc_source_dir, _ = self.download_external_dep( - name="c-blosc", git_url="https://github.com/Blosc/c-blosc.git", git_tag="v1.21.4" + name="c-blosc", + git_url="https://github.com/Blosc/c-blosc.git", + git_tag="v1.21.4", ) self.build_cmake_project( blosc_source_dir, @@ -178,9 +190,10 @@ def run(self) -> None: shutil.copy(header_file, os.path.join(self.build_lib, header_folder)) -def get_source_files_recursive(base_path, include_bindings=True) -> List[str]: +def get_source_files_recursive(base_path, exclude=[], include_bindings=True) -> List[str]: source_files = [] - for dir_name, _, dir_files in os.walk(base_path): + for dir_name, dir, dir_files in os.walk(base_path, topdown=True): + dir[:] = [d for d in dir if d not in exclude] if not include_bindings and os.path.basename(dir_name) == "python": continue cpp_files = [os.path.join(dir_name, t) for t in dir_files if t.endswith(".cpp")] @@ -308,10 +321,14 @@ def download_and_install_cudnn() -> Tuple[List[str], List[str]]: user_nvcc_flags = os.getenv("NVCC_FLAGS", "").split() nvcc_flags += user_nvcc_flags + # benchmarks are built separately using CMake, so exclude the source + # directory from the extension build + exclude = ["benchmarks"] + cwd = get_cwd() lib_ext = cpp_extension.CUDAExtension( name="fvdb.fvdblib", - sources=get_source_files_recursive("src", include_bindings=False), + sources=get_source_files_recursive("src", exclude, include_bindings=False), include_dirs=[ cwd / "src", cwd / get_nanovdb_source_dir(), @@ -325,13 +342,16 @@ def download_and_install_cudnn() -> Tuple[List[str], List[str]]: "external/c-blosc/install/lib/libblosc.a", ] + cudnn_static_libs, - extra_compile_args={"cxx": cpp_flags + ["-fvisibility=default"], "nvcc": nvcc_flags}, + extra_compile_args={ + "cxx": cpp_flags + ["-fvisibility=default"], + "nvcc": nvcc_flags, + }, language="c++", ) bind_ext = cpp_extension.CUDAExtension( name="fvdb._Cpp", - sources=get_source_files_recursive("src/python/"), + sources=get_source_files_recursive("src/python/", exclude), include_dirs=[ cwd / "src", cwd / get_nanovdb_source_dir(), @@ -342,7 +362,10 @@ def download_and_install_cudnn() -> Tuple[List[str], List[str]]: library_dirs=[str(cwd / "fvdb")], libraries=["fvdb"], extra_link_args=["-Wl,-rpath,$ORIGIN"], - extra_compile_args={"cxx": cpp_flags + ["-fvisibility=hidden"], "nvcc": nvcc_flags}, + extra_compile_args={ + "cxx": cpp_flags + ["-fvisibility=hidden"], + "nvcc": nvcc_flags, + }, language="c++", ) diff --git a/fvdb/src/benchmarks/CMakeLists.txt b/fvdb/src/benchmarks/CMakeLists.txt new file mode 100644 index 0000000000..2dcea266e8 --- /dev/null +++ b/fvdb/src/benchmarks/CMakeLists.txt @@ -0,0 +1,71 @@ +# 
Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.25 FATAL_ERROR) + +project(fvdb_benchmarks LANGUAGES CXX CUDA) + +if(DEFINED ENV{CONDA_PREFIX}) + set(CONDA_ENV_PATH $ENV{CONDA_PREFIX}) + message(STATUS "Conda environment path: ${CONDA_ENV_PATH}") +else() + message(FATAL_ERROR "Conda environment path not found. Please activate the fvdb conda environment.") +endif() + +# Get dependencies +include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/get_cpm.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/get_google_benchmark.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/get_nvtx.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/get_torch.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/get_fvdb.cmake) + +add_custom_command( + OUTPUT FVDB_BENCHMARKS + COMMAND echo Running benchmarks + #COMMAND mkdir -p results + VERBATIM + COMMENT "Running fvdb benchmarks." + USES_TERMINAL +) + +# This function takes in a benchmark name and benchmark source and handles setting all of the +# associated properties and linking to build the benchmark +function(ConfigureBench CMAKE_BENCH_NAME) + add_executable(${CMAKE_BENCH_NAME} ${ARGN}) + set_target_properties( + ${CMAKE_BENCH_NAME} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) + target_include_directories(${CMAKE_BENCH_NAME} PRIVATE "${FVDB_BUILD_DIR}/include/fvdb") + target_include_directories(${CMAKE_BENCH_NAME} PRIVATE "${FVDB_BUILD_DIR}/include/nanovdb") + target_include_directories(${CMAKE_BENCH_NAME} PRIVATE "${CONDA_ENV_PATH}/include/python3.10") + target_link_libraries( + ${CMAKE_BENCH_NAME} + fvdb ${TORCH_LIBRARIES} ${Python3_LIBRARIES} + benchmark::benchmark_main + $ + ) + add_custom_command( + OUTPUT FVDB_BENCHMARKS + COMMAND ${CMAKE_BENCH_NAME} --benchmark_out_format=json + --benchmark_out=results/${CMAKE_BENCH_NAME}.json + APPEND + COMMENT "Adding ${CMAKE_BENCH_NAME}" + ) + + install( + TARGETS ${CMAKE_BENCH_NAME} + COMPONENT testing + DESTINATION bin/benchmarks/fvdb + EXCLUDE_FROM_ALL + ) +endfunction() + +# Configure the benchmarks +ConfigureBench(simple "simple/simple.cpp") +#ConfigureBench(gsplat_rasterize "gsplat_rasterize/gsplat_rasterize.cpp") diff --git a/fvdb/src/benchmarks/simple/simple.cpp b/fvdb/src/benchmarks/simple/simple.cpp new file mode 100644 index 0000000000..8003253f36 --- /dev/null +++ b/fvdb/src/benchmarks/simple/simple.cpp @@ -0,0 +1,22 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// + +// A simple minimal benchmark to serve as a starting point for fVDB benchmarks. 
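+//
+// BENCHMARK(BM_SimpleTensor) below registers the function with google-benchmark:
+// the framework re-runs the `for (auto _: state)` loop enough times to obtain a
+// stable timing, and BENCHMARK_MAIN() expands to the benchmark binary's main().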
+
+#include <benchmark/benchmark.h>
+
+#include <torch/torch.h>
+
+static void
+BM_SimpleTensor(benchmark::State &state) {
+    for (auto _: state) {
+        torch::Tensor tensor = torch::eye(100);
+    }
+}
+
+// Register the function as a benchmark
+BENCHMARK(BM_SimpleTensor);
+
+// Run the benchmark
+BENCHMARK_MAIN();
diff --git a/fvdb/src/cmake/get_cpm.cmake b/fvdb/src/cmake/get_cpm.cmake
new file mode 100644
index 0000000000..2b35c05b98
--- /dev/null
+++ b/fvdb/src/cmake/get_cpm.cmake
@@ -0,0 +1,11 @@
+# Copyright Contributors to the OpenVDB Project
+# SPDX-License-Identifier: Apache-2.0
+
+# download CPM.cmake
+file(
+  DOWNLOAD
+  https://github.com/cpm-cmake/CPM.cmake/releases/download/v0.40.2/CPM.cmake
+  ${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake
+  EXPECTED_HASH SHA256=c8cdc32c03816538ce22781ed72964dc864b2a34a310d3b7104812a5ca2d835d
+)
+include(${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake)
diff --git a/fvdb/src/cmake/get_fvdb.cmake b/fvdb/src/cmake/get_fvdb.cmake
new file mode 100644
index 0000000000..e5c346b72f
--- /dev/null
+++ b/fvdb/src/cmake/get_fvdb.cmake
@@ -0,0 +1,21 @@
+# Copyright Contributors to the OpenVDB Project
+# SPDX-License-Identifier: Apache-2.0
+
+# TODO: once FVDB is built using the CMake build system, we can remove the hard-coded path
+# to library and use a target name.
+
+set(FVDB_BUILD_DIR "${CMAKE_BINARY_DIR}/../../../build/lib.linux-x86_64-cpython-310/fvdb/")
+find_library(
+  FVDB_LIBRARY
+  NAMES fvdb
+  HINTS ${FVDB_BUILD_DIR})
+
+# check that FVDB library is found
+if (FVDB_LIBRARY)
+  message(STATUS "FVDB library: ${FVDB_LIBRARY}")
+else()
+  message(FATAL_ERROR "FVDB library not found. Please build FVDB first.")
+endif()
+
+add_library(fvdb SHARED IMPORTED)
+set_target_properties(fvdb PROPERTIES IMPORTED_LOCATION ${FVDB_LIBRARY})
diff --git a/fvdb/src/cmake/get_google_benchmark.cmake b/fvdb/src/cmake/get_google_benchmark.cmake
new file mode 100644
index 0000000000..8d34667c08
--- /dev/null
+++ b/fvdb/src/cmake/get_google_benchmark.cmake
@@ -0,0 +1,10 @@
+# Copyright Contributors to the OpenVDB Project
+# SPDX-License-Identifier: Apache-2.0
+
+CPMAddPackage(
+  NAME benchmark
+  GITHUB_REPOSITORY google/benchmark
+  VERSION 1.7.1
+  OPTIONS "BENCHMARK_ENABLE_TESTING Off"
+  GIT_SHALLOW TRUE
+)
diff --git a/fvdb/src/cmake/get_nvtx.cmake b/fvdb/src/cmake/get_nvtx.cmake
new file mode 100644
index 0000000000..4f0baf0d10
--- /dev/null
+++ b/fvdb/src/cmake/get_nvtx.cmake
@@ -0,0 +1,9 @@
+# Copyright Contributors to the OpenVDB Project
+# SPDX-License-Identifier: Apache-2.0
+CPMAddPackage(
+  NAME nvtx3
+  GITHUB_REPOSITORY NVIDIA/NVTX
+  GIT_TAG v3.1.0-c-cpp
+  GIT_SHALLOW TRUE
+)
+set(nvtx3_dir ${nvtx3_SOURCE_DIR})
diff --git a/fvdb/src/cmake/get_torch.cmake b/fvdb/src/cmake/get_torch.cmake
new file mode 100644
index 0000000000..a9ed933cf3
--- /dev/null
+++ b/fvdb/src/cmake/get_torch.cmake
@@ -0,0 +1,32 @@
+# Copyright Contributors to the OpenVDB Project
+# SPDX-License-Identifier: Apache-2.0
+
+# find Python3 and site-packages path
+find_package(Python3 REQUIRED COMPONENTS Interpreter Development)
+execute_process(
+  COMMAND "${Python3_EXECUTABLE}" -c "if True:
+    from distutils import sysconfig as sc
+    print(sc.get_python_lib(prefix='', plat_specific=True))"
+  OUTPUT_VARIABLE PYTHON_SITE
+  OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+set (PYTHON_SITE "${CONDA_ENV_PATH}/${PYTHON_SITE}")
+
+# Check that PyTorch package uses the C++11 ABI
+execute_process(
+  COMMAND "${Python3_EXECUTABLE}" -c "import torch; print(torch._C._GLIBCXX_USE_CXX11_ABI)"
+  OUTPUT_VARIABLE TORCH_CXX11_ABI
+  OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+if (NOT
TORCH_CXX11_ABI) + message(FATAL_ERROR "PyTorch package does not use the C++11 ABI. " + "Please install PyTorch with the C++11 ABI (e.g. conda-forge package).") +endif() + +# find torch, looking in site-packages +set(Torch_DIR ${PYTHON_SITE}/torch/share/cmake/Torch) +# needed to correctly configure Torch with the conda-forge build +set(CUDA_TOOLKIT_ROOT_DIR "${CONDA_ENV_PATH}/targets/x86_64-linux") +find_package(Torch REQUIRED) + + From d9b35645a365721ffc4aeccae3abf89f6879096e Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Thu, 16 Jan 2025 03:58:10 +1100 Subject: [PATCH 51/59] Add CMake support for unit tests (#133) * Add basic cmake and an example gbenchmark * Modify setup.py to exclude benchmarks directory * gitignore benchmarks build dir * Make CMake work with conda-forge Torch, and make simple benchmark use Torch. * Add working C++11 ABI Torch conda environment * Modularize the cmake. * Whitespace cleanup * empty Signed-off-by: Jonathan Swartz * Add comment about excluding benchmarks directory * Modularize python, torch and fvdb dependencies, and add better error checking * Empty commit to test signing * Empty commit to test signing * Add cmake and example for gtest-based C++ unit tests * remove whitespace change. * set `files.insertFinalNewLine` to true in workspace settings, fix EOF newlines * newline at EOF * Rename ExampleTest * Add CTest support. * "benchmarks" --> "tests" * Fix typo * Ignore tests directory (oops) --------- Signed-off-by: Jonathan Swartz Co-authored-by: Jonathan Swartz --- fvdb/.gitignore | 2 + fvdb/.vscode/settings.json | 3 +- fvdb/setup.py | 2 +- fvdb/src/cmake/get_google_test.cmake | 10 ++++ fvdb/src/tests/CMakeLists.txt | 79 ++++++++++++++++++++++++++++ fvdb/src/tests/ExampleTest.cpp | 13 +++++ 6 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 fvdb/src/cmake/get_google_test.cmake create mode 100644 fvdb/src/tests/CMakeLists.txt create mode 100644 fvdb/src/tests/ExampleTest.cpp diff --git a/fvdb/.gitignore b/fvdb/.gitignore index b3f49659f4..a9fd12e967 100644 --- a/fvdb/.gitignore +++ b/fvdb/.gitignore @@ -1,4 +1,6 @@ /build/* +/src/benchmarks/build/* +/src/tests/build/* *.creator *.includes *.files diff --git a/fvdb/.vscode/settings.json b/fvdb/.vscode/settings.json index 60e196bb77..c9d39c771e 100644 --- a/fvdb/.vscode/settings.json +++ b/fvdb/.vscode/settings.json @@ -154,4 +154,5 @@ "json.format.keepLines": true, "C_Cpp.formatting": "clangFormat", "C_Cpp.clang_format_path": "${env:CONDA_PREFIX}/envs/fvdb/bin/clang-format-18", -} \ No newline at end of file + "files.insertFinalNewline": true, +} diff --git a/fvdb/setup.py b/fvdb/setup.py index 5d71418a94..1b04331161 100644 --- a/fvdb/setup.py +++ b/fvdb/setup.py @@ -323,7 +323,7 @@ def download_and_install_cudnn() -> Tuple[List[str], List[str]]: # benchmarks are built separately using CMake, so exclude the source # directory from the extension build - exclude = ["benchmarks"] + exclude = ["benchmarks", "tests"] cwd = get_cwd() lib_ext = cpp_extension.CUDAExtension( diff --git a/fvdb/src/cmake/get_google_test.cmake b/fvdb/src/cmake/get_google_test.cmake new file mode 100644 index 0000000000..464a871c79 --- /dev/null +++ b/fvdb/src/cmake/get_google_test.cmake @@ -0,0 +1,10 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 + +CPMAddPackage( + NAME googletest + GITHUB_REPOSITORY google/googletest + GIT_TAG v1.15.2 + VERSION 1.15.2 + OPTIONS "INSTALL_GTEST OFF" +) diff --git a/fvdb/src/tests/CMakeLists.txt 
b/fvdb/src/tests/CMakeLists.txt new file mode 100644 index 0000000000..4dda0c76d1 --- /dev/null +++ b/fvdb/src/tests/CMakeLists.txt @@ -0,0 +1,79 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.25 FATAL_ERROR) + +project(fvdb_tests LANGUAGES CXX CUDA) + +if(DEFINED ENV{CONDA_PREFIX}) + set(CONDA_ENV_PATH $ENV{CONDA_PREFIX}) + message(STATUS "Conda environment path: ${CONDA_ENV_PATH}") +else() + message(FATAL_ERROR "Conda environment path not found. Please activate the fvdb conda environment.") +endif() + +# Get dependencies +include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/get_cpm.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/get_google_test.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/get_nvtx.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/get_torch.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/get_fvdb.cmake) + +add_custom_command( + OUTPUT FVDB_TESTS + COMMAND echo Running tests + #COMMAND mkdir -p results + VERBATIM + COMMENT "Running fvdb C++ tests." + USES_TERMINAL +) + +# output directory +set(TEST_BINARY_DIRECTORY "$") + +# This function takes in a unit test name and test source and handles setting all of the +# associated properties and linking to build the test binary +function(ConfigureTest CMAKE_TEST_NAME) + add_executable(${CMAKE_TEST_NAME} ${ARGN}) + + set_target_properties( + ${CMAKE_TEST_NAME} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${TEST_BINARY_DIRECTORY} + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) + target_include_directories(${CMAKE_TEST_NAME} PRIVATE "${FVDB_BUILD_DIR}/include/fvdb") + target_include_directories(${CMAKE_TEST_NAME} PRIVATE "${FVDB_BUILD_DIR}/include/nanovdb") + target_include_directories(${CMAKE_TEST_NAME} PRIVATE "${CONDA_ENV_PATH}/include/python3.10") + target_link_libraries( + ${CMAKE_TEST_NAME} + fvdb ${TORCH_LIBRARIES} ${Python3_LIBRARIES} + GTest::gtest + GTest::gtest_main + $ + ) + add_test(NAME ${CMAKE_TEST_NAME} + COMMAND ${CMAKE_TEST_NAME} + WORKING_DIRECTORY ${TEST_BINARY_DIRECTORY}) + add_custom_command( + OUTPUT FVDB_TESTS + COMMAND ${CMAKE_TEST_NAME} + APPEND + COMMENT "Adding ${CMAKE_TEST_NAME}" + ) + + install( + TARGETS ${CMAKE_TEST_NAME} + COMPONENT testing + DESTINATION bin/gtests/fvdb + EXCLUDE_FROM_ALL + ) +endfunction() + +enable_testing() + +# Configure an example test +ConfigureTest(ExampleTest "ExampleTest.cpp") diff --git a/fvdb/src/tests/ExampleTest.cpp b/fvdb/src/tests/ExampleTest.cpp new file mode 100644 index 0000000000..3425c943bc --- /dev/null +++ b/fvdb/src/tests/ExampleTest.cpp @@ -0,0 +1,13 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 + +// A simple minimal unit test to serve as a starting point for fVDB unit tests. 
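+//
+// TEST(Example, ExampleTest) below registers the case with googletest; main() is
+// supplied by GTest::gtest_main (linked in the CMakeLists.txt above), and CTest
+// picks the binary up through the add_test() call in ConfigureTest.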
+
+#include <gtest/gtest.h>
+
+#include <torch/torch.h>
+
+TEST(Example, ExampleTest) {
+    std::size_t const size = 100;
+    EXPECT_TRUE(torch::equal(torch::diagonal(torch::eye(size)), torch::ones(size)));
+}

From 1b7f60fe53716599ae6d76c523aa3bfc817ffc91 Mon Sep 17 00:00:00 2001
From: bbartlett-nv <95764047+bbartlett-nv@users.noreply.github.com>
Date: Thu, 16 Jan 2025 13:50:58 -0500
Subject: [PATCH 52/59] remove tensor conversion on images from dataloader to
 save on RAM usage for large datasets (#129)

Co-authored-by: Francis Williams
Co-authored-by: Mark Harris <783069+harrism@users.noreply.github.com>
---
 fvdb/fvdb/utils/data/colmap_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fvdb/fvdb/utils/data/colmap_dataset.py b/fvdb/fvdb/utils/data/colmap_dataset.py
index 3a083fdf81..27c878985a 100644
--- a/fvdb/fvdb/utils/data/colmap_dataset.py
+++ b/fvdb/fvdb/utils/data/colmap_dataset.py
@@ -403,7 +403,7 @@ def __getitem__(self, item: int) -> Dict[str, Any]:
         data = {
             "K": torch.from_numpy(K).float(),
             "camtoworld": torch.from_numpy(camtoworlds).float(),
-            "image": torch.from_numpy(image).float(),
+            "image": image,
             "image_id": item,  # the index of the image in the dataset
             "image_path": self.parser.image_paths[index],
         }

From 95166cf5cc50dac91d4783302aed6dacaef7c863 Mon Sep 17 00:00:00 2001
From: Jonathan Swartz <2375296+swahtz@users.noreply.github.com>
Date: Tue, 21 Jan 2025 15:01:25 +1300
Subject: [PATCH 53/59] PyTorch, CUDA package sources moved to `conda-forge`
 channel (#131)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Moved the recommended build/test/dev environments to use the conda-forge
channel as their package source. This has required moving to CUDA 12.0 and
PyTorch 2.4.1.

Also removed the NKFW tests because they were breaking in the new
environment, but these tests’ functionality was covered by other unit
tests.
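For reference, the pins shared by the updated environment files below look
like this (a minimal sketch of the common header only, not a complete
environment):

    channels:
      - conda-forge
      - nodefaults
    dependencies:
      - python=3.11
      - pytorch-gpu=2.4.1[build=cuda120*]
      - cuda-version=12.0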
closes #121 closes #135 --------- Signed-off-by: Ken Museth Signed-off-by: Jonathan Swartz Co-authored-by: Ken Museth <1495380+kmuseth@users.noreply.github.com> --- fvdb/env/build_environment.yml | 20 +- fvdb/env/dev_environment.yml | 35 +- fvdb/env/learn_environment.yml | 17 +- fvdb/env/test_environment.yml | 35 +- .../garfield/garfield_environment.yml | 33 +- .../mask_pls/maskpls_environment.yml | 10 +- fvdb/setup.py | 10 + fvdb/src/detail/utils/cuda/Utils.cuh | 7 +- fvdb/tests/unit/nkfw_api/__init__.py | 3 - fvdb/tests/unit/nkfw_api/backend/__init__.py | 16 - fvdb/tests/unit/nkfw_api/backend/abc.py | 252 ------- fvdb/tests/unit/nkfw_api/backend/fvdb.py | 301 -------- .../tests/unit/nkfw_api/backend/hash_table.py | 645 ------------------ fvdb/tests/unit/nkfw_api/ext/__init__.py | 89 --- fvdb/tests/unit/nkfw_api/ext/common/bind.cpp | 23 - .../unit/nkfw_api/ext/common/hashmap_cuda.cu | 399 ----------- .../unit/nkfw_api/ext/common/hashmap_cuda.cuh | 241 ------- .../unit/nkfw_api/ext/common/torch_ptr.cuh | 30 - fvdb/tests/unit/nkfw_api/kernel.npz | Bin 3186 -> 0 bytes fvdb/tests/unit/test_nkfw_api.py | 329 --------- 20 files changed, 71 insertions(+), 2424 deletions(-) delete mode 100644 fvdb/tests/unit/nkfw_api/__init__.py delete mode 100644 fvdb/tests/unit/nkfw_api/backend/__init__.py delete mode 100644 fvdb/tests/unit/nkfw_api/backend/abc.py delete mode 100644 fvdb/tests/unit/nkfw_api/backend/fvdb.py delete mode 100644 fvdb/tests/unit/nkfw_api/backend/hash_table.py delete mode 100644 fvdb/tests/unit/nkfw_api/ext/__init__.py delete mode 100644 fvdb/tests/unit/nkfw_api/ext/common/bind.cpp delete mode 100644 fvdb/tests/unit/nkfw_api/ext/common/hashmap_cuda.cu delete mode 100644 fvdb/tests/unit/nkfw_api/ext/common/hashmap_cuda.cuh delete mode 100644 fvdb/tests/unit/nkfw_api/ext/common/torch_ptr.cuh delete mode 100644 fvdb/tests/unit/nkfw_api/kernel.npz delete mode 100644 fvdb/tests/unit/test_nkfw_api.py diff --git a/fvdb/env/build_environment.yml b/fvdb/env/build_environment.yml index e3c268cd40..cc3dacecf0 100644 --- a/fvdb/env/build_environment.yml +++ b/fvdb/env/build_environment.yml @@ -1,25 +1,21 @@ name: fvdb_build channels: - - pytorch - - nvidia - conda-forge - nodefaults dependencies: - - python=3.10 - - pytorch::pytorch=2.4.0 - - pytorch::pytorch-cuda=12.1 - - pytorch::pytorch-mutex=*=cuda + - python=3.11 + - pytorch-gpu=2.4.1[build=cuda120*] - git - gitpython - ca-certificates - certifi - openssl - - cuda-toolkit=12.1 - - cuda-compiler=12.1 - - cuda-nvcc=12.1 - - cuda-cccl=12.1 - - cuda-libraries-static=12.1 - - cuda-cudart-static=12.1 + - cuda-version=12.0 + - cuda-command-line-tools + - cuda-compiler + - libcusparse-dev + - libcublas-dev + - libcusolver-dev - gcc_linux-64=11 - gxx_linux-64=11 - cxx-compiler diff --git a/fvdb/env/dev_environment.yml b/fvdb/env/dev_environment.yml index 1c3b4f8c47..ea658a694d 100644 --- a/fvdb/env/dev_environment.yml +++ b/fvdb/env/dev_environment.yml @@ -1,16 +1,12 @@ name: fvdb channels: - - pytorch - - nvidia - conda-forge - nodefaults dependencies: - - python=3.10 + - python=3.11 + - pytorch-gpu=2.4.1[build=cuda120*] - jupyterlab - - pytorch::pytorch=2.4.0 - - pytorch::pytorch-cuda=12.1 - - pytorch::pytorch-mutex=*=cuda - - pytorch::torchvision + - torchvision=0.19 - tensorboard - pip - git @@ -18,12 +14,12 @@ dependencies: - ca-certificates - certifi - openssl - - cuda-toolkit=12.1 - - cuda-compiler=12.1 - - cuda-nvcc=12.1 - - cuda-cccl=12.1 - - cuda-libraries-static=12.1 - - cuda-cudart-static=12.1 + - cuda-version=12.0 + - 
cuda-command-line-tools + - cuda-compiler + - libcusparse-dev + - libcublas-dev + - libcusolver-dev - libcurand-dev - gcc_linux-64=11 - gxx_linux-64=11 @@ -40,7 +36,6 @@ dependencies: - numpy<2 - tqdm - sparsehash - - pyg::pytorch-scatter=2.1.2 - pandas - pytest-benchmark - polyscope @@ -48,19 +43,21 @@ dependencies: - rich - parameterized - ipython - - py-openimageio - matplotlib ## 3dgs - imageio - torchmetrics - - py-opencv + - fastai::opencv-python-headless - tyro - pillow - pyyaml - scikit-learn - pip: - point-cloud-utils - - https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu121/torchsparse_20-2.0.0b0-cp310-cp310-linux_x86_64.whl - - https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu121/torchsparse-2.1.0-cp310-cp310-linux_x86_64.whl + - https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu120/torchsparse-2.1.0-cp311-cp311-linux_x86_64.whl + - https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu120/torchsparse_20-2.0.0b0-cp311-cp311-linux_x86_64.whl + - https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu120/torch_scatter-2.1.2-cp311-cp311-linux_x86_64.whl ## 3dgs - - viser \ No newline at end of file + - viser + ## 3dgs tests + - oiio-static-python diff --git a/fvdb/env/learn_environment.yml b/fvdb/env/learn_environment.yml index affb5f596e..e2255e18a8 100644 --- a/fvdb/env/learn_environment.yml +++ b/fvdb/env/learn_environment.yml @@ -1,16 +1,13 @@ name: fvdb_learn channels: - - pytorch - - nvidia - conda-forge - nodefaults dependencies: - - python=3.10 + - python=3.11 + - pytorch-gpu=2.4.1[build=cuda120*] + - cuda-version=12.0 - jupyterlab - - pytorch::pytorch=2.4.0 - - pytorch::pytorch-cuda=12.1 - - pytorch::pytorch-mutex=*=cuda - - pytorch::torchvision + - torchvision - tensorboard - pip - git @@ -23,7 +20,6 @@ dependencies: - matplotlib - tqdm - sparsehash - - pyg::pytorch-scatter=2.1.2 - pandas - pytest-benchmark - polyscope @@ -33,7 +29,7 @@ dependencies: ## 3dgs - imageio - torchmetrics - - py-opencv + - fastai::opencv-python-headless - tyro - pillow - pyyaml @@ -41,4 +37,5 @@ dependencies: - pip: - point-cloud-utils ## 3dgs - - viser \ No newline at end of file + - viser + - https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu120/torch_scatter-2.1.2-cp311-cp311-linux_x86_64.whl diff --git a/fvdb/env/test_environment.yml b/fvdb/env/test_environment.yml index 3566c66845..e007b15514 100644 --- a/fvdb/env/test_environment.yml +++ b/fvdb/env/test_environment.yml @@ -1,41 +1,22 @@ name: fvdb_test channels: - - pytorch - - nvidia - conda-forge - nodefaults dependencies: - - python=3.10 - - pytorch::pytorch=2.4.0 - - pytorch::pytorch-cuda=12.1 - - pytorch::pytorch-mutex=*=cuda + - python=3.11 + - pytorch-gpu=2.4.1[build=cuda120*] - tensorboard + - cuda-version=12.0 - pip>=23.3.1 - git - gitpython - ca-certificates - certifi - openssl - - cuda-toolkit=12.1 - - cuda-compiler=12.1 - - cuda-nvcc=12.1 - - cuda-cccl=12.1 - - cuda-libraries-static=12.1 - - cuda-cudart-static=12.1 - - cuda-version=12.1 - parameterized - - gcc_linux-64=11 - - gxx_linux-64=11 - - cxx-compiler - - setuptools>=68.2.2 - - cmake - - make - - ninja - ipython - matplotlib - tqdm - - sparsehash - - pyg::pytorch-scatter=2.1.2 - sphinx>=7.0.0 - sphinx_rtd_theme - myst-parser @@ -43,16 +24,18 @@ dependencies: - rich - pytest-benchmark - numpy<2 - - py-openimageio - linkify-it-py - glm - polyscope - - py-opencv + - py-opencv=4.10[build=headless*] - imageio - scikit-learn - pip: - gsplat - pytest-markdown-docs - point-cloud-utils - - 
https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu121/torchsparse_20-2.0.0b0-cp310-cp310-linux_x86_64.whl - - https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu121/torchsparse-2.1.0-cp310-cp310-linux_x86_64.whl + - https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu120/torchsparse-2.1.0-cp311-cp311-linux_x86_64.whl + - https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu120/torchsparse_20-2.0.0b0-cp311-cp311-linux_x86_64.whl + - https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu120/torch_scatter-2.1.2-cp311-cp311-linux_x86_64.whl + ## 3dgs tests + - oiio-static-python diff --git a/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml b/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml index 28e4c52e40..0f60f015b8 100644 --- a/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml +++ b/fvdb/projects/panoptic_segmentation/garfield/garfield_environment.yml @@ -1,25 +1,20 @@ name: fvdb_garfield channels: - - pytorch - - nvidia/label/cuda-12.1.0 - - rapidsai - conda-forge - nodefaults dependencies: - - python=3.10 - - pytorch::pytorch=2.4.0 - - pytorch::pytorch-cuda=12.1 - - pytorch::pytorch-mutex=*=cuda - - cuda-toolkit + - python=3.11 + - pytorch-gpu=2.4.1[build=cuda120*] + - cuda-version=12.0 + - cuda-command-line-tools - cuda-compiler - - cuda-nvcc=12.1 - - cuda-cccl=12.1 - - cuda-libraries-static - # specifically need these 12.1.1 versions of cudart - # because of awkward overwriting with conda-forge versions that get picked up - - nvidia/label/cuda-12.1.1::cuda-cudart-static - - nvidia/label/cuda-12.1.1::cuda-cudart - - nvidia/label/cuda-12.1.1::cuda-cudart-dev + - libcusparse-dev + - libcublas-dev + - libcusolver-dev + - libcurand-dev + - cuda-cudart-static + - cuda-cudart + - cuda-cudart-dev - gcc_linux-64=11 - gxx_linux-64=11 - cxx-compiler @@ -48,9 +43,9 @@ dependencies: - rapidsai::libcumlprims - pip: ## nerfstudio - # NOTE: have to build tiny-cuda-nn and nerfacc from source for CUDA 12 - - git+https://github.com/nerfstudio-project/nerfacc.git@v0.5.2 - - git+https://github.com/swahtz/tiny-cuda-nn/@cuda_libdir_fix#subdirectory=bindings/torch + # NOTE: have had to build tiny-cuda-nn and nerfacc from source for CUDA 12 + - https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu120/nerfacc-0.5.2-cp311-cp311-linux_x86_64.whl + - https://nksr.s3.ap-northeast-1.amazonaws.com/dev-whls/pt24cu120/tinycudann-1.7-cp311-cp311-linux_x86_64.whl - gsplat - viser - nerfstudio>=1.0.0 diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/maskpls_environment.yml b/fvdb/projects/panoptic_segmentation/mask_pls/maskpls_environment.yml index b13364e9af..f6204f561b 100644 --- a/fvdb/projects/panoptic_segmentation/mask_pls/maskpls_environment.yml +++ b/fvdb/projects/panoptic_segmentation/mask_pls/maskpls_environment.yml @@ -1,14 +1,10 @@ name: fvdb_maskpls channels: - - pytorch - - nvidia - conda-forge - nodefaults dependencies: - - python=3.10 - - pytorch::pytorch=2.4.0 - - pytorch::pytorch-cuda=12.1 - - pytorch::pytorch-mutex=*=cuda + - python=3.11 + - pytorch-gpu=2.4.1[build=cuda120*] - pip - git - gitpython @@ -20,4 +16,4 @@ dependencies: - py-opencv - imageio - pip: - - pye57 \ No newline at end of file + - pye57 diff --git a/fvdb/setup.py b/fvdb/setup.py index 1b04331161..1770933ef0 100644 --- a/fvdb/setup.py +++ b/fvdb/setup.py @@ -3,6 +3,7 @@ # import logging import os +import platform import re import shutil import subprocess @@ -274,6 +275,15 @@ def download_and_install_cudnn() -> 
Tuple[List[str], List[str]]:
 
 
 if __name__ == "__main__":
+    # Set CUDA_INC_PATH from the appropriate conda target cross-compilation platform directory
+    # NOTE: this strategy may have to change when compiling for other platforms, but by then we will likely no longer be using setuptools...
+    target_platform_include_dir = (
+        Path(os.getenv("CONDA_PREFIX")) / "targets" / f"{platform.machine()}-{platform.system().lower()}" / "include"
+    )
+    # The cuda-toolkit headers (and other '-dev' package headers) on the `conda-forge` channel are installed under the
+    # `targets` directory, one subdirectory per target platform, to support cross-compilation.
+    if (target_platform_include_dir / "cuda.h").exists():
+        os.environ["CUDA_INC_PATH"] = str(target_platform_include_dir)
     # check we will be compiling for a supported compute architecture
     for arch_flag in cpp_extension._get_cuda_arch_flags():
         match = re.search(r"code=sm_(\d+)", arch_flag)
diff --git a/fvdb/src/detail/utils/cuda/Utils.cuh b/fvdb/src/detail/utils/cuda/Utils.cuh
index cef30ec4ee..16848e5d77 100644
--- a/fvdb/src/detail/utils/cuda/Utils.cuh
+++ b/fvdb/src/detail/utils/cuda/Utils.cuh
@@ -185,8 +185,9 @@ forEachVoxelCUDAKernel(fvdb::detail::GridBatchImpl::Accessor grid,
 
 template 
 __global__ void
-forEachJaggedElementChannelCUDAKernel(JaggedRAcc32 jaggedAcc,
-                                      int64_t channelsPerElement, Func func, Args... args) {
+__launch_bounds__(1024)
+    forEachJaggedElementChannelCUDAKernel(JaggedRAcc32 jaggedAcc,
+                                          int64_t channelsPerElement, Func func, Args... args) {
     const uint64_t idx = (static_cast(blockIdx.x) * blockDim.x) + threadIdx.x;
     const int64_t numElements = jaggedAcc.elementCount();
     if (idx >= static_cast(numElements) * channelsPerElement) {
@@ -823,4 +824,4 @@ template  struct RAIIRawDeviceBuffer {
 
 } // namespace fvdb
 
-#endif // FVDB_DETAIL_UTILS_CUDA_UTILS_CUH
\ No newline at end of file
+#endif // FVDB_DETAIL_UTILS_CUDA_UTILS_CUH
diff --git a/fvdb/tests/unit/nkfw_api/__init__.py b/fvdb/tests/unit/nkfw_api/__init__.py
deleted file mode 100644
index 6e140576d0..0000000000
--- a/fvdb/tests/unit/nkfw_api/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# Copyright Contributors to the OpenVDB Project
-# SPDX-License-Identifier: Apache-2.0
-#
diff --git a/fvdb/tests/unit/nkfw_api/backend/__init__.py b/fvdb/tests/unit/nkfw_api/backend/__init__.py
deleted file mode 100644
index 9d1ab508ea..0000000000
--- a/fvdb/tests/unit/nkfw_api/backend/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright Contributors to the OpenVDB Project
-# SPDX-License-Identifier: Apache-2.0
-#
-
-
-def load_backend(backend: str):
-    if backend == "hash_table":
-        from backend import hash_table
-
-        return hash_table
-    elif backend == "fvdb":
-        from backend import fvdb
-
-        return fvdb
-    else:
-        raise NotImplementedError
diff --git a/fvdb/tests/unit/nkfw_api/backend/abc.py b/fvdb/tests/unit/nkfw_api/backend/abc.py
deleted file mode 100644
index 94baaf046f..0000000000
--- a/fvdb/tests/unit/nkfw_api/backend/abc.py
+++ /dev/null
@@ -1,252 +0,0 @@
-# Copyright Contributors to the OpenVDB Project
-# SPDX-License-Identifier: Apache-2.0
-#
-from abc import ABC, abstractmethod
-from typing import Callable, Union
-
-import torch
-
-
-class BaseBackend(ABC):
-    """
-    Abstract base class for SparseFeatureHierarchy.
-    The full code should function normally if each function is correctly implemented.
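
For context on the setup.py hunk above: conda-forge's CUDA packages place headers under $CONDA_PREFIX/targets/<machine>-<os>/include to support cross-compilation, which is why the patch probes that directory before exporting CUDA_INC_PATH. A minimal standalone sketch of the same probe (the helper name find_conda_cuda_include is illustrative, not part of the patch; unlike the patched code, it also guards against CONDA_PREFIX being unset):

    import os
    import platform
    from pathlib import Path

    def find_conda_cuda_include():
        # conda-forge installs per-target headers, e.g. $CONDA_PREFIX/targets/x86_64-linux/include/cuda.h
        prefix = os.getenv("CONDA_PREFIX")
        if not prefix:
            return None  # not running inside a conda environment
        target = f"{platform.machine()}-{platform.system().lower()}"
        include_dir = Path(prefix) / "targets" / target / "include"
        return include_dir if (include_dir / "cuda.h").exists() else None
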
- """ - - @abstractmethod - def __init__(self, depth: int, voxel_size: float, device, range_kernel: Callable[[int], torch.Tensor]): - """ - Initialize the metadata of the hierarchy. - :param depth: int, number of layers - :param voxel_size: float, width of the voxel at the finest level. - :param device: torch.Device, device where the data structure should reside - :param range_kernel: a helper function that specifies the relative offsets in the kernel. - *Note*: The sequence in the kernel only has to be respected when conv=True in get_self_neighbours! - """ - pass - - @property - @abstractmethod - def depth(self) -> int: - """ - :return: total depth of the tree - """ - pass - - @property - @abstractmethod - def voxel_size(self) -> float: - """ - :return: width of the voxel at the finest level. - """ - pass - - @abstractmethod - def get_stride(self, depth: int) -> int: - """ - :return: the stride at depth. Usually this would be 2**depth - """ - pass - - @abstractmethod - def get_coords(self, depth: int, expand: int = 0, conforming: bool = False) -> torch.Tensor: - """ - :param depth: - :param expand: - :param conforming: - :return: (N, 3) float32 torch.Tensor, each row is the bottom-left-near voxel corner coordinate in normalized space. - Note: This might be called multiple times within a function, so we could possibly cache it - instead of iterating over the tree multiple times. - """ - pass - - @abstractmethod - def get_num_voxels(self, depth: int) -> int: - """ - :return: number of voxels in a given layer - """ - pass - - @abstractmethod - def get_voxel_centers(self, depth: int, normalized: bool = False): - """ - Get the centroid coordinates of all existing voxels at depth. - :param depth: int - :param normalized: if True, then divide the coordinates by voxel size, so that unit 1 is a single voxel. - :return: (N, 3) float32 torch.Tensor - """ - pass - - @abstractmethod - def __repr__(self) -> str: - """ - :return: str - """ - pass - - @abstractmethod - def get_coords_neighbours( - self, - source_coords: torch.Tensor, - source_stride: int, - target_depth: int, - nn_kernel: torch.Tensor, - conv_based: bool = False, - transposed: bool = False, - raw: bool = False, - ): - """ - Get neighbourhood information of source_coords. - :param source_coords: - :param source_stride: - :param target_depth: - :param nn_kernel: - :param conv_based: - :param transposed: - :return: - """ - pass - - @abstractmethod - def get_self_neighbours(self, source_depth: int, target_depth: int, target_range: int, conv_based: bool = False): - """ - - :param source_depth: - :param target_depth: - :param target_range: - :param conv_based: - :return: - """ - pass - - @abstractmethod - def evaluate_voxel_status(self, coords: torch.Tensor, depth: int): - """ - Evaluate status in the hierarchy, please refer to core.hashtree.VoxelStatus for numerical values: - VoxelStatus.VS_NON_EXIST: This voxel shouldn't exist - VoxelStatus.VS_EXIST_STOP: This voxel exists and is a leaf node - VoxelStatus.VS_EXIST_CONTINUE: This voxel exists and has >0 children - :param coords: (N, 3) torch.Tensor coordinates in the world space - :param depth: int - :return: (N, ) long torch.Tensor, indicating voxel status - """ - pass - - @abstractmethod - def split_data(self, xyz: torch.Tensor, data_depth: int, data: torch.Tensor): - """ - Obtain the tri-linearly interpolated data located at xyz. 
- :param xyz: torch.Tensor (N, 3) - :param data_depth: int - :param data: torch.Tensor (M, K), where K is feature dimension, and M = self.get_num_voxels(data_depth) - :return: (N, K) torch.Tensor - """ - pass - - @abstractmethod - def splat_data( - self, - xyz: torch.Tensor, - data_depth: int, - data: torch.Tensor = None, - check_corr: bool = True, - return_nf_mask: bool = False, - ): - """ - Splat data located at xyz to the tree voxels. - :param xyz: torch.Tensor (N, 3) - :param data_depth: int - :param data: torch.Tensor (N, K) - :param check_corr: if True, check if data is fully supported by its 8 neighbours - :param return_nf_mask: Legacy, do not use. - :return: (M, K), where M = self.get_num_voxels(data_depth) - """ - pass - - @abstractmethod - def build_hierarchy_dense(self, xyz: torch.Tensor, expand_range: int = 0): - """ - Ignore for now - :param xyz: - :param expand_range: - :return: - """ - pass - - @abstractmethod - def build_hierarchy_subdivide( - self, - xyz: torch.Tensor, - subdivide_policy, - expand: bool = False, - limit_adaptive_depth: int = 100, - **policy_kwargs, - ): - """ - Ignore for now - :param xyz: - :param subdivide_policy: - :param expand: - :param limit_adaptive_depth: - :param policy_kwargs: - :return: - """ - pass - - @abstractmethod - def build_hierarchy_adaptive( - self, - xyz: torch.Tensor, - xyz_density: torch.Tensor, - log_base: float = 4.0, - min_density: float = 8.0, - limit_adaptive_depth: int = 100, - ) -> torch.Tensor: - """ - Build the hierarchy by first determine the integer level of each point (based on xyz_density, log_base and - min_density), then splat the points onto the tree structure. - :param xyz: (N, 3) torch.Tensor - :param xyz_density: (N, ) float torch.Tensor - :param log_base: float - :param min_density: float, minimum density in each voxel. If exceed, go to coarser level. - :param limit_adaptive_depth: int. Maximum adaptive number of levels. - :return torch.Tensor long. (N, ) level that the point lies in. - """ - pass - - @abstractmethod - def update_coords(self, depth: int, coords: Union[torch.Tensor, None]): - """ - Update the structure of the tree. This is mainly used during decoder's structure building stage. - For now you could assume that the structure at depth does not exist yet. - But I think we should have some general function that alters the tree structure. - :param depth: int - :param coords: torch.Tensor (N, 3) or None, if None, then this layer would be empty. - :return: - - new_coords: torch.Tensor (N, 3) - - permutation: torch.Tensor (N, ): - f[p] maps f from input-seq to fvdb-seq - p[i] maps i from fvdb-seq to input-seq - """ - pass - - @abstractmethod - def trilinear_interpolate( - self, - queries: torch.Tensor, - depth: int, - feature: torch.Tensor, - feature_bg: torch.Tensor = None, - compute_grad: bool = False, - ): - """ - Trilinearly interpolate the features, this is very similar to self.splat. - Maybe merge them in the future. 
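
Since the docstring above notes that trilinear interpolation is "very similar to self.splat": the two are transposes of each other. A toy 1-D sketch (an assumed illustration, not the hierarchy code itself) where interpolation multiplies voxel data by a sparse weight matrix P and splatting multiplies point data by P.T:

    import torch

    def interp_weights(x, n_vox):
        # Linear interpolation weights on a 1-D grid with unit spacing.
        i0 = x.floor().long().clamp(0, n_vox - 2)
        w1 = (x - i0.float()).clamp(0.0, 1.0)
        P = torch.zeros(x.numel(), n_vox)
        rows = torch.arange(x.numel())
        P[rows, i0] = 1.0 - w1
        P[rows, i0 + 1] = w1
        return P

    P = interp_weights(torch.tensor([0.25, 1.5, 2.75]), n_vox=4)
    split = P @ torch.tensor([1.0, 2.0, 3.0, 4.0])   # voxels -> points: interpolate ("split")
    splat = P.T @ torch.tensor([10.0, 20.0, 30.0])   # points -> voxels: the transpose ("splat")
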
- :param queries: - :param depth: - :param feature: - :param feature_bg: - :param compute_grad: - :return: - """ - pass diff --git a/fvdb/tests/unit/nkfw_api/backend/fvdb.py b/fvdb/tests/unit/nkfw_api/backend/fvdb.py deleted file mode 100644 index cd7fc56f70..0000000000 --- a/fvdb/tests/unit/nkfw_api/backend/fvdb.py +++ /dev/null @@ -1,301 +0,0 @@ -# Copyright Contributors to the OpenVDB Project -# SPDX-License-Identifier: Apache-2.0 -# -import torch -import torch_scatter - -import fvdb - -from .abc import BaseBackend -from .hash_table import torch_unique - -print("SparseFeatureHierarchy Backend: fVDB 0.0.0") - - -class SparseFeatureHierarchy(BaseBackend): - - CONFORM_OFFSETS = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 0, 0), (1, 0, 1), (1, 1, 0), (1, 1, 1)] - - def __init__(self, depth: int, voxel_size: float, device, range_kernel): - super().__init__(depth, voxel_size, device, range_kernel) - - self._depth = depth - self._voxel_size = voxel_size - self._device = device - self._range_kernel = range_kernel - self._vox_sizes = [voxel_size * (2**d) for d in range(depth)] - self._indexes = [fvdb.GridBatch(device=device) for d in range(self.depth)] - - @property - def depth(self): - return self._depth - - @property - def voxel_size(self): - return self._indexes[0].voxel_sizes[0] - - def get_stride(self, depth: int): - return 2**depth - - def get_coords(self, depth: int, expand: int = 0, conforming: bool = False): - scale = 2**depth - if self._indexes[depth].total_voxels == 0: - return torch.zeros(0, 3, device=self._device, dtype=torch.int32) - - base_coords = self._indexes[depth].ijk.jdata.int() - - if expand >= 3: - mc_offsets = self._range_kernel()(expand) * scale - base_coords = ( - base_coords.unsqueeze(dim=1).repeat(1, mc_offsets.size(0), 1) + mc_offsets.unsqueeze(0) - ).view(-1, 3) - base_coords = torch_unique(base_coords, dim=0) - - if conforming: - base_coords = (base_coords / scale / 2.0).floor().int() * scale * 2 - base_coords = torch_unique(base_coords, dim=0) - conform_offsets = torch.tensor(self.CONFORM_OFFSETS, dtype=torch.int32, device=base_coords.device) * scale - base_coords = (base_coords.unsqueeze(dim=1).repeat(1, 8, 1) + conform_offsets.unsqueeze(0)).view(-1, 3) - - return base_coords - - def get_num_voxels(self, depth: int): - return self._indexes[depth].total_voxels - - def get_voxel_centers(self, depth: int, normalized: bool = False): - return (self.get_coords(depth) + 2**depth / 2.0) * (self._voxel_size if not normalized else 1.0) - - def __repr__(self): - return "fVDB" - - def get_coords_neighbours( - self, - source_coords: torch.Tensor, - source_stride: int, - target_depth: int, - nn_kernel: torch.Tensor, - conv_based: bool = False, - transposed: bool = False, - raw: bool = False, - ): - assert 0 <= target_depth < self._depth - - target_stride = 2**target_depth - if not conv_based: - # Flaw: If the layers are different (source stride < target stride), you may end up with - # neighbours that has no overlap support. - assert source_stride <= target_stride, "Data must be deeper and has more nodes." - # Compute voxel center offsets. - quantized_source_coords = ( - torch.div(source_coords.detach() + 0.5 * source_stride, target_stride, rounding_mode="floor").int() - * target_stride - ) - c_offset = (quantized_source_coords - source_coords) / source_stride + ( - target_stride // source_stride - 1 - ) / 2.0 - else: - assert not source_coords.requires_grad - assert source_stride >= target_stride, "Data must be sparser and shallower." 
- quantized_source_coords = source_coords - - # (N, 3) x (K, 3) -> (K, N, 3) - queried_coords = quantized_source_coords.unsqueeze(0) + (nn_kernel * 2**target_depth).unsqueeze(1) - hash_res = self._indexes[target_depth].ijk_to_index(queried_coords.reshape(-1, 3)) - hash_res = hash_res.jdata.reshape(-1, quantized_source_coords.size(0)) - - if transposed: - hash_res = hash_res.T - - if raw: - return hash_res - - nbsizes = torch.sum(hash_res != -1, dim=1) - - if transposed: - source_ids, kernel_ids = torch.where(hash_res != -1) - target_ids = hash_res[source_ids, kernel_ids] - else: - kernel_ids, source_ids = torch.where(hash_res != -1) - target_ids = hash_res[kernel_ids, source_ids] - - neighbour_types = nn_kernel[kernel_ids] - - if not conv_based: - neighbour_types = neighbour_types.float() - neighbour_types *= 2**target_depth / source_stride - neighbour_types += c_offset[source_ids, :3] - - return source_ids, target_ids, neighbour_types, nbsizes - - def get_self_neighbours(self, source_depth: int, target_depth: int, target_range: int, conv_based: bool = False): - assert 0 <= source_depth < self.depth and 0 <= target_depth < self.depth - - # conv_based flag will be ignored if source-depth == target-depth, because this is anyway - # covered in both situations. - inv_op = False - if not conv_based and source_depth != target_depth: - # In the case where source is shallower/fewer than target, we inverse the operation - if source_depth > target_depth: - source_depth, target_depth, inv_op = target_depth, source_depth, True - - def recover_inv_op(inv_src_ids, inv_tgt_ids, inv_nts, inv_nbs): - if not inv_op: - return inv_src_ids, inv_tgt_ids, inv_nts, inv_nbs - else: - near_mask = torch.all(inv_nts.abs() < target_range / 2.0 + 1.0e-6, dim=1) - inv_nts = -inv_nts * 2 ** (source_depth - target_depth) - return inv_tgt_ids[near_mask], inv_src_ids[near_mask], inv_nts[near_mask], None - - # Only compute incremental part: - neighbour_kernel = self._range_kernel()(target_range) - source_ids, target_ids, neighbour_types, nbsizes = self.get_coords_neighbours( - self._indexes[source_depth].ijk.jdata, 2**source_depth, target_depth, neighbour_kernel, conv_based - ) - - return recover_inv_op(source_ids, target_ids, neighbour_types, nbsizes) - - def evaluate_voxel_status(self, coords: torch.Tensor, depth: int): - raise NotImplementedError - - def split_data(self, xyz: torch.Tensor, data_depth: int, data: torch.Tensor): - raise NotImplementedError - - def _trilinear_weights( - self, xyz: torch.Tensor, tree_stride: int, xyz_data: torch.Tensor = 1, compute_grad: bool = False - ): - # Gradient is alpha_data w.r.t. xyz. - q_coords = xyz / self._voxel_size - d_coords = (q_coords / tree_stride).floor() * tree_stride - rel_coords = q_coords - d_coords - tree_stride / 2.0 - oct_sign = torch.sign(rel_coords) - oct_local = torch.abs(rel_coords) / tree_stride - - alpha_coords = [] - alpha_data = [] - grad_alpha_data = [] - for nx, ny, nz in self.CONFORM_OFFSETS: - alpha_coords.append( - ( - d_coords - + torch.stack([nx * oct_sign[:, 0], ny * oct_sign[:, 1], nz * oct_sign[:, 2]], dim=1) * tree_stride - ).int() - ) - alpha_x = oct_local[:, 0] if nx == 1 else 1 - oct_local[:, 0] - alpha_y = oct_local[:, 1] if ny == 1 else 1 - oct_local[:, 1] - alpha_z = oct_local[:, 2] if nz == 1 else 1 - oct_local[:, 2] - alpha_os = alpha_x * alpha_y * alpha_z - - if compute_grad: - assert xyz_data == 1, "What do you want?" 
- d_alpha_x = (oct_sign[:, 0] if nx == 1 else -oct_sign[:, 0]) / (self._voxel_size * tree_stride) - d_alpha_y = (oct_sign[:, 1] if ny == 1 else -oct_sign[:, 1]) / (self._voxel_size * tree_stride) - d_alpha_z = (oct_sign[:, 2] if nz == 1 else -oct_sign[:, 2]) / (self._voxel_size * tree_stride) - grad_alpha_data.append( - torch.stack( - [d_alpha_x * alpha_y * alpha_z, alpha_x * d_alpha_y * alpha_z, alpha_x * alpha_y * d_alpha_z], - dim=1, - ) - ) - - alpha_data.append( - alpha_os * xyz_data if isinstance(xyz_data, int) or xyz_data.ndim == 1 else alpha_os[:, None] * xyz_data - ) - alpha_coords = torch.cat(alpha_coords, dim=0) - alpha_data = torch.cat(alpha_data, dim=0) - - if compute_grad: - return alpha_coords, alpha_data, torch.cat(grad_alpha_data, dim=0) - - return alpha_coords, alpha_data - - def _identity_kernel(self): - return torch.tensor([[0, 0, 0]], dtype=torch.int32, device=self._device) - - def splat_data( - self, - xyz: torch.Tensor, - data_depth: int, - data: torch.Tensor = None, - check_corr: bool = True, - return_nf_mask: bool = False, - ): - """ - Splat the data onto the tree with tri-linear interpolation. - :param xyz: data position - :param data_depth: depth of the octree to splat onto. - :param data: (N,) or (N,C) None means all ones, weight should be pre-multiplied to data if applicable - :return: (V,) or (V,C). - """ - if data is not None: - assert data.size(0) == xyz.size(0), "Input data must agree with xyz in size." - else: - data = 1 - - tree_stride = 2**data_depth - alpha_coords, alpha_data = self._trilinear_weights(xyz, tree_stride, data) - - # align normal_coords and tree_coords. - alpha_source, alpha_target, _, nb_sizes = self.get_coords_neighbours( - alpha_coords, tree_stride, data_depth, self._identity_kernel(), transposed=True - ) - - # Make sure that each query coordinates has one correspondent: - if alpha_source.size(0) < alpha_coords.size(0) and check_corr: - print( - "Warning: Some grids that normal should be splatted onto is missing because expansion is too small. " - f"# Should = {alpha_coords.size(0)}, Actual = {alpha_source.size(0)}." - ) - splat_res = torch_scatter.scatter_sum( - alpha_data[alpha_source], alpha_target, dim=0, dim_size=self.get_num_voxels(data_depth) - ) - if return_nf_mask: - # If a point can only be splatted on to less than 4 voxels, it is a bad splat. 
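
The accumulation in splat_data above is a scatter-sum: each (point, corner) contribution adds a weighted value into the voxel selected by alpha_target. A self-contained toy illustration of that torch_scatter pattern (values invented for the example):

    import torch
    import torch_scatter

    weighted = torch.tensor([[1.0], [2.0], [4.0]])  # per-contribution data, like alpha_data[alpha_source]
    voxel_id = torch.tensor([0, 1, 1])              # destination voxel per contribution, like alpha_target
    out = torch_scatter.scatter_sum(weighted, voxel_id, dim=0, dim_size=2)
    # out is [[1.0], [6.0]]: voxel 0 receives 1.0, voxel 1 receives 2.0 + 4.0
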
- return splat_res, nb_sizes.reshape(8, -1).sum(0) < 4 - return splat_res - - def build_hierarchy_dense(self, xyz: torch.Tensor, expand_range: int = 0): - raise NotImplementedError - - def build_hierarchy_subdivide( - self, - xyz: torch.Tensor, - subdivide_policy, - expand: bool = False, - limit_adaptive_depth: int = 100, - **policy_kwargs, - ): - raise NotImplementedError - - def build_hierarchy_adaptive( - self, - xyz: torch.Tensor, - xyz_density: torch.Tensor, - log_base: float = 4.0, - min_density: float = 8.0, - limit_adaptive_depth: int = 100, - ): - raise NotImplementedError - - def update_coords(self, depth: int, coords: torch.Tensor): - if coords is None: - return - assert coords.ndim == 2 and coords.size(1) == 3, coords.size() - self._indexes[depth].set_from_ijk(coords, [0, 0, 0], [0, 0, 0], voxel_sizes=self._vox_sizes[depth]) - coords_idx = self._indexes[depth].ijk_to_index(coords) - permutation = torch.empty(coords.size(0), dtype=torch.long, device=self._device) - permutation[coords_idx.jdata] = torch.arange(coords.size(0), dtype=torch.long, device=self._device) - return coords[permutation], permutation - - def trilinear_interpolate( - self, - queries: torch.Tensor, - depth: int, - feature: torch.Tensor, - feature_bg: torch.Tensor = None, - compute_grad: bool = False, - ): - raise NotImplementedError - - -if __name__ == "__main__": - pass diff --git a/fvdb/tests/unit/nkfw_api/backend/hash_table.py b/fvdb/tests/unit/nkfw_api/backend/hash_table.py deleted file mode 100644 index a767d26446..0000000000 --- a/fvdb/tests/unit/nkfw_api/backend/hash_table.py +++ /dev/null @@ -1,645 +0,0 @@ -# Copyright Contributors to the OpenVDB Project -# SPDX-License-Identifier: Apache-2.0 -# -from typing import List, Union - -import numpy as np -import torch -import torch_scatter - -from ..ext import CuckooHashTable -from .abc import BaseBackend - -print("SparseFeatureHierarchy Backend: Hash Table") - - -def torch_unique( - input: torch.Tensor, - sorted: bool = False, - return_inverse: bool = False, - return_counts: bool = False, - dim: int = None, -): - """ - If used with dim, then torch.unique will return a flattened tensor. This fixes that behaviour. - :param input: (Tensor) – the input tensor - :param sorted: (bool) – Whether to sort the unique elements in ascending order before returning as output. - :param return_inverse: (bool) – Whether to also return the indices for where elements in the original input - ended up in the returned unique list. - :param return_counts: (bool) – Whether to also return the counts for each unique element. - :param dim: (int) – the dimension to apply unique. If None, the unique of the flattened input is returned. - default: None - :return: output, inverse_indices, counts - """ - res = torch.unique(input, sorted, return_inverse, return_counts, dim) - - if dim is not None and input.size(dim) == 0: - output_size = list(input.size()) - output_size[dim] = 0 - if isinstance(res, torch.Tensor): - res = res.reshape(output_size) - else: - res = list(res) - res[0] = res[0].reshape(output_size) - - return res - - -class NeighbourMaps: - """ - A cache similar to kernel map, without the need of re-computing everything when enlarging neighbourhoods. - """ - - def __init__(self, device): - # Cached maps (src-depth, tgt-depth) -> (tgt-neighbour-size 1,3,5, src-id, tgt-id, neighbour-types, nbsizes) - # Note: none of the relevant range here is in strided format! 
- self.cache = {} - self.device = device - - def get_map(self, source_depth: int, target_depth: int, target_range: int, force_recompute: bool = False): - """ - Given the query, return the existing part and also the part needed to be queried. - :return: tuple (src-id, tgt-id, neighbour-types, nbsizes, ranges lacked [a,b] ) - """ - if (source_depth, target_depth) in self.cache.keys(): - if force_recompute: - del self.cache[(source_depth, target_depth)] - max_range, exist_src, exist_tgt, exist_nt, exist_nbs = -1, None, None, None, None - else: - max_range, exist_src, exist_tgt, exist_nt, exist_nbs = self.cache[(source_depth, target_depth)] - else: - max_range, exist_src, exist_tgt, exist_nt, exist_nbs = -1, None, None, None, None - - if target_range == max_range: - return exist_src, exist_tgt, exist_nt, exist_nbs, None - elif target_range < max_range: - tr3 = target_range * target_range * target_range - n_query = torch.sum(exist_nbs[:tr3]) - return exist_src[:n_query], exist_tgt[:n_query], exist_nt[:n_query], exist_nbs[:tr3], None - else: - return exist_src, exist_tgt, exist_nt, exist_nbs, [max_range + 2, target_range] - - def update_map(self, source_depth: int, target_depth: int, target_range: int, res: list): - self.cache[(source_depth, target_depth)] = [target_range] + res - - -class SparseFeatureHierarchy(BaseBackend): - - CONFORM_OFFSETS = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 0, 0), (1, 0, 1), (1, 1, 0), (1, 1, 1)] - - def __init__(self, depth: int, voxel_size: float, device, range_kernel): - self._depth = depth - self._voxel_size = voxel_size - self._device = device - self._range_kernel = range_kernel - - # Conv-based include same-level - self._conv_nmap = NeighbourMaps(self._device) - # Region-based exclude same-level - self._region_nmap = NeighbourMaps(self._device) - - self._strides = [2**d for d in range(self.depth)] - # List of torch.Tensor (Nx3) - self._coords = [None for d in range(self.depth)] - self._hash_table: List[CuckooHashTable] = [None for d in range(self.depth)] - - @property - def depth(self): - return self._depth - - @property - def voxel_size(self): - return self._voxel_size - - def get_stride(self, depth: int): - return self._strides[depth] - - def get_coords(self, depth: int, expand: int = 0, conforming: bool = False): - scale = self._strides[depth] - base_coords = self._coords[depth] - - if expand >= 3: - mc_offsets = self._range_kernel()(expand) * scale - base_coords = ( - base_coords.unsqueeze(dim=1).repeat(1, mc_offsets.size(0), 1) + mc_offsets.unsqueeze(0) - ).view(-1, 3) - base_coords = torch_unique(base_coords, dim=0) - - if conforming: - base_coords = (base_coords / scale / 2.0).floor().int() * scale * 2 - base_coords = torch_unique(base_coords, dim=0) - conform_offsets = torch.tensor(self.CONFORM_OFFSETS, dtype=torch.int32, device=base_coords.device) * scale - base_coords = (base_coords.unsqueeze(dim=1).repeat(1, 8, 1) + conform_offsets.unsqueeze(0)).view(-1, 3) - - return base_coords - - def get_num_voxels(self, depth: int): - return self._coords[depth].size(0) if self._coords[depth] is not None else 0 - - def get_voxel_centers(self, depth: int, normalized: bool = False): - return (self.get_coords(depth) + self._strides[depth] / 2.0) * (self._voxel_size if not normalized else 1.0) - - def __repr__(self): - stat = f"Depth={self.depth}:\n" - for stride, coords in zip(self._strides, self._coords): - if coords is None: - stat += f" + [{stride}] Empty\n" - continue - c_min = torch.min(coords, dim=0).values - c_max = torch.max(coords, dim=0).values - 
stat += ( - f" + [{stride}] #Voxels={coords.size(0)} " - f"Bound=[{c_min[0]},{c_max[0]}]x[{c_min[1]},{c_max[1]}]x[{c_min[2]},{c_max[2]}]\n" - ) - return stat - - def _update_hash_table(self): - for d in range(self.depth): - self._hash_table[d] = CuckooHashTable(data=self._coords[d]) - assert self._hash_table[d].dim == 3 - - def _trilinear_weights( - self, xyz: torch.Tensor, tree_stride: int, xyz_data: torch.Tensor = 1, compute_grad: bool = False - ): - # Gradient is alpha_data w.r.t. xyz. - q_coords = xyz / self._voxel_size - d_coords = (q_coords / tree_stride).floor() * tree_stride - rel_coords = q_coords - d_coords - tree_stride / 2.0 - oct_sign = torch.sign(rel_coords) - oct_local = torch.abs(rel_coords) / tree_stride - - alpha_coords = [] - alpha_data = [] - grad_alpha_data = [] - for nx, ny, nz in self.CONFORM_OFFSETS: - alpha_coords.append( - ( - d_coords - + torch.stack([nx * oct_sign[:, 0], ny * oct_sign[:, 1], nz * oct_sign[:, 2]], dim=1) * tree_stride - ).int() - ) - alpha_x = oct_local[:, 0] if nx == 1 else 1 - oct_local[:, 0] - alpha_y = oct_local[:, 1] if ny == 1 else 1 - oct_local[:, 1] - alpha_z = oct_local[:, 2] if nz == 1 else 1 - oct_local[:, 2] - alpha_os = alpha_x * alpha_y * alpha_z - - if compute_grad: - assert xyz_data == 1, "What do you want?" - d_alpha_x = (oct_sign[:, 0] if nx == 1 else -oct_sign[:, 0]) / (self._voxel_size * tree_stride) - d_alpha_y = (oct_sign[:, 1] if ny == 1 else -oct_sign[:, 1]) / (self._voxel_size * tree_stride) - d_alpha_z = (oct_sign[:, 2] if nz == 1 else -oct_sign[:, 2]) / (self._voxel_size * tree_stride) - grad_alpha_data.append( - torch.stack( - [d_alpha_x * alpha_y * alpha_z, alpha_x * d_alpha_y * alpha_z, alpha_x * alpha_y * d_alpha_z], - dim=1, - ) - ) - - alpha_data.append( - alpha_os * xyz_data if isinstance(xyz_data, int) or xyz_data.ndim == 1 else alpha_os[:, None] * xyz_data - ) - alpha_coords = torch.cat(alpha_coords, dim=0) - alpha_data = torch.cat(alpha_data, dim=0) - - if compute_grad: - return alpha_coords, alpha_data, torch.cat(grad_alpha_data, dim=0) - - return alpha_coords, alpha_data - - def get_coords_neighbours( - self, - source_coords: torch.Tensor, - source_stride: int, - target_depth: int, - nn_kernel: torch.Tensor, - conv_based: bool = False, - transposed: bool = False, - raw: bool = False, - ): - """ - A generic interface for querying neighbourhood information. (This is without cache) - For all source (data), find all target whose neighbourhood (in target level) covers it, - will also return the relative position of the two. - :param nn_kernel: Unit is 1 - :param transposed: allows efficient per-source handling. - """ - assert 0 <= target_depth < self._depth - - if not conv_based: - # Flaw: If the layers are different (source stride < target stride), you may end up with - # neighbours that has no overlap support. - assert source_stride <= self._strides[target_depth], "Data must be deeper and has more nodes." - # Compute voxel center offsets. - quantized_source_coords = ( - torch.div( - source_coords.detach() + 0.5 * source_stride, self._strides[target_depth], rounding_mode="floor" - ).int() - * self._strides[target_depth] - ) - c_offset = (quantized_source_coords - source_coords) / source_stride + ( - self._strides[target_depth] // source_stride - 1 - ) / 2.0 - else: - assert not source_coords.requires_grad - assert source_stride >= self._strides[target_depth], "Data must be sparser and shallower." 
- quantized_source_coords = source_coords - - hash_res = self._hash_table[target_depth].query( - quantized_source_coords, nn_kernel * self._strides[target_depth] - ) # (K, N) - - if transposed: - hash_res = hash_res.T - - if raw: - return hash_res - - nbsizes = torch.sum(hash_res != -1, dim=1) - - if transposed: - source_ids, kernel_ids = torch.where(hash_res != -1) - target_ids = hash_res[source_ids, kernel_ids] - else: - kernel_ids, source_ids = torch.where(hash_res != -1) - target_ids = hash_res[kernel_ids, source_ids] - - neighbour_types = nn_kernel[kernel_ids] - - if not conv_based: - neighbour_types = neighbour_types.float() - neighbour_types *= self._strides[target_depth] / source_stride - neighbour_types += c_offset[source_ids, :3] - - return source_ids, target_ids, neighbour_types, nbsizes - - def get_self_neighbours(self, source_depth: int, target_depth: int, target_range: int, conv_based: bool = False): - """ - :param source_depth: source depth where you want the coord id to start from - :param target_depth: target depth where you want the coord id to shoot to - :param target_range: must be odd, logical neighbourhood range to search for, e.g. 5 for B2 basis. - :return: [sid, tid] - """ - assert 0 <= source_depth < self.depth and 0 <= target_depth < self.depth - - tree_coords, tree_strides = self._coords, self._strides - - # conv_based flag will be ignored if source-depth == target-depth, because this is anyway - # covered in both situations. - inv_op = False - if not conv_based and source_depth != target_depth: - neighbour_maps = self._region_nmap - # In the case where source is shallower/fewer than target, we inverse the operation - if source_depth > target_depth: - source_depth, target_depth, inv_op = target_depth, source_depth, True - else: - neighbour_maps = self._conv_nmap - - def recover_inv_op(inv_src_ids, inv_tgt_ids, inv_nts, inv_nbs): - if not inv_op: - return inv_src_ids, inv_tgt_ids, inv_nts, inv_nbs - else: - # Filter far away nodes. - near_mask = torch.all(inv_nts.abs() < target_range / 2.0 + 1.0e-6, dim=1) - # Convert back neighbour types. - inv_nts = -inv_nts / tree_strides[target_depth] * tree_strides[source_depth] - return inv_tgt_ids[near_mask], inv_src_ids[near_mask], inv_nts[near_mask], None - - exist_src, exist_tgt, exist_nt, exist_nbs, lack_range = neighbour_maps.get_map( - source_depth, target_depth, target_range - ) - - if lack_range is None: - return recover_inv_op(exist_src, exist_tgt, exist_nt, exist_nbs) - - # Only compute incremental part: - neighbour_kernel = self._range_kernel()(target_range) - starting_lap = max(0, lack_range[0] - 2) - starting_lap = starting_lap**3 - neighbour_kernel = neighbour_kernel[starting_lap:] - - source_ids, target_ids, neighbour_types, nbsizes = self.get_coords_neighbours( - tree_coords[source_depth], tree_strides[source_depth], target_depth, neighbour_kernel, conv_based - ) - - if exist_src is not None: - source_ids = torch.cat([exist_src, source_ids], dim=0) - target_ids = torch.cat([exist_tgt, target_ids], dim=0) - neighbour_types = torch.cat([exist_nt, neighbour_types], dim=0) - nbsizes = torch.cat([exist_nbs, nbsizes], dim=0) - - # Cache result for future use. 
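
A worked example of the incremental caching above, assuming the range kernel enumerates offsets so that each smaller odd range is a prefix of the next (which the prefix slicing in NeighbourMaps.get_map relies on):

    # Enlarging a cached range-3 neighbour map to range 5:
    cached_range, target_range = 3, 5
    already_queried = cached_range ** 3          # 27 offsets covered by the cache
    lacking = [cached_range + 2, target_range]   # get_map reports [5, 5]
    starting_lap = max(0, lacking[0] - 2) ** 3   # 27: skip the cached prefix of the kernel
    new_queries = target_range ** 3 - starting_lap
    assert new_queries == 98                     # only 98 of the 125 offsets hit the hash table
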
- neighbour_maps.update_map( - source_depth, target_depth, target_range, [source_ids, target_ids, neighbour_types, nbsizes] - ) - - return recover_inv_op(source_ids, target_ids, neighbour_types, nbsizes) - - def evaluate_voxel_status(self, coords: torch.Tensor, depth: int): - """ - Evaluate the voxel status of given coordinates - :param coords: (N, 3) - :param depth: int - :return: (N, ) long tensor, with value 0,1,2 - """ - from core.hashtree import VoxelStatus - - status = torch.full((coords.size(0),), VoxelStatus.VS_NON_EXIST.value, dtype=torch.long, device=coords.device) - sidx, _, _, _ = self.get_coords_neighbours( - coords, self._strides[depth], depth, self._identity_kernel(), conv_based=True - ) - status[sidx] = VoxelStatus.VS_EXIST_STOP.value - - if depth > 0: - # Next level. - conform_offsets = ( - torch.tensor(self.CONFORM_OFFSETS, dtype=torch.int32, device=self._device) * self._strides[depth - 1] - ) - conform_coords = (coords[sidx].unsqueeze(dim=1).repeat(1, 8, 1) + conform_offsets.unsqueeze(0)).view(-1, 3) - qidx, _, _, _ = self.get_coords_neighbours( - conform_coords, self._strides[depth - 1], depth - 1, self._identity_kernel(), conv_based=True - ) - qidx = torch.div(qidx, 8, rounding_mode="floor") - status[sidx[qidx]] = VoxelStatus.VS_EXIST_CONTINUE.value - - return status - - def split_data(self, xyz: torch.Tensor, data_depth: int, data: torch.Tensor): - """ - Split the data from the tree to query positions, with tri-linear interpolations. - This is the inverse operation of the splat function, used in decoders. - :param xyz: query positions. - :param data_depth: depth of the octree to split from - :param data: (V, C). - :return: (N, C) - """ - tree_stride = self._strides[data_depth] - assert data.size(0) == self._coords[data_depth].size(0), "Tree data does not agree on size." - - alpha_coords, alpha_weight = self._trilinear_weights(xyz, tree_stride) - alpha_source, alpha_target, _, _ = self.get_coords_neighbours( - alpha_coords, tree_stride, data_depth, self._identity_kernel() - ) - return torch_scatter.scatter_sum( - data[alpha_target] * alpha_weight[alpha_source, None], - alpha_source % xyz.size(0), - dim=0, - dim_size=xyz.size(0), - ) - - def splat_data( - self, - xyz: torch.Tensor, - data_depth: int, - data: torch.Tensor = None, - check_corr: bool = True, - return_nf_mask: bool = False, - ): - """ - Splat the data onto the tree with tri-linear interpolation. - :param xyz: data position - :param data_depth: depth of the octree to splat onto. - :param data: (N,) or (N,C) None means all ones, weight should be pre-multiplied to data if applicable - :return: (V,) or (V,C). - """ - if data is not None: - assert data.size(0) == xyz.size(0), "Input data must agree with xyz in size." - else: - data = 1 - - tree_stride = self._strides[data_depth] - alpha_coords, alpha_data = self._trilinear_weights(xyz, tree_stride, data) - - # align normal_coords and tree_coords. - alpha_source, alpha_target, _, nb_sizes = self.get_coords_neighbours( - alpha_coords, tree_stride, data_depth, self._identity_kernel(), transposed=True - ) - - # Make sure that each query coordinates has one correspondent: - if alpha_source.size(0) < alpha_coords.size(0) and check_corr: - print( - "Warning: Some grids that normal should be splatted onto is missing because expansion is too small. " - f"# Should = {alpha_coords.size(0)}, Actual = {alpha_source.size(0)}." 
- ) - splat_res = torch_scatter.scatter_sum( - alpha_data[alpha_source], alpha_target, dim=0, dim_size=self._coords[data_depth].size(0) - ) - if return_nf_mask: - # If a point can only be splatted on to less than 4 voxels, it is a bad splat. - return splat_res, nb_sizes.reshape(8, -1).sum(0) < 4 - return splat_res - - def _quantize_coords(self, xyz: torch.Tensor, data_depth: int): - # Note this is just splat_data with NEW_BRANCH. - tree_stride = self._strides[data_depth] - alpha_coords, _ = self._trilinear_weights(xyz, tree_stride) - alpha_coords = torch_unique(alpha_coords, dim=0) - return alpha_coords - - def build_hierarchy_dense(self, xyz: torch.Tensor, expand_range: int = 0): - """ - Rebuild the tree structure, based on current xyz, voxel_size and depth. - """ - if expand_range == 2: - unique_coords = self._quantize_coords(xyz, 0) - else: - coords = torch.div(xyz, self._voxel_size).floor().int() - unique_coords = torch_unique(coords, dim=0) - if expand_range > 0: - offsets = self._range_kernel()(expand_range) - my_pad = (unique_coords.unsqueeze(dim=1).repeat(1, offsets.size(0), 1) + offsets.unsqueeze(0)).view( - -1, 3 - ) - unique_coords = torch_unique(my_pad, dim=0) - - self._coords = [unique_coords] - for d in range(1, self.depth): - coords = torch.div(self._coords[-1], self._strides[d], rounding_mode="floor") * self._strides[d] - coords = torch_unique(coords, dim=0) - self._coords.append(coords) - self._update_hash_table() - - def build_hierarchy_subdivide( - self, - xyz: torch.Tensor, - subdivide_policy, - expand: bool = False, - limit_adaptive_depth: int = 100, - **policy_kwargs, - ): - """ - Build a hierarchy, based on subdivision policy - :return: - """ - current_pts = xyz / self._voxel_size - inv_mapping = None - xyz_depth = torch.full((xyz.size(0),), fill_value=self._depth - 1, device=self._device, dtype=torch.int) - xyz_depth_inds = torch.arange(xyz.size(0), device=self._device, dtype=torch.long) - - for d in range(self._depth - 1, -1, -1): - if d != self._depth - 1: - nxt_mask = subdivide_policy(current_pts, inv_mapping, **policy_kwargs) - current_pts = current_pts[nxt_mask] - xyz_depth_inds = xyz_depth_inds[nxt_mask] - policy_kwargs = {k: v[nxt_mask] if isinstance(v, torch.Tensor) else v for k, v in policy_kwargs.items()} - xyz_depth[xyz_depth_inds] -= 1 - coords = torch.div(current_pts, self.get_stride(d), rounding_mode="floor").int() * self._strides[d] - unique_coords, inv_mapping = torch_unique(coords, dim=0, return_inverse=True) - self._coords[d] = unique_coords - xyz_depth.clamp_(max=limit_adaptive_depth - 1) - - if expand: - self._coords = [] - for d in range(self.depth): - depth_samples = xyz[xyz_depth <= d] - coords = self._quantize_coords(depth_samples, d) - if depth_samples.size(0) == 0: - print(f"-- disregard level {d} due to insufficient samples!") - self._coords.append(coords) - self._update_hash_table() - - return xyz_depth - - def build_hierarchy_adaptive( - self, - xyz: torch.Tensor, - xyz_density: torch.Tensor, - log_base: float = 4.0, - min_density: float = 8.0, - limit_adaptive_depth: int = 100, - ): - """ - Build a hierarchy similar to Adaptive-OCNN, i.e., finest voxels does not cover all surfaces, - but only detailed parts. Coarse voxels, however, must cover all fine voxels. - However, in a uniform dense sampling case, this falls back to the build_encoder_hierarchy_dense case. - :param log_base: (float), used to determine how to split depths, the smaller, the more levels are going - to be used. 
4 is chosen in the original paper, which matches the 2-manifold structure. - :param min_density: (float). The minimum normalized density (Unit: #points/voxel) to have for each point, - so that when the density is smaller than this threshold, a coarser voxel is used for splatting this point. - Any points with density larger than this threshold will be put to level-0. - Note: This should be kept fixed most of the time because having too few samples within a voxel is bound - :param limit_adaptive_depth: (int) depth limitation - to fail and lead to holes. Tune voxel size or sub-sample point to get what you want. - """ - # Compute expected depth. - xyz_depth = -(torch.log(xyz_density / min_density) / np.log(log_base)).floor().int().clamp_(max=0) - xyz_depth.clamp_(max=min(self.depth - 1, limit_adaptive_depth - 1)) - - # self.xyz_depth = (self.xyz[:, 0] < 0.0).int() - # self.xyz_density = torch.ones((self.xyz.size(0),), device=self.device) - - # Determine octants by splatting. - self._coords = [] - for d in range(self.depth): - depth_samples = xyz[xyz_depth <= d] - coords = self._quantize_coords(depth_samples, d) - # if depth_samples.size(0) == 0: - # print(f"-- disregard level {d} due to insufficient samples!") - self._coords.append(coords) - - self._update_hash_table() - return xyz_depth - - def update_coords(self, depth: int, coords: Union[torch.Tensor, None]): - if coords is None: - coords = torch.zeros((0, 3), dtype=torch.int32, device=self._device) - assert coords.ndim == 2 and coords.size(1) == 3, coords.size() - self._coords[depth] = coords - self._hash_table[depth] = CuckooHashTable(data=self._coords[depth]) - return coords, torch.arange(coords.size(0), dtype=torch.long, device=coords.device) - - def _identity_kernel(self): - return torch.tensor([[0, 0, 0]], dtype=torch.int32, device=self._device) - - def _trilerp_light(self, queries: torch.Tensor, depth: int, feature: torch.Tensor, compute_grad: bool = False): - """ - This version use less memory... - """ - alpha_res = self._trilinear_weights(queries, self._strides[depth], compute_grad=compute_grad) - - if compute_grad: - alpha_coords, alpha_weight, grad_alpha_weight = alpha_res - else: - alpha_coords, alpha_weight = alpha_res - - # For the logic here refer to 'splat_data' - alpha_source, alpha_target, _, nb_sizes = self.get_coords_neighbours( - alpha_coords, self._strides[depth], depth, self._identity_kernel(), transposed=False - ) - - pts_source = alpha_source % queries.size(0) - depth_feature = torch_scatter.scatter_sum( - feature[alpha_target] * alpha_weight[alpha_source, None], pts_source, dim=0, dim_size=queries.size(0) - ) - - if compute_grad: - depth_grad = torch_scatter.scatter_sum( - feature[alpha_target][:, :, None] * grad_alpha_weight[alpha_source, None, :], - pts_source, - dim=0, - dim_size=queries.size(0), - ) - else: - depth_grad = None - - return depth_feature, depth_grad - - def trilinear_interpolate( - self, - queries: torch.Tensor, - depth: int, - feature: torch.Tensor, - feature_bg: torch.Tensor = None, - compute_grad: bool = False, - ): - if feature_bg is not None: - assert feature_bg.ndim == 1 - assert feature.size(1) == feature_bg.size(0), "Dimension not matched!" 
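
To make the depth rule in build_hierarchy_adaptive above concrete: a point moves one level coarser for every factor of log_base its density falls below min_density. A sketch with the default parameters (note this uses a floor at zero, i.e. clamp_(min=0); the deleted code writes clamp_(max=0), where a min-clamp appears to be the intent described in the docstring):

    import math

    def adaptive_depth(density, log_base=4.0, min_density=8.0):
        # depth 0 is the finest level; sparser points go to coarser levels
        return max(0, -math.floor(math.log(density / min_density, log_base)))

    assert adaptive_depth(8.0) == 0   # at or above min_density: finest level
    assert adaptive_depth(2.0) == 1   # 4x sparser: one level coarser
    assert adaptive_depth(0.5) == 2   # 16x sparser: two levels coarser
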
- else: - # Less memory version - # return self._trilerp_light(queries, depth, feature, compute_grad) - pass - - from ext import sparse_op - - nb_ids, nb_weight, nb_grad = sparse_op.trilerp( - self._hash_table[depth].object, queries, self.voxel_size, self._strides[depth], compute_grad - ) - - nb_ids = nb_ids.view(-1) - nb_weight = nb_weight.view(-1) - pts_ids = torch.tile(torch.arange(queries.size(0), device=queries.device)[:, None], (1, 8)).view(-1) - - nb_mask = nb_ids > -1 - depth_feature = torch_scatter.scatter_sum( - feature[nb_ids[nb_mask]] * nb_weight[nb_mask, None], pts_ids[nb_mask], dim=0, dim_size=queries.size(0) - ) - - if feature_bg is not None: - non_nb_mask = nb_ids == -1 - depth_feature += torch_scatter.scatter_sum( - feature_bg[None, :] * nb_weight[non_nb_mask, None], - pts_ids[non_nb_mask], - dim=0, - dim_size=queries.size(0), - ) - - if compute_grad: - nb_grad = nb_grad.view(-1, nb_grad.size(-1)) - depth_grad = torch_scatter.scatter_sum( - feature[nb_ids[nb_mask]][:, :, None] * nb_grad[nb_mask, None, :], - pts_ids[nb_mask], - dim=0, - dim_size=queries.size(0), - ) - # Most of nb_grad[non_nb_mask] should be zero though... - if feature_bg is not None: - depth_grad += torch_scatter.scatter_sum( - feature_bg[None, :, None] * nb_grad[non_nb_mask, None, :], - pts_ids[non_nb_mask], - dim=0, - dim_size=queries.size(0), - ) - else: - depth_grad = None - - return depth_feature, depth_grad diff --git a/fvdb/tests/unit/nkfw_api/ext/__init__.py b/fvdb/tests/unit/nkfw_api/ext/__init__.py deleted file mode 100644 index be7c02d533..0000000000 --- a/fvdb/tests/unit/nkfw_api/ext/__init__.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright Contributors to the OpenVDB Project -# SPDX-License-Identifier: Apache-2.0 -# -import glob -import os -import os.path -from pathlib import Path - -import torch -from torch.utils.cpp_extension import load - - -def load_torch_extension(name, additional_files=None, ignore_files=None, **kwargs): - if ignore_files is None: - ignore_files = [] - - if additional_files is None: - additional_files = [] - - def path_should_keep(pth): - for file_name in ignore_files: - if file_name in pth: - return False - return True - - base_path = Path(__file__).parent / name - cpp_files = glob.glob(str(base_path / "*.cpp"), recursive=True) - cpp_files = filter(path_should_keep, cpp_files) - cu_files = glob.glob(str(base_path / "*.cu"), recursive=True) - cu_files = filter(path_should_keep, cu_files) - - return load( - name="fvdb_test_" + name, - sources=list(cpp_files) + list(cu_files) + [base_path / t for t in additional_files], - verbose="COMPILE_VERBOSE" in os.environ.keys(), - extra_ldflags=["-L%s/lib" % os.environ.get("CONDA_PREFIX")] if os.environ.get("CONDA_PREFIX") else None, - **kwargs, - ) - - -common = load_torch_extension("common", extra_cflags=["-O2"], extra_cuda_cflags=["-O2", "-Xcompiler -fno-gnu-unique"]) - - -class CuckooHashTable: - # Note: This is supposed to be replaced by fVDB. - def __init__(self, data: torch.Tensor = None, hashed_data: torch.Tensor = None, enlarged: bool = False): - self.is_empty = False - if data is not None: - self.dim = data.size(1) - source_hash = self._sphash(data) - else: - self.dim = -1 # Never equals me. 
- source_hash = hashed_data - self.object = common.build_hash_table(source_hash, torch.tensor([]), enlarged) - - @classmethod - def _sphash(cls, coords: torch.Tensor, offsets=None) -> torch.Tensor: # Int64 - assert coords.dtype in [torch.int, torch.long], coords.dtype - coords = coords.contiguous() - if offsets is None: - assert coords.ndim == 2 and coords.shape[1] in [2, 3, 4], coords.shape - if coords.size(0) == 0: - return torch.empty((coords.size(0),), dtype=torch.int64, device=coords.device) - return common.hash_cuda(coords) - else: - assert offsets.dtype == torch.int, offsets.dtype - assert offsets.ndim == 2 and offsets.shape[1] == 3, offsets.shape - assert coords.ndim == 2 and coords.shape[1] in [3, 4], coords.shape - if coords.size(0) == 0 or offsets.size(0) == 0: - return torch.empty((offsets.size(0), coords.size(0)), dtype=torch.int64, device=coords.device) - offsets = offsets.contiguous() - return common.kernel_hash_cuda(coords, offsets) - - def query(self, coords, offsets=None): - assert coords.size(1) == self.dim - hashed_query = self._sphash(coords, offsets) - return self.query_hashed(hashed_query) - - def query_hashed(self, hashed_query: torch.Tensor): - sizes = hashed_query.size() - hashed_query = hashed_query.view(-1) - - if hashed_query.size(0) == 0: - return torch.zeros(sizes, dtype=torch.int64, device=hashed_query.device) - 1 - - output = common.hash_table_query(self.object, hashed_query.contiguous()) - output = (output - 1).view(*sizes) - - return output diff --git a/fvdb/tests/unit/nkfw_api/ext/common/bind.cpp b/fvdb/tests/unit/nkfw_api/ext/common/bind.cpp deleted file mode 100644 index a5615ca8f3..0000000000 --- a/fvdb/tests/unit/nkfw_api/ext/common/bind.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: Apache-2.0 -// -#include -#include "hashmap_cuda.cuh" -#include - -// Hash computation -at::Tensor hash_cuda(const at::Tensor idx); -at::Tensor kernel_hash_cuda(const at::Tensor idx, const at::Tensor kernel_offset); - -// Hash queries (kernel queries should be flattened beforehand) -HashLookupData build_hash_table(const at::Tensor hash_target, const at::Tensor idx_target, - bool enlarge); -at::Tensor hash_table_query(const HashLookupData &hash_data, const at::Tensor hash_query); - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("hash_cuda", &hash_cuda, ""); - m.def("kernel_hash_cuda", &kernel_hash_cuda, ""); - m.def("build_hash_table", &build_hash_table); - m.def("hash_table_query", &hash_table_query, ""); - py::class_(m, "HashLookupData"); -} diff --git a/fvdb/tests/unit/nkfw_api/ext/common/hashmap_cuda.cu b/fvdb/tests/unit/nkfw_api/ext/common/hashmap_cuda.cu deleted file mode 100644 index 483354def6..0000000000 --- a/fvdb/tests/unit/nkfw_api/ext/common/hashmap_cuda.cu +++ /dev/null @@ -1,399 +0,0 @@ -// Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: Apache-2.0 -// -#include -#include -#include -#include - -#include "hashmap_cuda.cuh" - -__device__ static uint64_t -atomicExch(uint64_t *addr, uint64_t val) { - return (uint64_t)atomicExch((unsigned long long int *)addr, (unsigned long long int)val); -} - -__global__ void -cuckooBucketKernel_Multi(uint64_t *const key_buf, uint64_t *const val_buf, const int size, - const uint64_t *const keys, const uint64_t *const vals, const int n, - int *const counters, const int num_buckets) { - // Get thread index. - int idx = threadIdx.x + blockIdx.x * blockDim.x; - - // Only threads within range are active. 
- if (idx < n) { - // Do 1st-level hashing to get bucket id, then do atomic add to get index - // inside the bucket. - uint64_t key = keys[idx]; - uint64_t val = vals ? vals[idx] : idx; - - int bucket_num = do_1st_hash(key, num_buckets); - int bucket_ofs = atomicAdd(&counters[bucket_num], 1); - - // Directly write the key into the table buffer. - if (bucket_ofs >= BUCKET_SIZE) { - printf("%d/%d ERROR: bucket overflow! (n=%d, bucket_num=%d/%d, key=%d)\n", bucket_ofs, - BUCKET_SIZE, n, bucket_num, num_buckets, key); - } else { - key_buf[bucket_num * BUCKET_SIZE + bucket_ofs] = key; - val_buf[bucket_num * BUCKET_SIZE + bucket_ofs] = val; - } - } -} - -__global__ void -cuckooInsertKernel_Multi(uint64_t *const key, uint64_t *const val, const uint64_t *const key_buf, - const uint64_t *const val_buf, const int size, - const FuncConfig *const hash_func_configs, const int num_funcs, - const int *const counters, const int num_buckets, const int evict_bound, - const int pos_width, int *const rehash_requests) { - // Create local cuckoo table in shared memory. Size passed in as the third - // kernel parameter. - extern __shared__ uint64_t local_key[]; - for (int i = 0; i < num_funcs; ++i) { - local_key[i * BUCKET_SIZE + threadIdx.x] = EMPTY_CELL; - } - - // might be useful - __syncthreads(); - - // Get thread index. - int idx = threadIdx.x + blockIdx.x * blockDim.x; - uint64_t cur_idx = idx; - - // Only threads within local bucket range are active. - if (threadIdx.x < counters[blockIdx.x]) { - // Set initial conditions. - uint64_t cur_key = key_buf[cur_idx]; - int cur_func = 0; - int evict_count = 0; - - // Start the test-kick-and-reinsert loops. - do { - int pos = do_2nd_hash(cur_key, hash_func_configs, cur_func, BUCKET_SIZE); - - uint64_t new_data = make_data(cur_idx + 1, cur_func, pos_width); - - uint64_t old_idx = atomicExch(&local_key[cur_func * BUCKET_SIZE + pos], new_data); - - if (old_idx != EMPTY_CELL) { - cur_idx = fetch_val(old_idx, pos_width) - 1; - // potential overflow here. It seems that cur_idx < 0 is possible! - cur_key = key_buf[cur_idx]; - cur_func = (fetch_func(old_idx, pos_width) + 1) % num_funcs; - evict_count++; - } else { - break; - } - - } while (evict_count < num_funcs * evict_bound); - - // If exceeds eviction bound, then needs rehashing. - if (evict_count >= num_funcs * evict_bound) { - atomicAdd(rehash_requests, 1); - } - } - - // Every thread write its responsible local slot into the global data table. - __syncthreads(); - for (int i = 0; i < num_funcs; ++i) { - uint64_t cur_idx = local_key[i * BUCKET_SIZE + threadIdx.x]; - if (cur_idx == EMPTY_CELL) { - continue; - } - int cur_func = fetch_func(cur_idx, pos_width); - cur_idx = fetch_val(cur_idx, pos_width) - 1; - key[i * size + idx] = key_buf[cur_idx]; - val[i * size + idx] = val_buf[cur_idx]; - } -} - -__global__ void -cuckooLookupKernel_Multi(const uint64_t *const keys, uint64_t *const results, const int n, - const uint64_t *const all_keys, const uint64_t *const all_vals, - const int size, const FuncConfig *const hash_func_configs, - const int num_funcs, const int num_buckets, const int pos_width) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - - // Only threads within range are active. 
- if (idx < n) { - uint64_t key = keys[idx]; - results[idx] = hashtable_lookup(all_keys, all_vals, size, hash_func_configs, num_funcs, - num_buckets, key); - } -} - -int -CuckooHashTableCuda_Multi::insert_vals(const uint64_t *const keys, const uint64_t *const vals, - uint64_t *d_key_buf, uint64_t *d_val_buf, uint64_t *d_key, - uint64_t *d_val, const int n) { - // - // Phase 1: Distribute keys into buckets. - // - - // Allocate GPU memory. - - int *d_counters = NULL; - - cudaMalloc((void **)&d_counters, _num_buckets * sizeof(int)); - - cudaMemset(d_counters, 0, _num_buckets * sizeof(int)); - - // Invoke bucket kernel. - if (n > 0) { - cuckooBucketKernel_Multi<<>>( - d_key_buf, d_val_buf, _size, keys, vals, n, d_counters, _num_buckets); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - - // - // Phase 2: Local cuckoo hashing. - // - - // Allocate GPU memory. - - cudaDeviceSynchronize(); - int *d_rehash_requests = NULL; - - cudaMalloc((void **)&d_rehash_requests, sizeof(int)); - - // Copy values onto GPU memory. - cudaMemcpy(_d_hash_func_configs, _hash_func_configs, _num_funcs * sizeof(FuncConfig), - cudaMemcpyHostToDevice); - - // Invoke insert kernel. Passes shared memory table size by the third - // argument. Loops until no rehashing needed. - - int rehash_count = 0; - do { - int rehash_requests = 0; - cudaMemset(d_rehash_requests, 0, sizeof(int)); - cuckooInsertKernel_Multi<<>>( - d_key, d_val, d_key_buf, d_val_buf, _size, _d_hash_func_configs, _num_funcs, d_counters, - _num_buckets, _evict_bound, _pos_width, d_rehash_requests); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - cudaMemcpy(&rehash_requests, d_rehash_requests, sizeof(int), cudaMemcpyDeviceToHost); - - if (rehash_requests == 0) { - break; - } else { - rehash_count++; - gen_hash_funcs(); - cudaMemcpy(_d_hash_func_configs, _hash_func_configs, _num_funcs * sizeof(FuncConfig), - cudaMemcpyHostToDevice); - } - } while (rehash_count < MAX_DEPTH); - - cudaDeviceSynchronize(); - - // Free GPU resources. - - if (d_counters != NULL) { - cudaFree(d_counters); - } - if (d_rehash_requests != NULL) { - cudaFree(d_rehash_requests); - } - - _inserted_size = n; - return (rehash_count < MAX_DEPTH) ? 
rehash_count : ERR_DEPTH; -} - -// kernel hashing: given data D and offset map K, generate D x K -// input N*4 int32 tensor, |K|*3 int32 tensor, output |K|*N int64 tensor -template -__global__ void -kernel_hash_kernel(int N, int K, const T *__restrict__ data, const int *__restrict__ kernel_offset, - int64_t *__restrict__ out) { - extern __shared__ int kernel_offset_local[]; - - for (int i = 0; i < K * 3; i++) { - kernel_offset_local[i] = kernel_offset[i]; - } - __syncthreads(); - - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int k = idx % K; - int i = idx / K; - T cur_coord[DimSize]; - if (i < N) { - data += i * DimSize; - for (int j = 0; j < 3; j++) { - cur_coord[j] = data[j] + kernel_offset[k * 3 + j]; - } - if (DimSize == 4) { - cur_coord[3] = (T)data[3]; - } - uint64_t hash = 14695981039346656037UL; - for (int j = 0; j < DimSize; j++) { - hash ^= (unsigned int)cur_coord[j]; - hash *= 1099511628211UL; - } - hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF); - out[k * N + i] = hash; - } -} - -template -void -kernel_hash_wrapper(int N, int K, const T *data, const int *kernel_offset, int64_t *out) { - kernel_hash_kernel - <<>>(N, K, data, kernel_offset, out); - C10_CUDA_KERNEL_LAUNCH_CHECK(); -} - -// hashing -// input N*4 int32 tensor output N*1 int64 tensor -template -__global__ void -hash_kernel(int N, const T *__restrict__ data, int64_t *__restrict__ out) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { - data += i * DimSize; - uint64_t hash = 14695981039346656037UL; - for (int j = 0; j < DimSize; j++) { - hash ^= (unsigned int)data[j]; - hash *= 1099511628211UL; - } - hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF); - out[i] = hash; - } -} - -template -void -hash_wrapper(int N, const T *data, int64_t *out) { - hash_kernel<<>>(N, data, out); - C10_CUDA_KERNEL_LAUNCH_CHECK(); -} - -at::Tensor -hash_cuda(const at::Tensor idx) { - int N = idx.size(0); - at::Tensor out = torch::zeros({ N }, at::device(idx.device()).dtype(at::ScalarType::Long)); - - switch (idx.size(1)) { - case 2: { - if (idx.dtype() == torch::ScalarType::Long) { - hash_wrapper(N, idx.data_ptr(), out.data_ptr()); - } else if (idx.dtype() == torch::ScalarType::Int) { - hash_wrapper(N, idx.data_ptr(), out.data_ptr()); - } - break; - } - case 3: { - if (idx.dtype() == torch::ScalarType::Long) { - hash_wrapper(N, idx.data_ptr(), out.data_ptr()); - } else if (idx.dtype() == torch::ScalarType::Int) { - hash_wrapper(N, idx.data_ptr(), out.data_ptr()); - } - break; - } - case 4: { - if (idx.dtype() == torch::ScalarType::Long) { - hash_wrapper(N, idx.data_ptr(), out.data_ptr()); - } else if (idx.dtype() == torch::ScalarType::Int) { - hash_wrapper(N, idx.data_ptr(), out.data_ptr()); - } - break; - } - default: { - std::cerr << "Error. 
Not compiled" << std::endl; - } - } - - return out; -} - -at::Tensor -kernel_hash_cuda(const at::Tensor idx, const at::Tensor kernel_offset) { - int N = idx.size(0); - int K = kernel_offset.size(0); - at::Tensor out = torch::zeros({ K, N }, at::device(idx.device()).dtype(at::ScalarType::Long)); - switch (idx.size(1)) { - case 3: { - if (idx.dtype() == torch::ScalarType::Long) { - kernel_hash_wrapper(N, K, idx.data_ptr(), - kernel_offset.data_ptr(), out.data_ptr()); - } else if (idx.dtype() == torch::ScalarType::Int) { - kernel_hash_wrapper(N, K, idx.data_ptr(), kernel_offset.data_ptr(), - out.data_ptr()); - } - break; - } - case 4: { - if (idx.dtype() == torch::ScalarType::Long) { - kernel_hash_wrapper(N, K, idx.data_ptr(), - kernel_offset.data_ptr(), out.data_ptr()); - } else if (idx.dtype() == torch::ScalarType::Int) { - kernel_hash_wrapper(N, K, idx.data_ptr(), kernel_offset.data_ptr(), - out.data_ptr()); - } - break; - } - default: { - std::cerr << "Error. Not compiled" << std::endl; - } - } - return out; -} - -HashLookupData -build_hash_table(const at::Tensor hash_target, const at::Tensor idx_target, bool enlarge) { - // When n is large, the hash values tend to be more evenly distrubuted and - // choosing table_size to be 2 * nextPow2 typically suffices. For smaller n, - // the effect of uneven distribution of hash values is more pronounced and - // hence we choose table_size to be 4 * nextPow2 to reduce the chance of - // bucket overflow. - int n_source = hash_target.size(0); - - const int nextPow2 = pow(2, ceil(log2((double)n_source))); - int table_size = (n_source < 2048) ? 4 * nextPow2 : 2 * nextPow2; - if (enlarge) { - table_size = 4 * nextPow2; - } - - if (table_size < 512) { - table_size = 512; - } - int num_funcs = 3; - CuckooHashTableCuda_Multi in_hash_table(table_size, 8 * ceil(log2((double)n_source)), - num_funcs); - - auto long_tensor_option = at::device(hash_target.device()).dtype(at::ScalarType::Long); - at::Tensor key_buf = torch::zeros({ table_size }, long_tensor_option); - at::Tensor val_buf = torch::zeros({ table_size }, long_tensor_option); - at::Tensor hash_key = torch::zeros({ num_funcs * table_size }, long_tensor_option); - at::Tensor hash_val = torch::zeros({ num_funcs * table_size }, long_tensor_option); - - bool default_idx = idx_target.size(0) == 0; - in_hash_table.insert_vals((uint64_t *)(hash_target.data_ptr()), - default_idx ? 
nullptr : (uint64_t *)(idx_target.data_ptr()), - (uint64_t *)(key_buf.data_ptr()), - (uint64_t *)(val_buf.data_ptr()), - (uint64_t *)(hash_key.data_ptr()), - (uint64_t *)(hash_val.data_ptr()), n_source); - - auto hash_data = in_hash_table.get_data(hash_key, hash_val); - - return hash_data; -} - -at::Tensor -hash_table_query(const HashLookupData &hash_data, const at::Tensor hash_query) { - int n1 = hash_query.size(0); - auto hash_param = hash_data.get_param(); - - at::Tensor out = - torch::zeros({ n1 }, at::device(hash_query.device()).dtype(at::ScalarType::Long)); - - cuckooLookupKernel_Multi<<>>( - (uint64_t *)(hash_query.data_ptr()), (uint64_t *)(out.data_ptr()), n1, - hash_param.d_key, hash_param.d_val, hash_param.size, hash_param.config, - hash_param.num_funcs, hash_param.num_buckets, 0); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - hash_data.release_param(hash_param); - return out; -} diff --git a/fvdb/tests/unit/nkfw_api/ext/common/hashmap_cuda.cuh b/fvdb/tests/unit/nkfw_api/ext/common/hashmap_cuda.cuh deleted file mode 100644 index 8949b3bc04..0000000000 --- a/fvdb/tests/unit/nkfw_api/ext/common/hashmap_cuda.cuh +++ /dev/null @@ -1,241 +0,0 @@ -// Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: Apache-2.0 -// -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "cuda_runtime.h" - -/** Reserved value for indicating "empty". */ -#define EMPTY_CELL (0) -/** Max rehashing depth, and error depth. */ -#define MAX_DEPTH (100) -#define ERR_DEPTH (-1) -/** CUDA naive thread block size. */ -#define BLOCK_SIZE (256) -/** CUDA multi-level thread block size = bucket size. */ -#define BUCKET_SIZE (512) - -/** Struct of a hash function config. */ -typedef struct { - int rv; // Randomized XOR value. - int ss; // Randomized shift filter start position. -} FuncConfig; - -/** Hard code hash functions and all inline helper functions for CUDA kernels' - * use. */ -inline __device__ int -do_1st_hash(const uint64_t val, const int num_buckets) { - return val % num_buckets; -} - -inline __device__ int -do_2nd_hash(const uint64_t val, const FuncConfig *const hash_func_configs, const int func_idx, - const int size) { - FuncConfig fc = hash_func_configs[func_idx]; - return ((val ^ fc.rv) >> fc.ss) % size; // XOR function as 2nd-level hashing. -} - -// trying to ignore EMPTY_CELL by adding 1 at make_data. 
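The comment above refers to the table's slot encoding: `make_data` packs a value together with the index of the hash function that placed it into a single 64-bit word, and a successful lookup returns the stored value plus one so that `EMPTY_CELL` (zero) can unambiguously signal a miss. The following is a minimal host-side sketch of that scheme, not the deleted device code itself: the table sizes and `FuncConfig` values are made up for illustration, and insertion is a direct placement rather than the real cuckoo eviction loop.

```cpp
// Host-side sketch only: kBucketSize, kNumBuckets and the FuncConfig values
// below are illustrative assumptions; the device code above uses
// BUCKET_SIZE = 512 and randomized configs from gen_hash_funcs().
#include <cstdint>
#include <cstdio>
#include <vector>

struct FuncConfig { int rv; int ss; }; // randomized XOR value / shift start

constexpr int kBucketSize = 8;
constexpr int kNumBuckets = 4;
constexpr int kNumFuncs   = 3;
constexpr int kPosWidth   = 2; // ceil(log2(kNumFuncs)) bits for the function index
constexpr int kSize       = kBucketSize * kNumBuckets;

// First-level hash picks a bucket; second-level hash picks a slot within it.
int hash1(uint64_t key) { return (int)(key % kNumBuckets); }
int hash2(uint64_t key, const FuncConfig &fc) {
    return (int)(((key ^ (uint64_t)fc.rv) >> fc.ss) % kBucketSize);
}

// Same packing as the device helpers: value in the high bits, index of the
// hash function that placed the entry in the low kPosWidth bits.
uint64_t make_data(uint64_t val, int func) { return (val << kPosWidth) ^ (uint64_t)func; }
uint64_t fetch_val(uint64_t data) { return data >> kPosWidth; }
int      fetch_func(uint64_t data) { return (int)(data & ((1u << kPosWidth) - 1)); }

// Probe the one slot each hash function could have used. A hit returns the
// stored value plus one, leaving 0 free to mean "not found" (EMPTY_CELL).
uint64_t lookup(const std::vector<uint64_t> &keys, const std::vector<uint64_t> &vals,
                const FuncConfig (&cfg)[kNumFuncs], uint64_t key) {
    const int bucket = hash1(key);
    for (int i = 0; i < kNumFuncs; ++i) {
        const int pos = bucket * kBucketSize + hash2(key, cfg[i]);
        if (keys[i * kSize + pos] == key) return vals[i * kSize + pos] + 1;
    }
    return 0; // EMPTY_CELL
}

int main() {
    const FuncConfig cfg[kNumFuncs] = { { 0x9e37, 3 }, { 0x85eb, 5 }, { 0xc2b2, 7 } };
    std::vector<uint64_t> keys(kNumFuncs * kSize, 0), vals(kNumFuncs * kSize, 0);

    // Place key 42 where hash function 1 would look for it. The real insert
    // kernel instead evicts any occupant (cuckoo hashing) and regenerates
    // the configs when evictions exceed the bound.
    const uint64_t key = 42;
    const int pos = hash1(key) * kBucketSize + hash2(key, cfg[1]);
    keys[1 * kSize + pos] = key;
    vals[1 * kSize + pos] = 7;

    const uint64_t slot = make_data(/*val=*/7, /*func=*/1);
    std::printf("packed: val=%llu func=%d\n",
                (unsigned long long)fetch_val(slot), fetch_func(slot)); // val=7 func=1
    std::printf("hit:  %llu\n", (unsigned long long)lookup(keys, vals, cfg, key)); // 8
    std::printf("miss: %llu\n", (unsigned long long)lookup(keys, vals, cfg, 99));  // 0
    return 0;
}
```

Looking up key 42 probes one slot per hash function within its bucket and prints 8 (the stored 7, offset by one); an absent key falls through all probes and yields `EMPTY_CELL`.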
-inline __device__ uint64_t -fetch_val(const uint64_t data, const int pos_width) { - return data >> pos_width; -} - -inline __device__ int -fetch_func(const uint64_t data, const int pos_width) { - return data & ((0x1 << pos_width) - 1); -} - -inline __device__ uint64_t -make_data(const uint64_t val, const int func, const int pos_width) { - return (val << pos_width) ^ func; -} - -struct HashLookupParam { - uint64_t *d_key; - uint64_t *d_val; - int size; - FuncConfig *config = nullptr; - int num_funcs; - int num_buckets; -}; - -class HashLookupData { - HashLookupParam _param; - friend class CuckooHashTableCuda_Multi; - - public: - int inserted_size; - torch::Tensor keys; - torch::Tensor vals; - std::vector config; - HashLookupParam - get_param() const { - HashLookupParam p = _param; - cudaMalloc((void **)&p.config, p.num_funcs * sizeof(FuncConfig)); - cudaMemcpy(p.config, config.data(), p.num_funcs * sizeof(FuncConfig), - cudaMemcpyHostToDevice); - p.d_key = (uint64_t *)(keys.data_ptr()); - p.d_val = (uint64_t *)(vals.data_ptr()); - return p; - } - torch::Device - device() const { - return this->keys.device(); - } - void - release_param(const HashLookupParam ¶m) const { - cudaFree(param.config); - } -}; - -class CuckooHashTableCuda_Multi { - private: - const int _size; - const int _evict_bound; - const int _num_funcs; - const int _pos_width; - const int _num_buckets; - int _inserted_size; - - FuncConfig *_d_hash_func_configs; - - /** Cuckoo hash function set. */ - FuncConfig *_hash_func_configs; - - /** Private operations. */ - void - gen_hash_funcs() { - // Calculate bit width of value range and table size. - int val_width = 8 * sizeof(uint64_t) - ceil(log2((double)_num_funcs)); - int bucket_width = ceil(log2((double)_num_buckets)); - int size_width = ceil(log2((double)BUCKET_SIZE)); - // Generate randomized configurations. - for (int i = 0; i < _num_funcs; ++i) { // At index 0 is a dummy function. 
- if (val_width - bucket_width <= size_width) - _hash_func_configs[i] = { rand(), 0 }; - else { - _hash_func_configs[i] = { - rand(), rand() % (val_width - bucket_width - size_width + 1) + bucket_width - }; - } - } - }; - - inline uint64_t - fetch_val(const uint64_t data) { - return data >> _pos_width; - } - inline int - fetch_func(const uint64_t data) { - return data & ((0x1 << _pos_width) - 1); - } - - public: - CuckooHashTableCuda_Multi(const int size, const int evict_bound, const int num_funcs) - : _size(size), _evict_bound(evict_bound), _num_funcs(num_funcs), - _pos_width(ceil(log2((double)_num_funcs))), - _num_buckets(ceil((double)_size / BUCKET_SIZE)) { - srand(time(NULL)); - _d_hash_func_configs = NULL; - _hash_func_configs = NULL; - _hash_func_configs = new FuncConfig[num_funcs]; - - gen_hash_funcs(); - - cudaMalloc((void **)&_d_hash_func_configs, _num_funcs * sizeof(FuncConfig)); - cudaMemcpy(_d_hash_func_configs, _hash_func_configs, _num_funcs * sizeof(FuncConfig), - cudaMemcpyHostToDevice); - }; - ~CuckooHashTableCuda_Multi() { - if (_hash_func_configs != NULL) - delete[] _hash_func_configs; - - if (_d_hash_func_configs != NULL) - cudaFree(_d_hash_func_configs); - }; - - HashLookupData - get_data(torch::Tensor keys, torch::Tensor vals) const { - HashLookupData data; - data._param.size = _size; - data._param.num_funcs = _num_funcs; - data._param.num_buckets = _num_buckets; - for (int i = 0; i < _num_funcs; ++i) { - data.config.push_back(_hash_func_configs[i]); - } - data.vals = vals; - data.keys = keys; - data.inserted_size = _inserted_size; - return data; - } - - int insert_vals(const uint64_t *const keys, const uint64_t *const vals, uint64_t *d_key_buf, - uint64_t *d_val_buf, uint64_t *d_key, uint64_t *d_val, const int n); -}; - -__global__ void cuckooBucketKernel_Multi(uint64_t *const key_buf, uint64_t *const val_buf, - const int size, const uint64_t *const keys, - const uint64_t *const vals, const int n, - int *const counters, const int num_buckets); - -__global__ void cuckooInsertKernel_Multi(uint64_t *const key, uint64_t *const val, - const uint64_t *const key_buf, - const uint64_t *const val_buf, const int size, - const FuncConfig *const hash_func_configs, - const int num_funcs, const int *const counters, - const int num_buckets, const int evict_bound, - const int pos_width, int *const rehash_requests); - -__global__ void cuckooLookupKernel_Multi(const uint64_t *const keys, uint64_t *const results, - const int n, const uint64_t *const all_keys, - const uint64_t *const all_vals, const int size, - const FuncConfig *const hash_func_configs, - const int num_funcs, const int num_buckets, - const int pos_width); - -inline __device__ uint64_t -hashtable_lookup(const uint64_t *const all_keys, const uint64_t *const all_vals, const int size, - const FuncConfig *const hash_func_configs, const int num_funcs, - const int num_buckets, uint64_t key) { - int bucket_num = do_1st_hash(key, num_buckets); - for (int i = 0; i < num_funcs; ++i) { - int pos = bucket_num * BUCKET_SIZE + do_2nd_hash(key, hash_func_configs, i, BUCKET_SIZE); - if (all_keys[i * size + pos] == key) { - return all_vals[i * size + pos] + 1; - } - } - return EMPTY_CELL; -} - -inline __device__ uint64_t -hash2(unsigned int a, unsigned int b) { - uint64_t hash = 14695981039346656037UL; - hash ^= a; - hash *= 1099511628211UL; - hash ^= b; - hash *= 1099511628211UL; - hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF); - return hash; -} - -template -inline __device__ uint64_t -hash3(const T a, const T b, const T c) { - 
   uint64_t hash = 14695981039346656037UL;
-    hash ^= (unsigned int)a;
-    hash *= 1099511628211UL;
-    hash ^= (unsigned int)b;
-    hash *= 1099511628211UL;
-    hash ^= (unsigned int)c;
-    hash *= 1099511628211UL;
-    hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
-    return hash;
-}
diff --git a/fvdb/tests/unit/nkfw_api/ext/common/torch_ptr.cuh b/fvdb/tests/unit/nkfw_api/ext/common/torch_ptr.cuh
deleted file mode 100644
index 250b1ca19f..0000000000
--- a/fvdb/tests/unit/nkfw_api/ext/common/torch_ptr.cuh
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright Contributors to the OpenVDB Project
-// SPDX-License-Identifier: Apache-2.0
-//
-#ifndef TORCH_PTR_CUH
-#define TORCH_PTR_CUH
-
-#include
-
-#define float1in packed_accessor32
-#define float2in packed_accessor32
-#define float3in packed_accessor32
-using Float1Accessor = torch::PackedTensorAccessor32;
-using Float2Accessor = torch::PackedTensorAccessor32;
-using Float3Accessor = torch::PackedTensorAccessor32;
-
-#define int1in packed_accessor32
-#define int2in packed_accessor32
-#define int3in packed_accessor32
-using Int1Accessor = torch::PackedTensorAccessor32;
-using Int2Accessor = torch::PackedTensorAccessor32;
-using Int3Accessor = torch::PackedTensorAccessor32;
-
-#define long1in packed_accessor32
-#define long2in packed_accessor32
-#define long3in packed_accessor32
-using Long1Accessor = torch::PackedTensorAccessor32;
-using Long2Accessor = torch::PackedTensorAccessor32;
-using Long3Accessor = torch::PackedTensorAccessor32;
-
-#endif // TORCH_PTR_CUH
diff --git a/fvdb/tests/unit/nkfw_api/kernel.npz b/fvdb/tests/unit/nkfw_api/kernel.npz
deleted file mode 100644
index 32893e79b4d54bdac4e832f4416fbd16c2a8a2fd..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

diff --git a/fvdb/tests/unit/test_nkfw_api.py b/fvdb/tests/unit/test_nkfw_api.py
deleted file mode 100644
index de6dcc5534..0000000000
--- a/fvdb/tests/unit/test_nkfw_api.py
+++ /dev/null
@@ -1,329 +0,0 @@
-# Copyright Contributors to the OpenVDB Project
-# SPDX-License-Identifier: Apache-2.0
-#
-import functools
-import os
-import unittest
-import weakref
-
-import numpy as np
-import torch
-from torch import testing
-
-from .nkfw_api.backend.fvdb import SparseFeatureHierarchy as FVDBBackend
-from .nkfw_api.backend.hash_table import SparseFeatureHierarchy as HashTableBackend
-
-assert_tensor_equal = functools.partial(testing.assert_close, rtol=0, atol=0)
-
-
-def assert_tensor_set_equal(t_a: torch.Tensor, t_b: torch.Tensor, dim: int = 0):
-    t_a = torch.sort(t_a, dim=dim).values
-    t_b = torch.sort(t_b, dim=dim).values
-    assert_tensor_equal(t_a, t_b)
-
-
-def permute_neighbour(nn_info, perm_src, perm_tgt):
-
-    def transpose_permutation(perm: torch.Tensor):
-        p = torch.empty(perm.size(0), dtype=torch.long, device=perm.device)
-        p[perm] = torch.arange(perm.size(0), dtype=torch.long, device=perm.device)
-        return p
-
-    src_inds = transpose_permutation(perm_src)[nn_info[:, 0]]
-    tgt_inds = transpose_permutation(perm_tgt)[nn_info[:, 1]]
-    return torch.concat([torch.stack([src_inds, tgt_inds], dim=1), nn_info[:, 2:]], dim=1)
-
-
-class TestNKFWAPI(unittest.TestCase):
-    def setUp(self) -> None:
-        self.device = torch.device("cuda:0")
-        self.range_kernel = weakref.WeakMethod(self._get_range_kernel)
-        self.backend_cls = FVDBBackend
-        # self.backend_cls = HashTableBackend
-
-    cached_kernels = None
-
-    def _get_range_kernel(self, n_range):
-        assert n_range % 2 == 1, "target_range must be odd." 
- if self.cached_kernels is None: - data = np.load(os.path.join(os.path.dirname(__file__), "nkfw_api", "kernel.npz")) - self.cached_kernels = data["kernel"] - kernel = torch.tensor(self.cached_kernels.copy()[: n_range**3], dtype=torch.int, device=self.device) - return kernel - - def test_meta_ops(self): - index = self.backend_cls(4, 0.1, self.device, self.range_kernel) - self.assertEqual(index.depth, 4) - self.assertEqual(index.get_stride(3), 8) - - level_0_ijk = torch.tensor([[0, 0, 0], [0, 0, 1], [1, 0, 0], [1, 0, 1]], dtype=torch.int, device=self.device) - level_1_ijk = torch.tensor([[0, 2, 0], [0, 0, 0]], dtype=torch.int, device=self.device) - - level_0_ijk, _ = index.update_coords(0, level_0_ijk) - level_1_ijk, _ = index.update_coords(1, level_1_ijk) - - self.assertEqual(index.get_num_voxels(0), 4) - self.assertEqual(index.get_num_voxels(1), 2) - self.assertEqual(index.get_num_voxels(2), 0) - self.assertEqual(index.get_num_voxels(3), 0) - - self.assertEqual(index.voxel_size[0], 0.1) - self.assertEqual(index.voxel_size[1], 0.1) - self.assertEqual(index.voxel_size[2], 0.1) - - assert_tensor_set_equal(index.get_coords(0), level_0_ijk) - assert_tensor_set_equal( - index.get_coords(0, expand=3), - torch.tensor( - [ - [-1, -1, -1], - [-1, -1, 0], - [-1, -1, 1], - [-1, -1, 2], - [-1, 0, -1], - [-1, 0, 0], - [-1, 0, 1], - [-1, 0, 2], - [-1, 1, -1], - [-1, 1, 0], - [-1, 1, 1], - [-1, 1, 2], - [0, -1, -1], - [0, -1, 0], - [0, -1, 1], - [0, -1, 2], - [0, 0, -1], - [0, 0, 0], - [0, 0, 1], - [0, 0, 2], - [0, 1, -1], - [0, 1, 0], - [0, 1, 1], - [0, 1, 2], - [1, -1, -1], - [1, -1, 0], - [1, -1, 1], - [1, -1, 2], - [1, 0, -1], - [1, 0, 0], - [1, 0, 1], - [1, 0, 2], - [1, 1, -1], - [1, 1, 0], - [1, 1, 1], - [1, 1, 2], - [2, -1, -1], - [2, -1, 0], - [2, -1, 1], - [2, -1, 2], - [2, 0, -1], - [2, 0, 0], - [2, 0, 1], - [2, 0, 2], - [2, 1, -1], - [2, 1, 0], - [2, 1, 2], - [2, 1, 1], - ], - dtype=torch.int, - device=self.device, - ), - ) - assert_tensor_set_equal( - index.get_voxel_centers(1), torch.tensor([[0.1, 0.3, 0.1], [0.1, 0.1, 0.1]], device=self.device) - ) - - # Empty indices - index.update_coords(2, None) - self.assertEqual(index.get_coords(2).size(0), 0) - - def test_neighbours(self): - index = self.backend_cls(2, 1.0, self.device, self.range_kernel) - - level_0_ijk = torch.tensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 1, 0]], dtype=torch.int, device=self.device) - - level_1_ijk = torch.tensor([[0, 0, 2], [0, 0, 0]], dtype=torch.int, device=self.device) - - level_0_ijk, perm_0 = index.update_coords(0, level_0_ijk) - level_1_ijk, perm_1 = index.update_coords(1, level_1_ijk) - - # Ring-1 NN -- same layer - src_ids, tgt_ids, n_types, n_counts = index.get_self_neighbours(0, 0, target_range=1) - assert_tensor_equal(src_ids, tgt_ids) - assert_tensor_set_equal(src_ids, torch.arange(4, dtype=torch.long, device=self.device)) - assert_tensor_equal(n_types, torch.zeros((4, 3), device=self.device)) - - # Ring-3 NN -- same layer - src_ids, tgt_ids, n_types, n_counts = index.get_self_neighbours(0, 0, target_range=3) - nn_info = torch.cat([src_ids[:, None], tgt_ids[:, None], n_types.long()], dim=1) - assert_tensor_set_equal( - nn_info, - permute_neighbour( - torch.tensor( - [ - [0, 0, 0, 0, 0], - [1, 1, 0, 0, 0], - [2, 2, 0, 0, 0], - [3, 3, 0, 0, 0], - [0, 3, 0, 1, 0], - [1, 3, 0, 1, -1], - [0, 1, 0, 0, 1], - [1, 2, 0, 0, 1], - [1, 0, 0, 0, -1], - [2, 1, 0, 0, -1], - [3, 1, 0, -1, 1], - [3, 0, 0, -1, 0], - ], - dtype=torch.long, - device=self.device, - ), - perm_0, - perm_0, - ), - ) - assert_tensor_equal( - 
n_counts, - torch.tensor( - [4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - dtype=torch.long, - device=self.device, - ), - ) - - # Ring-1 NN -- across layer - src_ids, tgt_ids, n_types, n_counts = index.get_self_neighbours(0, 1, target_range=1) - nn_info = torch.cat([src_ids[:, None], tgt_ids[:, None], n_types.long()], dim=1) - assert_tensor_set_equal( - nn_info, - permute_neighbour( - torch.tensor( - [[0, 1, 0, 0, 0], [1, 1, 0, 0, 0], [2, 0, 0, 0, 0], [3, 1, 0, 0, 0]], - dtype=torch.long, - device=self.device, - ), - perm_0, - perm_1, - ), - ) - assert_tensor_equal(n_counts, torch.tensor([4], dtype=torch.long, device=self.device)) - - # Ring-3 NN -- across layer - src_ids, tgt_ids, n_types, n_counts = index.get_self_neighbours(0, 1, target_range=3) - nn_info = torch.cat([src_ids[:, None], tgt_ids[:, None], n_types.long()], dim=1) - assert_tensor_set_equal( - nn_info, - permute_neighbour( - torch.tensor( - [ - [0, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [2, 0, 0, 0, 0], - [3, 1, 0, 0, 0], - [0, 0, 0, 0, 2], - [1, 0, 0, 0, 1], - [3, 0, 0, 0, 2], - [2, 1, 0, 0, -1], - ], - dtype=torch.long, - device=self.device, - ), - perm_0, - perm_1, - ), - ) - assert_tensor_equal( - n_counts, - torch.tensor( - [ - 4, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 3, # (0,0,2) and (0,0,1) belongs to the same NN. - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - ], - dtype=torch.long, - device=self.device, - ), - ) - - # Ring-3 NN -- across layer (the other way around) - src_ids, tgt_ids, n_types, n_counts = index.get_self_neighbours(1, 0, target_range=3) - nn_info = torch.cat([src_ids[:, None], tgt_ids[:, None], n_types.long()], dim=1) - assert_tensor_set_equal( - nn_info, - permute_neighbour( - torch.tensor( - [ - [1, 0, 0, 0, 0], - [1, 1, 0, 0, 0], - [0, 2, 0, 0, 0], - [1, 3, 0, 0, 0], - [0, 1, 0, 0, 0], - [1, 2, 0, 0, 0], - ], - dtype=torch.long, - device=self.device, - ), - perm_1, - perm_0, - ), - ) - - # coordinate-based NN (like hash table) - src_ids, tgt_ids, n_types, n_counts = index.get_coords_neighbours( - torch.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 2.0]], device=self.device), - 1, - 0, - nn_kernel=self._get_range_kernel(1), - ) - nn_info = torch.cat([src_ids[:, None], tgt_ids[:, None], n_types.long()], dim=1) - assert_tensor_set_equal( - nn_info, torch.tensor([[0, 0, 0, 0, 0], [1, 2, 0, 0, 0]], dtype=torch.long, device=self.device) - ) - - def test_split_splat(self): - index = self.backend_cls(2, 1.0, self.device, self.range_kernel) - index.update_coords( - 0, torch.tensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 1, 0]], dtype=torch.int, device=self.device) - ) - index.update_coords(1, torch.tensor([[0, 0, 2], [0, 0, 0]], dtype=torch.int, device=self.device)) - - res = index.splat_data( - # World coordinates... 
- xyz=torch.tensor([[0.5, 0.5, 0.5], [0.5, 0.5, 2.0]], device=self.device), - data_depth=0, - data=torch.tensor([[10.0, 8.0], [300.0, 400.0]], device=self.device), - check_corr=False, - ) - assert_tensor_equal( - res, - torch.tensor( - [[10.0, 8.0], [150.0, 200.0], [150.0, 200.0], [0.0, 0.0]], dtype=torch.float32, device=self.device - ), - ) - - -if __name__ == "__main__": - unittest.main() From 2db574080dc7452b79f190f65c260f37f7db71a2 Mon Sep 17 00:00:00 2001 From: Francis Williams Date: Mon, 20 Jan 2025 21:51:32 -0500 Subject: [PATCH 54/59] fix itertools.cycle (#142) --- fvdb/examples/3dgs/train_colmap.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fvdb/examples/3dgs/train_colmap.py b/fvdb/examples/3dgs/train_colmap.py index 2291b34269..73ffc7fe04 100644 --- a/fvdb/examples/3dgs/train_colmap.py +++ b/fvdb/examples/3dgs/train_colmap.py @@ -261,8 +261,15 @@ def __init__( def train(self, start_step: int = 0): # We keep cycling through every image in a random order until we reach - # the specified number of optimization steps. - trainloader = itertools.cycle( + # the specified number of optimization steps. We can't use itertools.cycle + # because it caches each minibatch element in memory which can quickly + # exhaust the amount of available RAM + def cycle(dataloader): + while True: + for minibatch in dataloader: + yield minibatch + + trainloader = cycle( torch.utils.data.DataLoader( self.trainset, batch_size=self.cfg.batch_size, @@ -294,7 +301,6 @@ def train(self, start_step: int = 0): # If you have very large images, you can iterate over disjoint crops and accumulate gradients # If cfg.crops_per_image is 1, then this just returns the image for pixels, crop, is_last in crop_image_batch(image, self.cfg.crops_per_image): - # Actual pixels to compute the loss on, normalized to [0, 1] pixels = pixels.to(self.device) / 255.0 # [1, H, W, 3] @@ -313,7 +319,6 @@ def train(self, start_step: int = 0): image_crop=crop, cache_info=True, ) - # If you specified depth rendering, grab the depth map as well if renders.shape[-1] == 4: colors, depths = renders[..., 0:3], renders[..., 3:4] From a8e52da53cc3d1b8eee0c4e20637fdecdd28d1e3 Mon Sep 17 00:00:00 2001 From: Francis Williams Date: Tue, 21 Jan 2025 18:21:48 -0500 Subject: [PATCH 55/59] Fix bug where image cropping wasn't working in Gaussian Splatting --- .../detail/ops/gsplat/GaussianRasterizeForward.cu | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fvdb/src/detail/ops/gsplat/GaussianRasterizeForward.cu b/fvdb/src/detail/ops/gsplat/GaussianRasterizeForward.cu index 03ef9be0f0..cb9cfbbbf9 100644 --- a/fvdb/src/detail/ops/gsplat/GaussianRasterizeForward.cu +++ b/fvdb/src/detail/ops/gsplat/GaussianRasterizeForward.cu @@ -163,10 +163,13 @@ rasterize_forward(const uint32_t C, const uint32_t N, const uint32_t n_isects, c const int32_t camera_id = blockIdx.x; - // blockIdx runs from [0, num_tiles_h] x [0, num_tiles_w] - const int32_t tile_id = (blockIdx.y + tile_origin_h) * tile_width + blockIdx.z + tile_origin_w; + // blockIdx.yz runs from [0, num_tiles_h] x [0, num_tiles_w] + const int32_t tile_id = + (blockIdx.y + tile_origin_h) * tile_width + (blockIdx.z + tile_origin_w); // Pixel coordinates run from [0, height] x [0, width] + // i.e. 
they are in the local coordinates of the crop starting from pixel + // [image_origin_h, image_origin_w] with size [image_height, image_width] const uint32_t i = blockIdx.y * tile_size + threadIdx.y; const uint32_t j = blockIdx.z * tile_size + threadIdx.x; const int32_t pix_id = i * image_width + j; @@ -206,9 +209,12 @@ rasterize_forward(const uint32_t C, const uint32_t N, const uint32_t n_isects, c : tile_offsets[tile_id + 1]; const uint32_t block_size = blockDim.x * blockDim.y; + // Pixel coordinates in the global image (not just the local crop) + const uint32_t global_i = i + image_origin_h; + const uint32_t global_j = j + image_origin_w; return volume_render_tile(range_start, range_end, block_size, tile_size, - pixel_in_image, i, j, means2d, conics, colors, - opacities, backgrounds, tile_gaussian_ids, + pixel_in_image, global_i, global_j, means2d, conics, + colors, opacities, backgrounds, tile_gaussian_ids, out_render_colors, out_render_alphas, out_last_ids); } From eb489a2bdc55286ab67b7169d5a92f96f0f64f86 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz <2375296+swahtz@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:51:20 +1300 Subject: [PATCH 56/59] Remove test env accidentally merged in previous PR (#150) Signed-off-by: Jonathan Swartz --- fvdb/env/torch_cxx11.yml | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 fvdb/env/torch_cxx11.yml diff --git a/fvdb/env/torch_cxx11.yml b/fvdb/env/torch_cxx11.yml deleted file mode 100644 index 3ed06ba099..0000000000 --- a/fvdb/env/torch_cxx11.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: torch_cxx11 -channels: - - conda-forge -dependencies: - - python=3.10 - - pytorch-gpu=2.5[build=cuda120*] - - ca-certificates - - openssl - - cuda-version=12.0 - - gitpython - - tqdm - - numpy - - make - - cmake - - ninja - - cxx-compiler - - gxx_linux-64=11 - - gcc_linux-64=11 - - requests - - cuda-toolkit=12.0 - - cuda-compiler=12.0 - - cuda-nvcc=12.0 - - cuda-cccl=12.0 - - cuda-libraries-static=12.0 - - cuda-cudart-static=12.0 From 16bfecf4c68d3f6513e6671e9745d11625dc16fe Mon Sep 17 00:00:00 2001 From: Jonathan Swartz <2375296+swahtz@users.noreply.github.com> Date: Wed, 22 Jan 2025 13:15:17 +1300 Subject: [PATCH 57/59] fVDB version bumped to 0.2.1 (#151) Signed-off-by: Jonathan Swartz --- fvdb/fvdb/__init__.py | 4 ++-- fvdb/projects/panoptic_segmentation/mask_pls/README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fvdb/fvdb/__init__.py b/fvdb/fvdb/__init__.py index becbf6ac73..861f331f6f 100644 --- a/fvdb/fvdb/__init__.py +++ b/fvdb/fvdb/__init__.py @@ -61,8 +61,8 @@ def jcat(things_to_cat, dim=None): raise ValueError("jcat() can only cat GridBatch, JaggedTensor, or VDBTensor") -__version__ = "0.2.0" -__version_info__ = (0, 2, 0) +__version__ = "0.2.1" +__version_info__ = (0, 2, 1) __all__ = [ "GridBatch", diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/README.md b/fvdb/projects/panoptic_segmentation/mask_pls/README.md index 4105aa6011..361a6ad1e4 100644 --- a/fvdb/projects/panoptic_segmentation/mask_pls/README.md +++ b/fvdb/projects/panoptic_segmentation/mask_pls/README.md @@ -9,7 +9,7 @@ Build an environment with the required dependencies for this project and install ```bash conda env create -f maskpls_envrionment.yml conda activate maskpls -pip install /path/to/fVDB/dist/fvdb-0.2.0-cp312-cp312-linux_x86_64.whl # Replace with the correct wheel +pip install /path/to/fVDB/dist/fvdb-0.2.1-cp311-cp311-linux_x86_64.whl # Replace with the correct wheel ``` ## Usage @@ -37,4 +37,4 @@ python 
train.py --dataset-type SemanticKITTI \
 
 ## References
 
-Based on the MaskPLS paper: [MaskPLS: Mask-Based Panoptic LiDAR Segmentation](https://www.ipb.uni-bonn.de/wp-content/papercite-data/pdf/marcuzzi2023ral.pdf) \ No newline at end of file
+Based on the MaskPLS paper: [MaskPLS: Mask-Based Panoptic LiDAR Segmentation](https://www.ipb.uni-bonn.de/wp-content/papercite-data/pdf/marcuzzi2023ral.pdf)

From 04e31933773958658e91bb0c0a2a5ec514504c9c Mon Sep 17 00:00:00 2001
From: Jonathan Swartz
Date: Wed, 22 Jan 2025 13:21:19 +1300
Subject: [PATCH 58/59] Revert any changes to .github/ made to accommodate CI

Signed-off-by: Jonathan Swartz

---
 .github/copy-pr-bot.yaml             |   3 -
 .github/workflows/ax.yml             |  23 +-
 .github/workflows/build.yml          | 104 ++--
 .github/workflows/docs.yml           |  16 +-
 .github/workflows/fvdb.yml           | 234 --------
 .github/workflows/fvdb_codestyle.yml |  81 ---
 .github/workflows/houdini.yml        |  18 +-
 .github/workflows/nanovdb.yml        | 109 ++--
 .github/workflows/status_checks.yml  |  16 -
 .github/workflows/weekly.yml         | 850 +++++++++++++--------------
 .github/workflows/whitespace.yml     |  13 +-
 11 files changed, 602 insertions(+), 865 deletions(-)
 delete mode 100644 .github/copy-pr-bot.yaml
 delete mode 100644 .github/workflows/fvdb.yml
 delete mode 100644 .github/workflows/fvdb_codestyle.yml
 delete mode 100644 .github/workflows/status_checks.yml

diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
deleted file mode 100644
index 4cfbdc7f05..0000000000
--- a/.github/copy-pr-bot.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
-enabled: true
-auto_sync_draft: false
-auto_sync_ready: true
diff --git a/.github/workflows/ax.yml b/.github/workflows/ax.yml
index 560607b84d..be512fbc2c 100644
--- a/.github/workflows/ax.yml
+++ b/.github/workflows/ax.yml
@@ -4,7 +4,22 @@ name: AX
 on:
   push:
     branches:
-      - "pull-request/[0-9]+"
+      - 'master'
+      - 'feature/**'
+      - 'pr/**'
+    paths-ignore:
+      - 'CHANGES'
+      - 'CODEOWNERS'
+      - 'doc/**'
+      - 'nanovdb/**'
+      - 'openvdb_maya/**'
+      - 'openvdb_houdini/**'
+      - 'fvdb/**'
+      - 'pendingchanges/**'
+      - '**.md'
+  pull_request:
+    branches:
+      - '**'
     paths-ignore:
       - 'CHANGES'
       - 'CODEOWNERS'
@@ -40,7 +55,7 @@ jobs:
       github.event_name != 'workflow_dispatch' ||
       github.event.inputs.type == 'all' ||
       github.event.inputs.type == 'linux'
-    runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
+    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
     name: >
       linux-ax:${{ matrix.config.image }}-cxx:${{ matrix.config.cxx }}-${{ matrix.config.build }}
     container:
@@ -137,7 +152,7 @@ jobs:
     if: |
       github.event_name == 'workflow_dispatch' &&
       github.event.inputs.type == 'grammar'
-    runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
+    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
     container:
       image: aswf/ci-openvdb:2023-clang15
     steps:
      - uses: actions/checkout@v3
      - name: build
        run: ./ci/build.sh -v --components=axgr --target=openvdb_ax_grammar --cargs=\"-DOPENVDB_AX_GRAMMAR_NO_LINES=ON\"
      - name: upload grammar
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v2
        with:
          name: ax_grammar
          path: ./build/openvdb_ax/openvdb_ax/openvdb_ax/grammar
diff --git a/.github/workflows/build.yml 
b/.github/workflows/build.yml index fe91055487..6d479e57a5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -4,7 +4,23 @@ name: Build on: push: branches: - - "pull-request/[0-9]+" + - 'master' + - 'feature/**' + - 'pr/**' + paths-ignore: + - 'CHANGES' + - 'CODEOWNERS' + - 'doc/**' + - 'openvdb_maya/**' + - 'openvdb_houdini/**' + - 'openvdb_ax/**' + - 'nanovdb/**' + - 'fvdb/**' + - 'pendingchanges/**' + - '**.md' + pull_request: + branches: + - '**' paths-ignore: - 'CHANGES' - 'CODEOWNERS' @@ -46,7 +62,7 @@ jobs: github.event_name != 'workflow_dispatch' || github.event.inputs.type == 'all' || github.event.inputs.type == 'linux' - runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} name: > linux-vfx:${{ matrix.config.image }}- abi:${{ matrix.config.abi }}- @@ -62,7 +78,7 @@ jobs: config: - { cxx: clang++, image: '2024', abi: '12', build: 'Release', cmake: '' } - { cxx: g++, image: '2024', abi: '12', build: 'Release', cmake: '' } - - { cxx: clang++, image: '2024', abi: '12', build: 'Debug', cmake: '-DUSE_EXPLICIT_INSTANTIATION=OFF' } + - { cxx: clang++, image: '2024', abi: '12', build: 'Debug', cmake: '' } - { cxx: clang++, image: '2023', abi: '11', build: 'Release', cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON' } - { cxx: g++, image: '2023', abi: '11', build: 'Release', cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON' } fail-fast: false @@ -109,47 +125,47 @@ jobs: if: matrix.config.build == 'Release' run: ccache --evict-older-than 1d - # windows: - # # Windows CI. Tests a dynamic build with MD. - # if: | - # github.event_name != 'workflow_dispatch' || - # github.event.inputs.type == 'all' || - # github.event.inputs.type == 'win' - # runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }} - # name: windows - # env: - # VCPKG_DEFAULT_TRIPLET: x64-windows - # strategy: - # fail-fast: false - # steps: - # - uses: actions/checkout@v3 - # - name: path - # shell: pwsh - # run: | - # # note: system path must be modified in a previous step to it's use - # echo "$Env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - # echo "${{github.workspace}}\build\openvdb\openvdb\Release" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - # - name: install - # shell: powershell - # run: .\ci\install_windows.ps1 - # - name: build - # run: > - # ./ci/build.sh -v - # --config='Release' - # --components='core,bin,view,render,python,test' - # --cargs=\' - # -A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF - # -DMSVC_COMPRESS_PDB=ON - # -DUSE_EXR=ON - # -DUSE_PNG=ON - # -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET} - # -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\" - # \' - # - name: size - # # Print the build directy size (monitor if we're hitting runner limits) - # run: du -h build - # - name: test - # run: cd build && ctest -V -C Release + windows: + # Windows CI. Tests a dynamic build with MD. 
+ if: | + github.event_name != 'workflow_dispatch' || + github.event.inputs.type == 'all' || + github.event.inputs.type == 'win' + runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }} + name: windows + env: + VCPKG_DEFAULT_TRIPLET: x64-windows + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v3 + - name: path + shell: pwsh + run: | + # note: system path must be modified in a previous step to it's use + echo "$Env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "${{github.workspace}}\build\openvdb\openvdb\Release" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: install + shell: powershell + run: .\ci\install_windows.ps1 + - name: build + run: > + ./ci/build.sh -v + --config='Release' + --components='core,bin,view,render,python,test' + --cargs=\' + -A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF + -DMSVC_COMPRESS_PDB=ON + -DUSE_EXR=ON + -DUSE_PNG=ON + -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET} + -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\" + \' + - name: size + # Print the build directy size (monitor if we're hitting runner limits) + run: du -h build + - name: test + run: cd build && ctest -V -C Release macos: if: | diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b3ec4a8272..617ae20a12 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -4,13 +4,22 @@ name: Docs on: push: branches: - - "pull-request/[0-9]+" + - 'master' + - 'feature/**' + - 'pr/**' + paths-ignore: + - 'CHANGES' + - 'openvdb_maya/**' + - 'pendingchanges/**' + - '**.md' + pull_request: + branches: + - '**' paths-ignore: - 'CHANGES' - 'openvdb_maya/**' - 'pendingchanges/**' - '**.md' - - 'fvdb/**' workflow_dispatch: inputs: deploy: @@ -101,7 +110,7 @@ jobs: env: CXX: g++ steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v3 - name: install_gcovr run: pip install gcovr - name: build @@ -147,3 +156,4 @@ jobs: - Deployed from: AcademySoftwareFoundation/openvdb ${{ github.sha }} Signed-off-by: ${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>" + diff --git a/.github/workflows/fvdb.yml b/.github/workflows/fvdb.yml deleted file mode 100644 index 84e7eafcfc..0000000000 --- a/.github/workflows/fvdb.yml +++ /dev/null @@ -1,234 +0,0 @@ -name: fVDB Unit Tests - -on: - push: - branches: - - "pull-request/[0-9]+" - paths-ignore: - - 'CHANGES' - - 'CODEOWNERS' - - 'doc/**' - - 'openvdb/**' - - 'openvdb_cmd/**' - - 'openvdb_ax/**' - - 'openvdb_maya/**' - - 'openvdb_houdini/**' - - 'nanovdb/**' - - 'pendingchanges/**' - - '**.md' - - 'fvdb/debug/**' - - 'fvdb/docs/**' - - 'fvdb/examples/**' - - 'fvdb/notebooks/**' - - 'fvdb/scripts/**' - -# Allow subsequent pushes to the same PR or REF to cancel any previous jobs. 
-concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -permissions: - contents: write - deployments: write - -jobs: - fvdb-build: - if: ${{ !startsWith(github.event.pull_request.title, 'Draft:') }} - name: fVDB Build - runs-on: - - self-hosted - container: - image: aswf/ci-openvdb:2024 - env: - PYTHONPATH: "" - options: --rm - defaults: - run: - shell: bash -el {0} - steps: - - uses: actions/checkout@v4 - - - name: Set up fvdb_build Conda env - uses: conda-incubator/setup-miniconda@v3 - with: - miniforge-version: latest - conda-remove-defaults: "true" - activate-environment: fvdb_build - environment-file: fvdb/env/build_environment.yml - - - - name: Buid fvdb - run: | - cd fvdb; - TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6+PTX" MAX_JOBS=$(($(nproc) < $(free -g | awk '/^Mem:/{jobs=int($4/2.5); if(jobs<1) jobs=1; print jobs}') ? $(nproc) : $(free -g | awk '/^Mem:/{jobs=int($4/2.5); if(jobs<1) jobs=1; print jobs}'))) conda run --no-capture-output -n fvdb_build python setup.py bdist_wheel --dist-dir=dist - - - name: Upload package - uses: actions/upload-artifact@v4 - with: - name: fvdb-test-package - path: fvdb/dist/*.whl - retention-days: 2 - - - name: Cleanup - if: always() - run: | - echo "Cleaning up /__w/_temp directory" - sudo rm -rf /__w/_temp/* - echo "Cleanup completed" - - - fvdb-unit-test: - needs: [fvdb-build] - name: fVDB Unit Tests - runs-on: - - self-hosted - container: - image: aswf/ci-openvdb:2024 - env: - PYTHONPATH: "" - options: --rm - defaults: - run: - shell: bash -el {0} - steps: - - uses: actions/checkout@v4 - - name: Set up fvdb_test Conda env - uses: conda-incubator/setup-miniconda@v3 - with: - miniforge-version: latest - conda-remove-defaults: "true" - activate-environment: fvdb_test - environment-file: fvdb/env/test_environment.yml - - - name: Download package - uses: actions/download-artifact@v4 - with: - name: fvdb-test-package - path: ./dist - - - name: Install package - run: | - conda activate fvdb_test - pip install ./dist/*.whl - - - name: Run tests - run: | - cd fvdb/tests; - pytest -v unit - - - name: Cleanup - if: always() - run: | - echo "Cleaning up /__w/_temp directory" - sudo rm -rf /__w/_temp/* - echo "Cleanup completed" - - fvdb-docs-test: - needs: [fvdb-build] - name: fVDB Documentation Tests - runs-on: - - self-hosted - container: - image: aswf/ci-openvdb:2024 - env: - PYTHONPATH: "" - options: --rm - defaults: - run: - shell: bash -el {0} - steps: - - uses: actions/checkout@v4 - - name: Set up fvdb_test Conda env - uses: conda-incubator/setup-miniconda@v3 - with: - miniforge-version: latest - conda-remove-defaults: "true" - activate-environment: fvdb_test - environment-file: fvdb/env/test_environment.yml - - - name: Download package - uses: actions/download-artifact@v4 - with: - name: fvdb-test-package - path: ./dist - - - name: Install package - run: | - conda activate fvdb_test - pip install ./dist/*.whl - - - name: Run tests - run: | - pytest --markdown-docs fvdb/docs - - - name: Cleanup - if: always() - run: | - echo "Cleaning up /__w/_temp directory" - sudo rm -rf /__w/_temp/* - echo "Cleanup completed" - - fvdb-benchmarks: - needs: [fvdb-build] - name: fVDB Continuous Benchmarking - runs-on: - - self-hosted - container: - image: aswf/ci-openvdb:2024 - env: - PYTHONPATH: "" - options: --rm - defaults: - run: - shell: bash -el {0} - steps: - - uses: actions/checkout@v4 - - name: Set up fvdb_test Conda env - uses: conda-incubator/setup-miniconda@v3 - with: - miniforge-version: 
latest - conda-remove-defaults: "true" - activate-environment: fvdb_test - environment-file: fvdb/env/test_environment.yml - - - name: Download package - uses: actions/download-artifact@v4 - with: - name: fvdb-test-package - path: ./dist - - - name: Install package - run: | - conda activate fvdb_test - pip install ./dist/*.whl - - - name: Disable git ownership verification - run: | - git config --global --add safe.directory "$(pwd)" - - - name: Run benchmarks - run: | - cd fvdb/tests; - pytest benchmark --benchmark-json benchmark/output.json - - - name: Store benchmark result - uses: benchmark-action/github-action-benchmark@v1 - with: - name: Python Benchmark with pytest-benchmark - tool: 'pytest' - output-file-path: fvdb/tests/benchmark/output.json - # Use personal access token instead of GITHUB_TOKEN due to https://github.community/t/github-action-not-triggering-gh-pages-upon-push/16096 - github-token: ${{ secrets.GITHUB_TOKEN }} - auto-push: true - # Show alert with commit comment on detecting possible performance regression - alert-threshold: '200%' - comment-on-alert: true - fail-on-alert: true - alert-comment-cc-users: '@swahtz' - - - name: Cleanup - if: always() - run: | - echo "Cleaning up /__w/_temp directory" - sudo rm -rf /__w/_temp/* - echo "Cleanup completed" diff --git a/.github/workflows/fvdb_codestyle.yml b/.github/workflows/fvdb_codestyle.yml deleted file mode 100644 index c5a5e7af2e..0000000000 --- a/.github/workflows/fvdb_codestyle.yml +++ /dev/null @@ -1,81 +0,0 @@ -name: fVDB Code Style -on: - push: - branches: - - "pull-request/[0-9]+" - paths-ignore: - - 'CHANGES' - - 'CODEOWNERS' - - 'doc/**' - - 'ci/**' - - 'openvdb/**' - - 'openvdb_cmd/**' - - 'openvdb_ax/**' - - 'openvdb_maya/**' - - 'openvdb_houdini/**' - - 'openvdb_wolfram/**' - - 'tsc/**' - - 'nanovdb/**' - - 'pendingchanges/**' - - '**.md' - - 'fvdb/docs/**' - - 'fvdb/env/**' - - 'fvdb/notebooks/**' - - - -# Allow subsequent pushes to the same PR or REF to cancel any previous jobs. 
-concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - test-python-black-lint: - name: Check Python code style with black - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: psf/black@stable - with: - options: "--check --verbose --target-version=py311 --line-length=120" - src: "fvdb/" - version: "~= 24.0" - - test-cpp-clang-format-lint: - name: Check C++ code style with clang-format - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: DoozyX/clang-format-lint-action@v0.18.2 - with: - source: 'fvdb/src/' - extensions: 'h,cpp,cc,cu,cuh' - clangFormatVersion: 18 - style: file - - include-guards: - name: Check include guards - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: swahtz/include-guards-check-action@master - with: - path: 'fvdb/src/' - pattern: 'FVDB_{path}' - - check-spdx-identifiers: - name: Check SPDX identifiers - runs-on: ubuntu-latest - steps: - - name: checkout - uses: actions/checkout@v4 - - uses: swahtz/spdx@feature/ignore_paths - with: - licenses: |- - Apache-2.0 - ignore-paths: |- - openvdb/openvdb/math/Half.cc - openvdb/openvdb/math/Half.h - openvdb_wolfram/OpenVDBLink - openvdb_ax/openvdb_ax/grammar/generated - fvdb/projects/panoptic_segmentation/garfield/garfield diff --git a/.github/workflows/houdini.yml b/.github/workflows/houdini.yml index ac87f2dc2d..28efc703b8 100644 --- a/.github/workflows/houdini.yml +++ b/.github/workflows/houdini.yml @@ -4,7 +4,21 @@ name: Houdini on: push: branches: - - "pull-request/[0-9]+" + - 'master' + - 'feature/**' + - 'pr/**' + paths-ignore: + - 'CHANGES' + - 'CODEOWNERS' + - 'doc/**' + - 'nanovdb/**' + - 'openvdb_maya/**' + - 'fvdb/**' + - 'pendingchanges/**' + - '**.md' + pull_request: + branches: + - '**' paths-ignore: - 'CHANGES' - 'CODEOWNERS' @@ -51,7 +65,7 @@ jobs: if: > ${{ needs.checksecret.outputs.HOUDINI_SECRETS == 'true' || github.repository_owner == 'AcademySoftwareFoundation' }} - runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} name: hou:${{ matrix.config.hou_hash }}-vfx:${{ matrix.config.image }}-cxx:${{ matrix.config.cxx }} container: image: aswf/ci-base:${{ matrix.config.image }} diff --git a/.github/workflows/nanovdb.yml b/.github/workflows/nanovdb.yml index c3ccb3829d..0efc4580a8 100644 --- a/.github/workflows/nanovdb.yml +++ b/.github/workflows/nanovdb.yml @@ -4,7 +4,22 @@ name: NanoVDB on: push: branches: - - "pull-request/[0-9]+" + - 'master' + - 'feature/**' + - 'pr/**' + paths-ignore: + - 'CHANGES' + - 'CODEOWNERS' + - 'doc/**' + - 'openvdb_ax/**' + - 'openvdb_maya/**' + - 'openvdb_houdini/**' + - 'fvdb/**' + - 'pendingchanges/**' + - '**.md' + pull_request: + branches: + - '**' paths-ignore: - 'CHANGES' - 'CODEOWNERS' @@ -36,7 +51,7 @@ jobs: github.event_name != 'workflow_dispatch' || github.event.inputs.type == 'all' || github.event.inputs.type == 'linux' - runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} name: > 
linux-nanovdb:cxx:${{ matrix.config.cxx }}-${{ matrix.config.build }} container: @@ -68,7 +83,6 @@ jobs: --cargs=\' -DUSE_EXPLICIT_INSTANTIATION=OFF -DNANOVDB_USE_CUDA=ON - -DCMAKE_CUDA_ARCHITECTURES="80" -DNANOVDB_USE_OPENVDB=ON -DCMAKE_INSTALL_PREFIX=`pwd` -DUSE_BLOSC=OFF @@ -77,50 +91,49 @@ jobs: - name: test run: cd build && sudo ctest -V -E ".*cuda.*" - # windows-nanovdb: - # if: | - # github.event_name != 'workflow_dispatch' || - # github.event.inputs.type == 'all' || - # github.event.inputs.type == 'win' - # runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }} - # env: - # VCPKG_DEFAULT_TRIPLET: 'x64-windows' - # visual_studio: "Visual Studio 17 2022" - # cuda: "12.4.0" - # strategy: - # fail-fast: false - # steps: - # - uses: actions/checkout@v3 - # - name: path - # run: | - # # note: system path must be modified in a previous step to it's use - # echo "$Env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - # echo "${{github.workspace}}\build\openvdb\openvdb\Release" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - # - name: install_cuda - # shell: powershell - # run: .\ci\install_windows_cuda.ps1 - # - name: install - # shell: powershell - # run: .\ci\install_windows.ps1 - # - name: build - # shell: bash - # run: > - # ./ci/build.sh -v - # --config=Release - # --components=core,nano,nanotest,nanoexam,nanobench,nanotool - # --cargs=\' - # -A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF - # -DMSVC_COMPRESS_PDB=ON - # -DUSE_EXPLICIT_INSTANTIATION=OFF - # -DNANOVDB_USE_CUDA=ON - # -DCMAKE_CUDA_ARCHITECTURES="80" - # -DNANOVDB_USE_OPENVDB=ON - # -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET} - # -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\" - # \' - # - name: test - # shell: bash - # run: cd build && ctest -V -E ".*cuda.*" + windows-nanovdb: + if: | + github.event_name != 'workflow_dispatch' || + github.event.inputs.type == 'all' || + github.event.inputs.type == 'win' + runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }} + env: + VCPKG_DEFAULT_TRIPLET: 'x64-windows' + visual_studio: "Visual Studio 17 2022" + cuda: "12.4.0" + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v3 + - name: path + run: | + # note: system path must be modified in a previous step to it's use + echo "$Env:VCPKG_INSTALLATION_ROOT\installed\x64-windows\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "${{github.workspace}}\build\openvdb\openvdb\Release" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: install_cuda + shell: powershell + run: .\ci\install_windows_cuda.ps1 + - name: install + shell: powershell + run: .\ci\install_windows.ps1 + - name: build + shell: bash + run: > + ./ci/build.sh -v + --config=Release + --components=core,nano,nanotest,nanoexam,nanobench,nanotool + --cargs=\' + -A x64 -G \"Visual Studio 17 2022\" -DOPENVDB_CORE_STATIC=OFF + -DMSVC_COMPRESS_PDB=ON + -DUSE_EXPLICIT_INSTANTIATION=OFF + -DNANOVDB_USE_CUDA=ON + -DNANOVDB_USE_OPENVDB=ON + -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET} + -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\" + \' + - name: test + shell: bash + run: cd build && ctest -V -E ".*cuda.*" macos-nanovdb: if: | @@ -156,7 +169,7 @@ jobs: github.event_name != 'workflow_dispatch' || 
github.event.inputs.type == 'all' || github.event.inputs.type == 'linux' - runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} + runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} container: image: aswf/ci-openvdb:2024 steps: diff --git a/.github/workflows/status_checks.yml b/.github/workflows/status_checks.yml deleted file mode 100644 index c6d3560d00..0000000000 --- a/.github/workflows/status_checks.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: Wait for Status Checks -on: - pull_request: - push: - branches: - - "pull-request/[0-9]+" -jobs: - enforce-all-checks: - runs-on: ubuntu-latest - permissions: - checks: read - steps: - - name: GitHub Checks - uses: poseidon/wait-for-status-checks@v0.6.0 - with: - token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index 2275e843d5..747c8af744 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -30,110 +30,110 @@ jobs: ################################## Houdini ################################## ############################################################################# - # # Check that valid github secrets have been set for the ability to - # # download Houdini and cache it. The secrets are used in download_houdini.py - # checksecret: - # name: Verify Houdini Secrets - # runs-on: ubuntu-latest - # outputs: - # HOUDINI_SECRETS: ${{ steps.check.outputs.HOUDINI_SECRETS }} - # steps: - # - id: check - # env: - # HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }} - # HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }} - # run: echo "HOUDINI_SECRETS=${{ env.HOUDINI_CLIENT_ID != '' && env.HOUDINI_SECRET_KEY != '' }}" >> $GITHUB_OUTPUT - # - name: Skip Next Jobs - # if: steps.check.outputs.HOUDINI_SECRETS != 'true' - # run: echo "HOUDINI_CLIENT_ID and HOUDINI_SECRET_KEY GitHub Action Secrets needs to be set to install Houdini builds" - # # Explicitly error on the ASWF repo, we expect this secret to always exist - # - name: Error ASWF - # if: steps.check.outputs.HOUDINI_SECRETS != 'true' && github.repository_owner == 'AcademySoftwareFoundation' - # run: exit 1 + # Check that valid github secrets have been set for the ability to + # download Houdini and cache it. 
The secrets are used in download_houdini.py + checksecret: + name: Verify Houdini Secrets + runs-on: ubuntu-latest + outputs: + HOUDINI_SECRETS: ${{ steps.check.outputs.HOUDINI_SECRETS }} + steps: + - id: check + env: + HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }} + HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }} + run: echo "HOUDINI_SECRETS=${{ env.HOUDINI_CLIENT_ID != '' && env.HOUDINI_SECRET_KEY != '' }}" >> $GITHUB_OUTPUT + - name: Skip Next Jobs + if: steps.check.outputs.HOUDINI_SECRETS != 'true' + run: echo "HOUDINI_CLIENT_ID and HOUDINI_SECRET_KEY GitHub Action Secrets needs to be set to install Houdini builds" + # Explicitly error on the ASWF repo, we expect this secret to always exist + - name: Error ASWF + if: steps.check.outputs.HOUDINI_SECRETS != 'true' && github.repository_owner == 'AcademySoftwareFoundation' + run: exit 1 - # # download the latest production version of Houdini X, strip out headers, - # # libraries and binaries required for building OpenVDB and put it into - # # the GitHub Actions cache - # linux_houdini: - # needs: [checksecret] - # if: | - # (needs.checksecret.outputs.HOUDINI_SECRETS == 'true') && - # (github.event_name != 'workflow_dispatch' || - # github.event.inputs.type == 'all' || - # github.event.inputs.type == 'houdini') - # runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }} - # name: linux-houdini:${{ matrix.config.hou_hash }} - # env: - # CXX: clang++ - # HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }} - # HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }} - # strategy: - # matrix: - # config: - # - { houdini_version: '20.0', platform: 'linux_x86_64_gcc11.2', hou_hash: '20_0-newabi' } - # - { houdini_version: '20.5', platform: 'linux_x86_64_gcc11.2', hou_hash: '20_5' } - # fail-fast: false - # container: - # image: aswf/ci-base:2024 - # steps: - # - uses: actions/checkout@v3 - # # We bumped from the 2021 CI image to 2023 here to fix some OpenSSL issues - # # with the Houdini download script. In so doing we broke some of the caching - # # between this job and the jobs in houdini.yml which _don't_ use the 2023 - # # image yet. The issue is that the cache action will use zstd if it's - # # available to zip the cache and this causes it to be inserted with a unique - # # hash which images without zstd (i.e. the 2021/2022 images don't have - # # access to). For now, uninstall zstd here instead of installing it - # # everywhere and ask the LF to add zstd to the older base images. 
-  #     - name: remove zstd
-  #       run: yum -y remove zstd
-  #     - name: timestamp
-  #       id: timestamp
-  #       run: echo "timestamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_OUTPUT
-  #     - name: download_houdini
-  #       run: ./ci/download_houdini.sh ${{ matrix.config.houdini_version }} ${{ matrix.config.platform }} --prod
-  #     - name: install_houdini
-  #       run: |
-  #         mkdir $HOME/houdini_install
-  #         cp hou/hou.tar.gz $HOME/houdini_install/hou.tar.gz
-  #         cd $HOME/houdini_install && tar -xzf hou.tar.gz && cd -
-  #     - name: write_houdini_cache
-  #       uses: actions/cache/save@v3
-  #       with:
-  #         path: hou
-  #         key: vdb-v5-houdini${{ matrix.config.hou_hash }}-${{ steps.timestamp.outputs.timestamp }}
+  # download the latest production version of Houdini X, strip out headers,
+  # libraries and binaries required for building OpenVDB and put it into
+  # the GitHub Actions cache
+  linux_houdini:
+    needs: [checksecret]
+    if: |
+      (needs.checksecret.outputs.HOUDINI_SECRETS == 'true') &&
+      (github.event_name != 'workflow_dispatch' ||
+      github.event.inputs.type == 'all' ||
+      github.event.inputs.type == 'houdini')
+    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
+    name: linux-houdini:${{ matrix.config.hou_hash }}
+    env:
+      CXX: clang++
+      HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }}
+      HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }}
+    strategy:
+      matrix:
+        config:
+          - { houdini_version: '20.0', platform: 'linux_x86_64_gcc11.2', hou_hash: '20_0-newabi' }
+          - { houdini_version: '20.5', platform: 'linux_x86_64_gcc11.2', hou_hash: '20_5' }
+      fail-fast: false
+    container:
+      image: aswf/ci-base:2024
+    steps:
+      - uses: actions/checkout@v3
+      # We bumped from the 2021 CI image to 2023 here to fix some OpenSSL issues
+      # with the Houdini download script. In so doing we broke some of the caching
+      # between this job and the jobs in houdini.yml which _don't_ use the 2023
+      # image yet. The issue is that the cache action will use zstd if it's
+      # available to zip the cache and this causes it to be inserted with a unique
+      # hash which images without zstd (i.e. the 2021/2022 images don't have
+      # access to). For now, uninstall zstd here instead of installing it
+      # everywhere and ask the LF to add zstd to the older base images.
+      - name: remove zstd
+        run: yum -y remove zstd
+      - name: timestamp
+        id: timestamp
+        run: echo "timestamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_OUTPUT
+      - name: download_houdini
+        run: ./ci/download_houdini.sh ${{ matrix.config.houdini_version }} ${{ matrix.config.platform }} --prod
+      - name: install_houdini
+        run: |
+          mkdir $HOME/houdini_install
+          cp hou/hou.tar.gz $HOME/houdini_install/hou.tar.gz
+          cd $HOME/houdini_install && tar -xzf hou.tar.gz && cd -
+      - name: write_houdini_cache
+        uses: actions/cache/save@v3
+        with:
+          path: hou
+          key: vdb-v5-houdini${{ matrix.config.hou_hash }}-${{ steps.timestamp.outputs.timestamp }}

-  # macos_houdini:
-  #   needs: [checksecret]
-  #   if: |
-  #     (needs.checksecret.outputs.HOUDINI_SECRETS == 'true') &&
-  #     (github.event_name != 'workflow_dispatch' ||
-  #     github.event.inputs.type == 'all' ||
-  #     github.event.inputs.type == 'houdini')
-  #   # Note that macos-14 (current macos-latest) switches to M1. We could instead test
-  #   # the arm build here instead of the x86 one.
-  #   runs-on: macos-latest
-  #   name: macos-houdini-20
-  #   env:
-  #     HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }}
-  #     HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }}
-  #   steps:
-  #     - uses: actions/checkout@v3
-  #     - name: timestamp
-  #       id: timestamp
-  #       run: echo "timestamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_OUTPUT
-  #     - name: download_houdini
-  #       run: ./ci/download_houdini.sh 20.0 macosx_arm64_clang14.0_13 --prod
-  #     - name: install_houdini
-  #       run: |
-  #         mkdir $HOME/houdini_install
-  #         cp hou/hou.tar.gz $HOME/houdini_install/hou.tar.gz
-  #         cd $HOME/houdini_install && tar -xzf hou.tar.gz && cd -
-  #     - name: write_houdini_cache
-  #       uses: actions/cache/save@v3
-  #       with:
-  #         path: hou
-  #         key: vdb-v5-houdini-macos-${{ steps.timestamp.outputs.timestamp }}
+  macos_houdini:
+    needs: [checksecret]
+    if: |
+      (needs.checksecret.outputs.HOUDINI_SECRETS == 'true') &&
+      (github.event_name != 'workflow_dispatch' ||
+      github.event.inputs.type == 'all' ||
+      github.event.inputs.type == 'houdini')
+    # Note that macos-14 (current macos-latest) switches to M1. We could test
+    # the arm build here instead of the x86 one.
+    runs-on: macos-latest
+    name: macos-houdini-20
+    env:
+      HOUDINI_CLIENT_ID: ${{ secrets.HOUDINI_CLIENT_ID }}
+      HOUDINI_SECRET_KEY: ${{ secrets.HOUDINI_SECRET_KEY }}
+    steps:
+      - uses: actions/checkout@v3
+      - name: timestamp
+        id: timestamp
+        run: echo "timestamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" >> $GITHUB_OUTPUT
+      - name: download_houdini
+        run: ./ci/download_houdini.sh 20.0 macosx_arm64_clang14.0_13 --prod
+      - name: install_houdini
+        run: |
+          mkdir $HOME/houdini_install
+          cp hou/hou.tar.gz $HOME/houdini_install/hou.tar.gz
+          cd $HOME/houdini_install && tar -xzf hou.tar.gz && cd -
+      - name: write_houdini_cache
+        uses: actions/cache/save@v3
+        with:
+          path: hou
+          key: vdb-v5-houdini-macos-${{ steps.timestamp.outputs.timestamp }}

 #############################################################################
 ########################### Core Library Extras #############################
 #############################################################################

@@ -141,84 +141,84 @@ jobs:

   # Extra configuration tests for the OpenVDB Core library. These test a
   # variety of options with newer compilers.
-  # linux-extra:
-  #   if: |
-  #     github.event_name != 'workflow_dispatch' ||
-  #     github.event.inputs.type == 'all' ||
-  #     github.event.inputs.type == 'extra'
-  #   runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
-  #   name: linux-extra:${{ matrix.config.name }}
-  #   container:
-  #     image: aswf/ci-openvdb:2024
-  #   env:
-  #     CXX: clang++
-  #   strategy:
-  #     matrix:
-  #       config:
-  #         - { name: 'all', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=ON -DUSE_ZLIB=ON -DUSE_EXR=ON -DUSE_PNG=ON' }
-  #         - { name: 'lite', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=OFF -DUSE_ZLIB=OFF -DUSE_EXR=OFF -DUSE_PNG=OFF -DOPENVDB_USE_DELAYED_LOADING=OFF' }
-  #         - { name: 'half', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=OFF -DUSE_IMATH_HALF=ON' }
-  #         - { name: 'sse', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=SSE42' }
-  #         - { name: 'avx', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=AVX' }
-  #         - { name: 'numpy', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_NUMPY=ON -DOPENVDB_PYTHON_WRAP_ALL_GRID_TYPES=ON' }
-  #         - { name: 'asan', build: 'asan', components: 'core,test', cmake: '-DNANOVDB_USE_OPENVDB=ON -DOPENVDB_AX_STATIC=OFF -DOPENVDB_CORE_STATIC=OFF -DUSE_BLOSC=OFF' } # We never called blosc_destroy(), so disable blosc to silence these errors
-  #         - { name: 'ubsan', build: 'ubsan', components: 'core,test', cmake: '-DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations" ' }
-  #         - { name: 'c++20', build: 'Release', components: 'core,test', cmake: '-DCMAKE_CXX_STANDARD=20' }
-  #         - { name: 'conf', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON' }
-  #     fail-fast: false
-  #   steps:
-  #     - uses: actions/checkout@v3
-  #     - name: nanobind
-  #       #if: contains(container.image, '2023') == false
-  #       run: ./ci/install_nanobind.sh 2.0.0
-  #     - name: build
-  #       run: >
-  #         ./ci/build.sh -v
-  #         --build-type=${{ matrix.config.build }}
-  #         --components="${{ matrix.config.components }}"
-  #         --cargs=\"-DOPENVDB_CXX_STRICT=ON ${{ matrix.config.cmake }}\"
-  #     - name: test
-  #       run: cd build && ctest -V
+  linux-extra:
+    if: |
+      github.event_name != 'workflow_dispatch' ||
+      github.event.inputs.type == 'all' ||
+      github.event.inputs.type == 'extra'
+    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
+    name: linux-extra:${{ matrix.config.name }}
+    container:
+      image: aswf/ci-openvdb:2024
+    env:
+      CXX: clang++
+    strategy:
+      matrix:
+        config:
+          - { name: 'all', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=ON -DUSE_ZLIB=ON -DUSE_EXR=ON -DUSE_PNG=ON' }
+          - { name: 'lite', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=OFF -DUSE_ZLIB=OFF -DUSE_EXR=OFF -DUSE_PNG=OFF -DOPENVDB_USE_DELAYED_LOADING=OFF' }
+          - { name: 'half', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=OFF -DUSE_IMATH_HALF=ON' }
+          - { name: 'sse', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=SSE42' }
+          - { name: 'avx', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=AVX' }
+          - { name: 'numpy', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_NUMPY=ON -DOPENVDB_PYTHON_WRAP_ALL_GRID_TYPES=ON' }
+          - { name: 'asan', build: 'asan', components: 'core,test', cmake: '-DNANOVDB_USE_OPENVDB=ON -DOPENVDB_AX_STATIC=OFF -DOPENVDB_CORE_STATIC=OFF -DUSE_BLOSC=OFF' } # We never called blosc_destroy(), so disable blosc to silence these errors
+          - { name: 'ubsan', build: 'ubsan', components: 'core,test', cmake: '-DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations" ' }
+          - { name: 'c++20', build: 'Release', components: 'core,test', cmake: '-DCMAKE_CXX_STANDARD=20' }
+          - { name: 'conf', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON' }
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v3
+      - name: nanobind
+        #if: contains(container.image, '2023') == false
+        run: ./ci/install_nanobind.sh 2.0.0
+      - name: build
+        run: >
+          ./ci/build.sh -v
+          --build-type=${{ matrix.config.build }}
+          --components="${{ matrix.config.components }}"
+          --cargs=\"-DOPENVDB_CXX_STRICT=ON ${{ matrix.config.cmake }}\"
+      - name: test
+        run: cd build && ctest -V

-  # # Test latest dependencies, latest compilers and options
-  # latest:
-  #   if: |
-  #     github.event_name != 'workflow_dispatch' ||
-  #     github.event.inputs.type == 'all' ||
-  #     github.event.inputs.type == 'latest'
-  #   runs-on: ${{ matrix.config.runson }}
-  #   env:
-  #     CXX: ${{ matrix.config.cxx }}
-  #   strategy:
-  #     matrix:
-  #       config:
-  #         - { runson: ubuntu-latest, cxx: g++, cmake: '' }
-  #         # Disable the clang job for now. See https://github.com/actions/runner-images/issues/8659
-  #         # - { runson: ubuntu-latest, cxx: clang++, cmake: '' }
-  #         # @todo gcc on macos
-  #         - { runson: macos-latest, cxx: '', cmake: '-DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm@15/bin/clang++ -DLLVM_DIR=/opt/homebrew/opt/llvm@15/lib/cmake/llvm' }
-  #     fail-fast: false
-  #   steps:
-  #     - uses: actions/checkout@v3
-  #     - name: install_deps
-  #       run: |
-  #         if [ "$RUNNER_OS" == "Linux" ]; then
-  #           sudo apt-get -q install -y libboost-dev libboost-iostreams-dev libtbb-dev libblosc-dev llvm-dev libgtest-dev libcppunit-dev
-  #           ./ci/install_nanobind.sh 2.0.0
-  #         elif [ "$RUNNER_OS" == "macOS" ]; then
-  #           ./ci/install_macos.sh 15
-  #           ./ci/install_tbb_macos.sh
-  #         else
-  #           echo "$RUNNER_OS not supported"; exit 1
-  #         fi
-  #     - name: build
-  #       run: >
-  #         ./ci/build.sh -v
-  #         --build-type=Release
-  #         --components=\"core,axcore,python,bin,render,test,axbin\"
-  #         --cargs=\"-DCMAKE_CXX_STANDARD=20 -DOPENVDB_USE_DELAYED_LOADING=OFF -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install ${{ matrix.config.cmake }}\"
-  #     - name: test
-  #       run: cd build && ctest -V
+  # Test latest dependencies, latest compilers and options
+  latest:
+    if: |
+      github.event_name != 'workflow_dispatch' ||
+      github.event.inputs.type == 'all' ||
+      github.event.inputs.type == 'latest'
+    runs-on: ${{ matrix.config.runson }}
+    env:
+      CXX: ${{ matrix.config.cxx }}
+    strategy:
+      matrix:
+        config:
+          - { runson: ubuntu-latest, cxx: g++, cmake: '' }
+          # Disable the clang job for now. See https://github.com/actions/runner-images/issues/8659
+          # - { runson: ubuntu-latest, cxx: clang++, cmake: '' }
+          # @todo gcc on macos
+          - { runson: macos-latest, cxx: '', cmake: '-DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm@15/bin/clang++ -DLLVM_DIR=/opt/homebrew/opt/llvm@15/lib/cmake/llvm' }
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v3
+      - name: install_deps
+        run: |
+          if [ "$RUNNER_OS" == "Linux" ]; then
+            sudo apt-get -q install -y libboost-dev libboost-iostreams-dev libtbb-dev libblosc-dev llvm-dev libgtest-dev libcppunit-dev
+            ./ci/install_nanobind.sh 2.0.0
+          elif [ "$RUNNER_OS" == "macOS" ]; then
+            ./ci/install_macos.sh 15
+            ./ci/install_tbb_macos.sh
+          else
+            echo "$RUNNER_OS not supported"; exit 1
+          fi
+      - name: build
+        run: >
+          ./ci/build.sh -v
+          --build-type=Release
+          --components=\"core,axcore,python,bin,render,test,axbin\"
+          --cargs=\"-DCMAKE_CXX_STANDARD=20 -DOPENVDB_USE_DELAYED_LOADING=OFF -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install ${{ matrix.config.cmake }}\"
+      - name: test
+        run: cd build && ctest -V

   windows:
     # Windows CI. Tests static and dynamic builds with MT and MD respectively.
@@ -279,152 +279,152 @@ jobs:
 ############################ AX Library Extras ##############################
 #############################################################################

-  # linux-ax:
-  #   if: |
-  #     github.event_name != 'workflow_dispatch' ||
-  #     github.event.inputs.type == 'all' ||
-  #     github.event.inputs.type == 'ax'
-  #   runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
-  #   name: >
-  #     linux-ax:${{ matrix.config.image }}-cxx:${{ matrix.config.cxx }}-${{ matrix.config.build }}
-  #   container:
-  #     image: aswf/ci-openvdb:${{ matrix.config.image }}
-  #   env:
-  #     CXX: ${{ matrix.config.cxx }}
-  #   strategy:
-  #     matrix:
-  #       config:
-  #         # Unified
-  #         - { image: '2023-clang15', cxx: 'clang++', build: 'Release', components: 'core,bin,axcore,axbin,axtest', cmake: '' }
-  #         - { image: '2023-clang15', cxx: 'g++', build: 'Release', components: 'core,bin,axcore,axbin,axtest', cmake: '' }
-  #     fail-fast: false
-  #   steps:
-  #     - uses: actions/checkout@v3
-  #     - name: nanobind
-  #       #f: contains(matrix.config.image, '2023') == false
-  #       run: ./ci/install_nanobind.sh 2.0.0
-  #     - name: build
-  #       run: >
-  #         ./ci/build.sh -v
-  #         --build-type=${{ matrix.config.build }}
-  #         --components=${{ matrix.config.components }}
-  #         --cargs=\"
-  #         ${{ matrix.config.cmake }}
-  #         -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON
-  #         -DUSE_EXPLICIT_INSTANTIATION=OFF
-  #         -DOPENVDB_CXX_STRICT=ON
-  #         \"
-  #     - name: clean
-  #       if: matrix.config.components == 'core'
-  #       run: rm -rf build
-  #     - name: build
-  #       if: matrix.config.components == 'core'
-  #       run: >
-  #         ./ci/build.sh -v
-  #         --build-type=${{ matrix.config.build }}
-  #         --components="bin,axcore,axbin,axtest,python"
-  #         --cargs=\"
-  #         ${{ matrix.config.cmake }}
-  #         -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON
-  #         -DUSE_EXPLICIT_INSTANTIATION=OFF
-  #         -DOPENVDB_CXX_STRICT=ON
-  #         \"
-  #     - name: test
-  #       run: cd build && ctest -V
-  #     - name: test_doxygen_examples
-  #       run: ./ci/extract_test_examples.sh
+  linux-ax:
+    if: |
+      github.event_name != 'workflow_dispatch' ||
+      github.event.inputs.type == 'all' ||
+      github.event.inputs.type == 'ax'
+    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
+    name: >
+      linux-ax:${{ matrix.config.image }}-cxx:${{ matrix.config.cxx }}-${{ matrix.config.build }}
+    container:
+      image: aswf/ci-openvdb:${{ matrix.config.image }}
+    env:
+      CXX: ${{ matrix.config.cxx }}
+    strategy:
+      matrix:
+        config:
+          # Unified
+          - { image: '2023-clang15', cxx: 'clang++', build: 'Release', components: 'core,bin,axcore,axbin,axtest', cmake: '' }
+          - { image: '2023-clang15', cxx: 'g++', build: 'Release', components: 'core,bin,axcore,axbin,axtest', cmake: '' }
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v3
+      - name: nanobind
+        #if: contains(matrix.config.image, '2023') == false
+        run: ./ci/install_nanobind.sh 2.0.0
+      - name: build
+        run: >
+          ./ci/build.sh -v
+          --build-type=${{ matrix.config.build }}
+          --components=${{ matrix.config.components }}
+          --cargs=\"
+          ${{ matrix.config.cmake }}
+          -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON
+          -DUSE_EXPLICIT_INSTANTIATION=OFF
+          -DOPENVDB_CXX_STRICT=ON
+          \"
+      - name: clean
+        if: matrix.config.components == 'core'
+        run: rm -rf build
+      - name: build
+        if: matrix.config.components == 'core'
+        run: >
+          ./ci/build.sh -v
+          --build-type=${{ matrix.config.build }}
+          --components="bin,axcore,axbin,axtest,python"
+          --cargs=\"
+          ${{ matrix.config.cmake }}
+          -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON
+          -DUSE_EXPLICIT_INSTANTIATION=OFF
+          -DOPENVDB_CXX_STRICT=ON
+          \"
+      - name: test
+        run: cd build && ctest -V
+      - name: test_doxygen_examples
+        run: ./ci/extract_test_examples.sh

-  # macos-ax:
-  #   if: |
-  #     github.event_name != 'workflow_dispatch' ||
-  #     github.event.inputs.type == 'all' ||
-  #     github.event.inputs.type == 'ax'
-  #   runs-on: macos-13
-  #   name: macos-cxx:${{ matrix.config.cxx }}-llvm:${{ matrix.config.llvm }}-${{ matrix.config.build }}
-  #   env:
-  #     CXX: ${{ matrix.config.cxx }}
-  #   strategy:
-  #     matrix:
-  #       config:
-  #         - { cxx: 'clang++', build: 'Release', llvm: '15' }
-  #     fail-fast: false
-  #   steps:
-  #     - uses: actions/checkout@v3
-  #     - name: install_deps
-  #       run: |
-  #         ./ci/install_macos.sh ${{ matrix.config.llvm }}
-  #         ./ci/install_tbb_macos.sh
-  #     - name: build
-  #       run: >
-  #         ./ci/build.sh -v
-  #         --build-type=${{ matrix.config.build }}
-  #         --components="core,python,bin,axcore,axbin,axtest"
-  #         --cargs=\"
-  #         -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON
-  #         -DUSE_EXPLICIT_INSTANTIATION=OFF
-  #         -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install
-  #         -DLLVM_DIR=/opt/homebrew/opt/llvm@${{ matrix.config.llvm }}/lib/cmake/llvm
-  #         \"
-  #     - name: test
-  #       run: cd build && ctest -V
-  #     - name: test_doxygen_examples
-  #       run: ./ci/extract_test_examples.sh
+  macos-ax:
+    if: |
+      github.event_name != 'workflow_dispatch' ||
+      github.event.inputs.type == 'all' ||
+      github.event.inputs.type == 'ax'
+    runs-on: macos-13
+    name: macos-cxx:${{ matrix.config.cxx }}-llvm:${{ matrix.config.llvm }}-${{ matrix.config.build }}
+    env:
+      CXX: ${{ matrix.config.cxx }}
+    strategy:
+      matrix:
+        config:
+          - { cxx: 'clang++', build: 'Release', llvm: '15' }
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v3
+      - name: install_deps
+        run: |
+          ./ci/install_macos.sh ${{ matrix.config.llvm }}
+          ./ci/install_tbb_macos.sh
+      - name: build
+        run: >
+          ./ci/build.sh -v
+          --build-type=${{ matrix.config.build }}
+          --components="core,python,bin,axcore,axbin,axtest"
+          --cargs=\"
+          -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON
+          -DUSE_EXPLICIT_INSTANTIATION=OFF
+          -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/install
+          -DLLVM_DIR=/opt/homebrew/opt/llvm@${{ matrix.config.llvm }}/lib/cmake/llvm
+          \"
+      - name: test
+        run: cd build && ctest -V
+      - name: test_doxygen_examples
+        run: ./ci/extract_test_examples.sh

-  # windows-ax:
-  #   if: |
-  #     github.event_name != 'workflow_dispatch' ||
-  #     github.event.inputs.type == 'all' ||
-  #     github.event.inputs.type == 'ax'
-  #   runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }}
-  #   name: windows-vc:${{ matrix.config.vc }}-type:${{ matrix.config.build }}
-  #   env:
-  #     VCPKG_DEFAULT_TRIPLET: ${{ matrix.config.vc }}
-  #     # Export this with '' avoid bash treating \ as escape
-  #     VDB_INSTALL_PREFIX: '${{ github.workspace }}\\install'
-  #   strategy:
-  #     matrix:
-  #       config:
-  #         # static build of blosc from vcpkg does not build internal sources.
-  #         # USE_STATIC_DEPENDENCIES is required for IlmBase/OpenEXR defines and
-  #         # Boost as both shared and static libs are installed.
-  #         # @todo We don't currently run the axtests with shared builds of ax
-  #         # due to symbol issues using LLVM as a static lib (which is the only
-  #         # option on Windows).
-  #         - { vc: 'x64-windows', crt: 'MD', components: 'core,bin,axcore,axbin,python', build: 'Release', cmake: '-DOPENVDB_CORE_STATIC=OFF -DOPENVDB_AX_STATIC=OFF' }
-  #         - { vc: 'x64-windows-static', crt: 'MT', components: 'core,bin,axcore,axbin,axtest', build: 'Release', cmake: '-DOPENVDB_CORE_SHARED=OFF -DOPENVDB_AX_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON' }
-  #         - { vc: 'x64-windows-static', crt: 'MTd', components: 'core,bin,axcore,axbin,axtest', build: 'Debug', cmake: '-DOPENVDB_CORE_SHARED=OFF -DOPENVDB_AX_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON' }
-  #     fail-fast: false
-  #   steps:
-  #     - uses: actions/checkout@v3
-  #     - name: llvm
-  #       run: ./ci/install_llvm_windows.sh ${{ matrix.config.crt }}
-  #     - name: install
-  #       shell: powershell
-  #       run: .\ci\install_windows.ps1
-  #     - name: build
-  #       run: >
-  #         ./ci/build.sh -v
-  #         --config=${{ matrix.config.build }}
-  #         --components="${{ matrix.config.components }}"
-  #         --cargs=\'
-  #         -A x64 -G \"Visual Studio 17 2022\"
-  #         -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET}
-  #         -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\"
-  #         -DMSVC_COMPRESS_PDB=ON
-  #         -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON
-  #         -DUSE_EXPLICIT_INSTANTIATION=OFF
-  #         -DLLVM_DIR=\"${HOME}\\llvm_install\\lib\\cmake\\llvm\"
-  #         -DCMAKE_INSTALL_PREFIX=\"${VDB_INSTALL_PREFIX}\"
-  #         ${{ matrix.config.cmake }}
-  #         \'
-  #     - name: runtime_path
-  #       shell: pwsh
-  #       run: |
-  #         # note: system path must be modified in a previous step to it's use
-  #         echo "$Env:VCPKG_INSTALLATION_ROOT\installed\${{ matrix.config.vc }}\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-  #         echo "$Env:VDB_INSTALL_PREFIX\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-  #     - name: test
-  #       run: cd build && ctest -V -C ${{ matrix.config.build }}
+  windows-ax:
+    if: |
+      github.event_name != 'workflow_dispatch' ||
+      github.event.inputs.type == 'all' ||
+      github.event.inputs.type == 'ax'
+    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'windows-2022-8c-32g-300h') || 'windows-latest' }}
+    name: windows-vc:${{ matrix.config.vc }}-type:${{ matrix.config.build }}
+    env:
+      VCPKG_DEFAULT_TRIPLET: ${{ matrix.config.vc }}
+      # Export this with '' to avoid bash treating \ as escape
+      VDB_INSTALL_PREFIX: '${{ github.workspace }}\\install'
+    strategy:
+      matrix:
+        config:
+          # static build of blosc from vcpkg does not build internal sources.
+          # USE_STATIC_DEPENDENCIES is required for IlmBase/OpenEXR defines and
+          # Boost as both shared and static libs are installed.
+          # @todo We don't currently run the axtests with shared builds of ax
+          # due to symbol issues using LLVM as a static lib (which is the only
+          # option on Windows).
+          - { vc: 'x64-windows', crt: 'MD', components: 'core,bin,axcore,axbin,python', build: 'Release', cmake: '-DOPENVDB_CORE_STATIC=OFF -DOPENVDB_AX_STATIC=OFF' }
+          - { vc: 'x64-windows-static', crt: 'MT', components: 'core,bin,axcore,axbin,axtest', build: 'Release', cmake: '-DOPENVDB_CORE_SHARED=OFF -DOPENVDB_AX_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON' }
+          - { vc: 'x64-windows-static', crt: 'MTd', components: 'core,bin,axcore,axbin,axtest', build: 'Debug', cmake: '-DOPENVDB_CORE_SHARED=OFF -DOPENVDB_AX_SHARED=OFF -DUSE_STATIC_DEPENDENCIES=ON -DBLOSC_USE_EXTERNAL_SOURCES=ON' }
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v3
+      - name: llvm
+        run: ./ci/install_llvm_windows.sh ${{ matrix.config.crt }}
+      - name: install
+        shell: powershell
+        run: .\ci\install_windows.ps1
+      - name: build
+        run: >
+          ./ci/build.sh -v
+          --config=${{ matrix.config.build }}
+          --components="${{ matrix.config.components }}"
+          --cargs=\'
+          -A x64 -G \"Visual Studio 17 2022\"
+          -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET}
+          -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\"
+          -DMSVC_COMPRESS_PDB=ON
+          -DOPENVDB_AX_TEST_CMD_DOWNLOADS=ON
+          -DUSE_EXPLICIT_INSTANTIATION=OFF
+          -DLLVM_DIR=\"${HOME}\\llvm_install\\lib\\cmake\\llvm\"
+          -DCMAKE_INSTALL_PREFIX=\"${VDB_INSTALL_PREFIX}\"
+          ${{ matrix.config.cmake }}
+          \'
+      - name: runtime_path
+        shell: pwsh
+        run: |
+          # note: system path must be modified in a previous step prior to its use
+          echo "$Env:VCPKG_INSTALLATION_ROOT\installed\${{ matrix.config.vc }}\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "$Env:VDB_INSTALL_PREFIX\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: test
+        run: cd build && ctest -V -C ${{ matrix.config.build }}

 #############################################################################
 ################################## Blosc ####################################
 #############################################################################

@@ -488,110 +488,110 @@ jobs:
 ################################## Blosc ####################################
 #############################################################################

-  # linux-blosc:
-  #   if: |
-  #     github.event_name != 'workflow_dispatch' ||
-  #     github.event.inputs.type == 'all' ||
-  #     github.event.inputs.type == 'blosc'
-  #   runs-on: ${{ (github.repository_owner == 'NVIDIA-Omniverse' && 'linux-amd64-cpu32') || (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
-  #   name: linux-blosc:${{ matrix.blosc }}
-  #   container:
-  #     image: aswf/ci-base:2023
-  #   strategy:
-  #     matrix:
-  #       blosc: ['1.18.0','1.19.0','1.20.0','1.21.0']
-  #     fail-fast: false
-  #   steps:
-  #     - uses: actions/checkout@v3
-  #     - name: install_blosc
-  #       run: sudo ./ci/install_blosc.sh ${{ matrix.blosc }}
-  #     - name: build
-  #       run: >
-  #         sudo ./ci/build.sh -v
-  #         --build-type=Release
-  #         --components=\"core,test\"
-  #     - name: test
-  #       run: cd build && sudo ctest -V
+  linux-blosc:
+    if: |
+      github.event_name != 'workflow_dispatch' ||
+      github.event.inputs.type == 'all' ||
+      github.event.inputs.type == 'blosc'
+    runs-on: ${{ (github.repository_owner == 'AcademySoftwareFoundation' && 'ubuntu-20.04-8c-32g-300h') || 'ubuntu-latest' }}
+    name: linux-blosc:${{ matrix.blosc }}
+    container:
+      image: aswf/ci-base:2023
+    strategy:
+      matrix:
+        blosc: ['1.18.0','1.19.0','1.20.0','1.21.0']
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v3
+      - name: install_blosc
+        run: sudo ./ci/install_blosc.sh ${{ matrix.blosc }}
+      - name: build
+        run: >
+          sudo ./ci/build.sh -v
+          --build-type=Release
+          --components=\"core,test\"
+      - name: test
+        run: cd build && sudo ctest -V

 #############################################################################
 ################################## ABI ######################################
 #############################################################################

-  # linux-abi-checker:
-  #   if: |
-  #     github.event_name == 'workflow_dispatch' &&
-  #     (github.event.inputs.type == 'all' ||
-  #     github.event.inputs.type == 'abi')
-  #   runs-on: ubuntu-22.04
-  #   env:
-  #     # The 'abicheck' build type sets these, but older versions of the library
-  #     # may not have this build type. See OpenVDBCXX.cmake
-  #     CXXFLAGS: "-gdwarf-4 -g3 -ggdb -Og"
-  #   steps:
-  #     - name: Enable Node 16
-  #       run: |
-  #         echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV
-  #     - uses: actions/checkout@v3
-  #       with:
-  #         fetch-depth: 0
-  #         fetch-tags: true
-  #     # Compute the latest major version - that is used as our baseline
-  #     # note: For CI forks, make sure you have your tags synced
-  #     - name: get_major_version
-  #       run: |
-  #         LATEST_VERSION_TAG=$(git tag --merged | sort --version-sort | tail -n1)
-  #         echo "Computed latest VDB tag: ${LATEST_VERSION_TAG}"
-  #         VDB_MAJOR_VERSION=$(echo ${LATEST_VERSION_TAG} | cut -f1 -d '.' | tr -d -c 0-9)
-  #         echo "Using major version: ${VDB_MAJOR_VERSION}"
-  #         echo "VDB_MAJOR_VERSION=${VDB_MAJOR_VERSION}" >> "$GITHUB_ENV"
-  #     - name: install_deps
-  #       run: sudo apt-get -q install -y libboost-iostreams-dev libtbb-dev libblosc-dev elfutils
-  #     - name: install_abi_checker
-  #       run: sudo apt-get -q install -y abi-dumper abi-compliance-checker
-  #     - name: build_new
-  #       run: >
-  #         ./ci/build.sh -v
-  #         --build-dir=build_new
-  #         --build-type=abicheck
-  #         --target=openvdb_shared
-  #         --components=\"core\"
-  #         --cargs=\'-DUSE_EXPLICIT_INSTANTIATION=OFF -DDISABLE_DEPENDENCY_VERSION_CHECKS=ON\'
-  #     - name: checkout_baseline
-  #       run: git checkout v${VDB_MAJOR_VERSION}.0.0
-  #     - name: build_old
-  #       run: >
-  #         ./ci/build.sh -v
-  #         --build-dir=build_old
-  #         --build-type=abicheck
-  #         --target=openvdb_shared
-  #         --components=\"core\"
-  #         --cargs=\'-DUSE_EXPLICIT_INSTANTIATION=OFF -DDISABLE_DEPENDENCY_VERSION_CHECKS=ON\'
-  #     - name: abi_dump
-  #       run: |
-  #         abi-dumper build_new/openvdb/openvdb/libopenvdb.so -o ABI-NEW.dump -lver 1
-  #         abi-dumper build_old/openvdb/openvdb/libopenvdb.so -o ABI-OLD.dump -lver 2
-  #     # Replace the version namespace in the latest ABI dump with the baseline
-  #     # version we're comparing against. We should probably instead build the
-  #     # latest with the baseline version number but no CMake/defines allow us to
-  #     # do this.
-  #     - name: replace_symbols
-  #       run: sed -i -E 's/openvdb([^v]*)v[0-9]*_[0-9]/openvdb\1v'${VDB_MAJOR_VERSION}'_0/g' ABI-NEW.dump
-  #     - name: abi_check
-  #       # -strict treats warnings as errors
-  #       # -extended checks all member data
-  #       # we check everything _not_ in openvdb::**::internal namespace
-  #       run: >
-  #         abi-compliance-checker -l OPENVDB
-  #         -old ABI-OLD.dump
-  #         -new ABI-NEW.dump
-  #         -skip-internal-symbols "\d(openvdb.*internal)"
-  #         -skip-internal-types "(openvdb.*internal)::"
-  #         -strict
-  #         -extended
-  #     - name: upload_report
-  #       uses: actions/upload-artifact@v4
-  #       if: always()
-  #       with:
-  #         name: abi_report
-  #         path: ./compat_reports/OPENVDB/2_to_1/compat_report.html
-  #         retention-days: 5
+  linux-abi-checker:
+    if: |
+      github.event_name == 'workflow_dispatch' &&
+      (github.event.inputs.type == 'all' ||
+      github.event.inputs.type == 'abi')
+    runs-on: ubuntu-22.04
+    env:
+      # The 'abicheck' build type sets these, but older versions of the library
+      # may not have this build type. See OpenVDBCXX.cmake
+      CXXFLAGS: "-gdwarf-4 -g3 -ggdb -Og"
+    steps:
+      - name: Enable Node 16
+        run: |
+          echo "ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION=true" >> $GITHUB_ENV
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+      # Compute the latest major version - that is used as our baseline
+      # note: For CI forks, make sure you have your tags synced
+      - name: get_major_version
+        run: |
+          LATEST_VERSION_TAG=$(git tag --merged | sort --version-sort | tail -n1)
+          echo "Computed latest VDB tag: ${LATEST_VERSION_TAG}"
+          VDB_MAJOR_VERSION=$(echo ${LATEST_VERSION_TAG} | cut -f1 -d '.' | tr -d -c 0-9)
+          echo "Using major version: ${VDB_MAJOR_VERSION}"
+          echo "VDB_MAJOR_VERSION=${VDB_MAJOR_VERSION}" >> "$GITHUB_ENV"
+      - name: install_deps
+        run: sudo apt-get -q install -y libboost-iostreams-dev libtbb-dev libblosc-dev elfutils
+      - name: install_abi_checker
+        run: sudo apt-get -q install -y abi-dumper abi-compliance-checker
+      - name: build_new
+        run: >
+          ./ci/build.sh -v
+          --build-dir=build_new
+          --build-type=abicheck
+          --target=openvdb_shared
+          --components=\"core\"
+          --cargs=\'-DUSE_EXPLICIT_INSTANTIATION=OFF -DDISABLE_DEPENDENCY_VERSION_CHECKS=ON\'
+      - name: checkout_baseline
+        run: git checkout v${VDB_MAJOR_VERSION}.0.0
+      - name: build_old
+        run: >
+          ./ci/build.sh -v
+          --build-dir=build_old
+          --build-type=abicheck
+          --target=openvdb_shared
+          --components=\"core\"
+          --cargs=\'-DUSE_EXPLICIT_INSTANTIATION=OFF -DDISABLE_DEPENDENCY_VERSION_CHECKS=ON\'
+      - name: abi_dump
+        run: |
+          abi-dumper build_new/openvdb/openvdb/libopenvdb.so -o ABI-NEW.dump -lver 1
+          abi-dumper build_old/openvdb/openvdb/libopenvdb.so -o ABI-OLD.dump -lver 2
+      # Replace the version namespace in the latest ABI dump with the baseline
+      # version we're comparing against. We should probably instead build the
+      # latest with the baseline version number but no CMake/defines allow us to
+      # do this.
+      - name: replace_symbols
+        run: sed -i -E 's/openvdb([^v]*)v[0-9]*_[0-9]/openvdb\1v'${VDB_MAJOR_VERSION}'_0/g' ABI-NEW.dump
+      - name: abi_check
+        # -strict treats warnings as errors
+        # -extended checks all member data
+        # we check everything _not_ in openvdb::**::internal namespace
+        run: >
+          abi-compliance-checker -l OPENVDB
+          -old ABI-OLD.dump
+          -new ABI-NEW.dump
+          -skip-internal-symbols "\d(openvdb.*internal)"
+          -skip-internal-types "(openvdb.*internal)::"
+          -strict
+          -extended
+      - name: upload_report
+        uses: actions/upload-artifact@v3
+        if: always()
+        with:
+          name: abi_report
+          path: ./compat_reports/OPENVDB/2_to_1/compat_report.html
+          retention-days: 5
diff --git a/.github/workflows/whitespace.yml b/.github/workflows/whitespace.yml
index 213da0e5c6..fb2e0187b1 100644
--- a/.github/workflows/whitespace.yml
+++ b/.github/workflows/whitespace.yml
@@ -1,9 +1,12 @@
+
 name: Whitespace

 on:
   push:
-    branches:
-      - "pull-request/[0-9]+"
+    paths-ignore:
+      - 'pendingchanges/**'
+      - 'tsc/meetings/**'
+  pull_request:
     paths-ignore:
       - 'pendingchanges/**'
       - 'tsc/meetings/**'
@@ -18,7 +21,7 @@ jobs:
   trailingspaces:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
       - name: test
         run: |
           set +e
@@ -29,9 +32,9 @@ jobs:
   spacesnottabs:
     runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
      - name: test
        run: |
          set +e
-          git grep -n "	" -- ':!*/whitespace.yml' ':!tsc/meetings/*' ':!*.svg' ':!*.cmd' ':!*.png' ':!pendingchanges/*' ':!*.wlt' ':!*.jpg' ':!*.gif' ':!*.mp4' ':!*.pt' ':!*.pth' ':!*.nvdb' ':!*.npz' ':!*.gitmodules'
+          git grep -n "	" -- ':!*/whitespace.yml' ':!tsc/meetings/*' ':!*.svg' ':!*.cmd' ':!*.png' ':!pendingchanges/*' ':!*.wlt' ':!*.jpg' ':!*.gif' ':!*.mp4' ':!*.pt' ':!*.pth' ':!*.nvdb' ':!*.npz'
           test $? -eq 1

From 8145597272a43a565413e27caf8b536906e4f32b Mon Sep 17 00:00:00 2001
From: Jonathan Swartz
Date: Wed, 22 Jan 2025 14:49:50 +1300
Subject: [PATCH 59/59] Adding back fix to NanoVDB CI for test failures

Signed-off-by: Jonathan Swartz
---
 .github/workflows/nanovdb.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/nanovdb.yml b/.github/workflows/nanovdb.yml
index 0efc4580a8..fdaa3d3a36 100644
--- a/.github/workflows/nanovdb.yml
+++ b/.github/workflows/nanovdb.yml
@@ -83,6 +83,7 @@ jobs:
           --cargs=\'
           -DUSE_EXPLICIT_INSTANTIATION=OFF
           -DNANOVDB_USE_CUDA=ON
+          -DCMAKE_CUDA_ARCHITECTURES="80"
           -DNANOVDB_USE_OPENVDB=ON
           -DCMAKE_INSTALL_PREFIX=`pwd`
           -DUSE_BLOSC=OFF
@@ -127,6 +128,7 @@ jobs:
           -DMSVC_COMPRESS_PDB=ON
           -DUSE_EXPLICIT_INSTANTIATION=OFF
           -DNANOVDB_USE_CUDA=ON
+          -DCMAKE_CUDA_ARCHITECTURES="80"
           -DNANOVDB_USE_OPENVDB=ON
           -DVCPKG_TARGET_TRIPLET=${VCPKG_DEFAULT_TRIPLET}
           -DCMAKE_TOOLCHAIN_FILE=\"${VCPKG_INSTALLATION_ROOT}\\scripts\\buildsystems\\vcpkg.cmake\"
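
A note on the CUDA flag the final patch adds: CMAKE_CUDA_ARCHITECTURES is the standard CMake (3.18+) variable that pins which SM targets nvcc generates device code for; "80" restricts compilation to SM 8.0 (Ampere-class GPUs), which typically shortens CI builds compared to compiling for every supported architecture. A minimal sketch of the same setting in a standalone project follows; the project name and source file are hypothetical and not part of this patch series.

    # Hypothetical CMakeLists.txt illustrating CMAKE_CUDA_ARCHITECTURES;
    # not from the OpenVDB repository.
    cmake_minimum_required(VERSION 3.18)
    project(cuda_arch_demo LANGUAGES CXX CUDA)

    # Equivalent to passing -DCMAKE_CUDA_ARCHITECTURES="80" on the configure line:
    # generate device code only for SM 8.0.
    set(CMAKE_CUDA_ARCHITECTURES 80)

    add_executable(demo demo.cu)  # demo.cu is a placeholder source file

The variable initializes the CUDA_ARCHITECTURES property on targets created after it is set, so it must appear before add_executable/add_library, just as the -D flag above applies at configure time.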