Optimize the input state-vector copy into the LGPU (#1071)

LuisAlfredoNu · ringo-but-quantum · web-flow · commit 9e26a9118a17 · 2025-03-05T13:21:52.000-05:00
**Context:** Memory profiling of several algorithms running on LGPU showed a memory bottleneck in the Python layer: the peak memory usage is 3 times what the computation needs. ![image](https://github.com/user-attachments/assets/24eebcf0-49f2-45e1-a63f-71da26cb8dd4) **Description of the Change:** Remove temporary allocations and skip the index computation for common cases. * Remove the temporary GPU allocation for input values and indices. * The input state vector is copied directly from the host if **the target wires are contiguous and start at the most/least significant wires** (which are the most common cases). * In the case of custom target wires, LGPU follows the previous algorithm, but the index computation is sped up through parallel computing with OpenMP. **Benefits:** Using a test algorithm with 31 qubits produces the following memory profile: ![newplot(3)](https://github.com/user-attachments/assets/8c74bfb7-f7ed-4759-bd7d-54e23e8e23df) The memory peak is reduced from 100GB to 66GB. Note: `memray` measures all memory allocations, even the GPU `cudaMallocX` ones. Using the following toy circuit ```python state_init = random_normalize_sv(wires-1) target_wires = wires[:-1] dev = qml.device("lightning.gpu", wires=wires) def circuit(): qml.StatePrep(input_state, wires=target_wires) return qml.expval(qml.PauliZ(0)) ``` produces the following times: ![image](https://github.com/user-attachments/assets/9fff27a9-bd3b-42b4-9dd6-88ce3119ee07) ![image](https://github.com/user-attachments/assets/e151ed5e-4a0f-445e-af1c-3de502617f44) **Possible Drawbacks:** **Related GitHub Issues:** [sc-58833] --------- Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
@@ -6,6 +6,9 @@
 
 ### Improvements
 
+* Optimize the copy of an input state-vector into the LGPU.
+  [(#1071)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1071)
+
 * Fix wheel naming in Docker builds for `setuptools v75.8.1` compatibility.
   [(#1075)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1075)
 
diff --git a/pennylane_lightning/core/_state_vector_base.py b/pennylane_lightning/core/_state_vector_base.py
@@ -184,7 +184,7 @@ def apply_operations(
         # State preparation is currently done in Python
         if operations:  # make sure operations[0] exists
             if isinstance(operations[0], StatePrep):
-                self._apply_state_vector(operations[0].parameters[0].copy(), operations[0].wires)
+                self._apply_state_vector(operations[0].parameters[0], operations[0].wires)
                 operations = operations[1:]
             elif isinstance(operations[0], BasisState):
                 self._apply_basis_state(operations[0].parameters[0], operations[0].wires)
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
 Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.41.0-dev28"
+__version__ = "0.41.0-dev29"
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -16,6 +16,7 @@
  */
 #pragma once
 
+#include <algorithm>
 #include <random>
 #include <type_traits>
 #include <unordered_map>
@@ -159,8 +160,8 @@ class StateVectorCudaManaged
      */
     void resetStateVector(bool use_async = false) {
         BaseType::getDataBuffer().zeroInit();
-        std::size_t index = 0;
-        ComplexT value(1.0, 0.0);
+        constexpr std::size_t index = 0;
+        constexpr ComplexT value(1.0, 0.0);
         setBasisState_(value, index, use_async);
     };
 
@@ -200,14 +201,14 @@ class StateVectorCudaManaged
      * @brief Set values for a batch of elements of the state-vector.
      *
      * @param state_ptr Pointer to the initial state data.
-     * @param num_states Length of the initial state data.
+     * @param state_size Length of the initial state data.
      * @param wires Wires.
      * @param use_async Use an asynchronous memory copy. Default is false.
      */
-    void setStateVector(const ComplexT *state_ptr, const std::size_t num_states,
+    void setStateVector(const ComplexT *state_ptr, const std::size_t state_size,
                         const std::vector<std::size_t> &wires,
                         bool use_async = false) {
-        PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()),
+        PL_ABORT_IF_NOT(state_size == Pennylane::Util::exp2(wires.size()),
                         "Inconsistent state and wires dimensions.");
 
         const auto num_qubits = BaseType::getNumQubits();
@@ -222,21 +223,45 @@ class StateVectorCudaManaged
             typename std::conditional<std::is_same<PrecisionT, float>::value,
                                       int32_t, int64_t>::type;
 
-        // Calculate the indices of the state-vector to be set.
-        // TODO: Could move to GPU calculation if the state size is large.
-        std::vector<index_type> indices(num_states);
-        const std::size_t num_wires = wires.size();
-        constexpr std::size_t one{1U};
-        for (std::size_t i = 0; i < num_states; i++) {
-            std::size_t index{0U};
-            for (std::size_t j = 0; j < num_wires; j++) {
-                const std::size_t bit = (i & (one << j)) >> j;
-                index |= bit << (num_qubits - 1 - wires[num_wires - 1 - j]);
+        const bool is_wires_sorted_contiguous =
+            std::is_sorted(wires.begin(), wires.end()) &&
+            wires.front() + wires.size() - 1 == wires.back();
+
+        const bool is_left_significant = wires.front() == 0;
+        const bool is_side_significant =
+            is_left_significant || wires.back() == num_qubits - 1;
+
+        if (is_wires_sorted_contiguous && is_side_significant) {
+            // Set most common case: contiguous wires
+            setSortedContiguousStateVector_<index_type>(
+                state_size, state_ptr, wires, is_left_significant, use_async);
+        } else {
+            // Set the state-vector for non-contiguous wires
+            std::vector<index_type> indices(state_size);
+
+            // Calculate the indices of the state-vector to be set.
+            // TODO: Could move to GPU calculation if the state size is large.
+#pragma omp parallel shared(state_size, num_qubits, indices, wires)
+            {
+                const std::size_t num_wires = wires.size();
+                auto local_wires = wires;
+
+#pragma omp for
+                for (std::size_t i = 0; i < state_size; i++) {
+                    constexpr std::size_t one{1U};
+                    std::size_t index{0U};
+                    for (std::size_t j = 0; j < num_wires; j++) {
+                        const std::size_t bit = (i & (one << j)) >> j;
+                        index |= bit << (num_qubits - 1 -
+                                         local_wires[num_wires - 1 - j]);
+                    }
+                    indices[i] = static_cast<index_type>(index);
+                }
             }
-            indices[i] = static_cast<index_type>(index);
+            // set the state-vector
+            setStateVector_<index_type>(state_size, state_ptr, indices.data(),
+                                        use_async);
         }
-        setStateVector_<index_type>(num_states, state_ptr, indices.data(),
-                                    use_async);
     }
 
     /**
@@ -2128,6 +2153,40 @@ class StateVectorCudaManaged
                            stream_id);
     }
 
+    /**
+     * @brief Copy a host state straight into the state-vector when the target
+     * wires are sorted, contiguous and touch one end of the register. The
+     * buffer is zeroed, then filled by a plain or strided host-to-device copy.
+     *
+     * @tparam index_type Integer value type.
+     * @param num_indices Number of elements to be passed to the state vector.
+     * @param values Pointer to values to be set for the target elements.
+     * @param wires Wires of the target elements.
+     * @param is_left_significant If true, the target wires start from zero.
+     * Otherwise, the last target wire matches the last qubit.
+     * @param async Use an asynchronous memory copy.
+     */
+    template <class index_type>
+    void setSortedContiguousStateVector_(const index_type num_indices,
+                                         const std::complex<PrecisionT> *values,
+                                         const std::vector<std::size_t> &wires,
+                                         const bool is_left_significant = false,
+                                         const bool async = false) {
+        BaseType::getDataBuffer().zeroInit();
+        const auto length = static_cast<std::size_t>(num_indices);
+
+        // A stride of one is a contiguous copy, so skip the row-wise 2D path.
+        if (is_left_significant && wires.size() < BaseType::getNumQubits()) {
+            const std::size_t stride =
+                std::size_t{1} << (BaseType::getNumQubits() - wires.size());
+            BaseType::getDataBuffer().CopyHostDataToGpuWithStride(
+                values, length, stride, async);
+        } else {
+            BaseType::getDataBuffer().CopyHostDataToGpu(values, length,
+                                                        std::size_t{0}, async);
+        }
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+    }
+
     /**
      * @brief Set values for a batch of elements of the state-vector. This
      * method is implemented by the customized CUDA kernel defined in the
@@ -2140,7 +2199,7 @@ class StateVectorCudaManaged
      */
     template <class index_type, std::size_t thread_per_block = 256>
     void setStateVector_(const index_type num_indices,
-                         const std::complex<Precision> *values,
+                         const std::complex<PrecisionT> *values,
                          const index_type *indices, const bool async = false) {
         BaseType::getDataBuffer().zeroInit();
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp
@@ -1108,11 +1108,50 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetStateVector",
             "the host") {
         auto init_state =
             createRandomStateVectorData<PrecisionT>(re, num_qubits);
-        auto expected_state = init_state;
 
+        StateVectorCudaManaged<TestType> sv{num_qubits};
+
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
+
+        sv.setStateVector(values.data(), values.size(),
+                          std::vector<std::size_t>{0, 1, 2});
+        CHECK(init_state == Pennylane::Util::approx(sv.getDataVector()));
+    }
+
+    SECTION("Set state vector with values and their corresponding indices on "
+            "the host for a subset of wires right significant") {
+        auto init_state =
+            createRandomStateVectorData<PrecisionT>(re, num_qubits - 1);
+
+        std::vector<std::complex<PrecisionT>> expected_state(
+            Pennylane::Util::exp2(num_qubits), {0, 0});
+
+        std::copy(init_state.begin(), init_state.end(), expected_state.begin());
+
+        StateVectorCudaManaged<TestType> sv{num_qubits};
+
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
+
+        sv.setStateVector(values.data(), values.size(),
+                          std::vector<std::size_t>{1, 2});
+        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
+    }
+
+    SECTION("Set state vector with values and their corresponding indices on "
+            "the host for a subset of wires left significant") {
+        auto init_state =
+            createRandomStateVectorData<PrecisionT>(re, num_qubits - 1);
+
+        std::vector<std::complex<PrecisionT>> expected_state(
+            Pennylane::Util::exp2(num_qubits), {0, 0});
+
+        // Distributing along the base vector with a stride.
+        // Stride is 2**(n_qubits - n_target_wires)
         for (std::size_t i = 0; i < Pennylane::Util::exp2(num_qubits - 1);
              i++) {
-            std::swap(expected_state[i * 2], expected_state[i * 2 + 1]);
+            expected_state[i * 2] = init_state[i];
         }
 
         StateVectorCudaManaged<TestType> sv{num_qubits};
@@ -1121,8 +1160,53 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetStateVector",
                                                      init_state.end());
 
         sv.setStateVector(values.data(), values.size(),
-                          std::vector<std::size_t>{0, 1, 2});
-        CHECK(init_state == Pennylane::Util::approx(sv.getDataVector()));
+                          std::vector<std::size_t>{0, 1});
+        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
+    }
+    SECTION("Set state vector with values and their corresponding indices on "
+            "the host for a subset of wires non-consecutive") {
+        auto init_state =
+            createRandomStateVectorData<PrecisionT>(re, num_qubits - 1);
+
+        std::vector<std::complex<PrecisionT>> expected_state(
+            Pennylane::Util::exp2(num_qubits), {0, 0});
+
+        expected_state[0] = init_state[0];
+        expected_state[1] = init_state[1];
+        expected_state[4] = init_state[2];
+        expected_state[5] = init_state[3];
+
+        StateVectorCudaManaged<TestType> sv{num_qubits};
+
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
+
+        sv.setStateVector(values.data(), values.size(),
+                          std::vector<std::size_t>{0, 2});
+        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
+    }
+    SECTION("Set state vector with values and their corresponding indices on "
+            "the host for a subset of wires consecutive and not significant") {
+        std::size_t num_qubits_local = 4;
+        auto init_state =
+            createRandomStateVectorData<PrecisionT>(re, num_qubits_local - 2);
+
+        std::vector<std::complex<PrecisionT>> expected_state(
+            Pennylane::Util::exp2(num_qubits_local), {0, 0});
+
+        expected_state[0] = init_state[0];
+        expected_state[2] = init_state[1];
+        expected_state[4] = init_state[2];
+        expected_state[6] = init_state[3];
+
+        StateVectorCudaManaged<TestType> sv{num_qubits_local};
+
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
+
+        sv.setStateVector(values.data(), values.size(),
+                          std::vector<std::size_t>{1, 2});
+        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
     }
 }
 
diff --git a/pennylane_lightning/core/src/utils/cuda_utils/DataBuffer.hpp b/pennylane_lightning/core/src/utils/cuda_utils/DataBuffer.hpp
@@ -213,6 +213,73 @@ template <class GPUDataT, class DevTagT = int> class DataBuffer {
         }
     }
 
+    /**
+     * @brief Explicitly copy data from host memory to GPU device with an
+     * offset. PL_ABORT_IF fires if the copy would pass the GPU buffer end.
+     *
+     * @tparam HostDataT Host data type.
+     *
+     * @param host_in Host data buffer.
+     * @param length Number of HostDataT elements to copy.
+     * @param offset Offset in the GPU buffer, in GPU buffer elements.
+     * @param async Asynchronous copy flag.
+     *
+     */
+    template <class HostDataT = GPUDataT>
+    void CopyHostDataToGpu(const HostDataT *host_in, std::size_t length,
+                           std::size_t offset, bool async = false) {
+        PL_ABORT_IF(
+            (getLength() * sizeof(GPUDataT)) <
+                (offset * sizeof(GPUDataT) + length * sizeof(HostDataT)),
+            "Sizes do not match for host & GPU data. Please ensure the source "
+            "buffer is not out of bounds of the destination buffer");
+
+        if (async) {
+            PL_CUDA_IS_SUCCESS(cudaMemcpyAsync(
+                getData() + offset, host_in, sizeof(HostDataT) * length,
+                cudaMemcpyHostToDevice, getStream()));
+        } else {
+            PL_CUDA_IS_SUCCESS(cudaMemcpy(getData() + offset, host_in,
+                                          sizeof(HostDataT) * length,
+                                          cudaMemcpyDefault));
+        }
+    }
+
+    /**
+     * @brief Explicitly copy data from host memory to GPU device with a stride.
+     * Host element i lands at GPU element i * stride (one 2D-copy row each).
+     *
+     * @tparam HostDataT Host data type.
+     * @param host_in Host data buffer, read contiguously.
+     * @param length Number of elements to copy.
+     * @param stride Spacing between written GPU elements, in GPUDataT units.
+     * @param async Asynchronous copy flag.
+     *
+     */
+    template <class HostDataT = GPUDataT>
+    void CopyHostDataToGpuWithStride(const HostDataT *host_in,
+                                     std::size_t length, std::size_t stride,
+                                     bool async = false) {
+        // The destination extent is set by the GPU pitch, so compare the
+        // element counts of the GPU buffer rather than host-side byte sizes.
+        PL_ABORT_IF(
+            getLength() < (stride * length),
+            "Sizes do not match for host & GPU data. Please ensure the source "
+            "buffer is not out of bounds of the destination buffer or the "
+            "stride is too large");
+        if (async) {
+            PL_CUDA_IS_SUCCESS(
+                cudaMemcpy2DAsync(getData(), sizeof(GPUDataT) * stride, host_in,
+                                  sizeof(HostDataT), sizeof(HostDataT), length,
+                                  cudaMemcpyHostToDevice, getStream()));
+        } else {
+            PL_CUDA_IS_SUCCESS(
+                cudaMemcpy2D(getData(), sizeof(GPUDataT) * stride, host_in,
+                             sizeof(HostDataT), sizeof(HostDataT), length,
+                             cudaMemcpyHostToDevice));
+        }
+    }
+
     /**
      * @brief Explicitly copy data from GPU device to host memory.
      *
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -214,10 +214,10 @@ def _apply_state_vector(self, state, device_wires, use_async: bool = False):
             # state = state_data
 
         state = self._asarray(state, dtype=self.dtype)  # this operation on host
-        output_shape = [2] * self._num_local_wires
 
         if len(device_wires) == self.num_wires and Wires(sorted(device_wires)) == device_wires:
             # Initialize the entire device state with the input state
+            output_shape = [2] * self._num_local_wires
             if self.num_wires == self._num_local_wires:
                 self.syncH2D(np.reshape(state, output_shape))
                 return