PennyLaneAI · LuisAlfredoNu · Mar 5, 2025 · Feb 14, 2025 · Feb 27, 2025 · Feb 27, 2025
diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
@@ -6,6 +6,9 @@
 
 ### Improvements
 
+* Optimize the copy of an input state-vector into the LGPU.
+  [(#1071)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1071)
+
 * Fix wheel naming in Docker builds for `setuptools v75.8.1` compatibility.
   [(#1075)](https://github.com/PennyLaneAI/pennylane-lightning/pull/1075)
 

diff --git a/pennylane_lightning/core/_state_vector_base.py b/pennylane_lightning/core/_state_vector_base.py
@@ -184,7 +184,7 @@ def apply_operations(
         # State preparation is currently done in Python
         if operations:  # make sure operations[0] exists
             if isinstance(operations[0], StatePrep):
-                self._apply_state_vector(operations[0].parameters[0].copy(), operations[0].wires)
+                self._apply_state_vector(operations[0].parameters[0], operations[0].wires)
                 operations = operations[1:]
             elif isinstance(operations[0], BasisState):
                 self._apply_basis_state(operations[0].parameters[0], operations[0].wires)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
 Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.41.0-dev28"
+__version__ = "0.41.0-dev29"
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -16,6 +16,7 @@
  */
 #pragma once
 
+#include <algorithm>
 #include <random>
 #include <type_traits>
 #include <unordered_map>
@@ -159,8 +160,8 @@ class StateVectorCudaManaged
      */
     void resetStateVector(bool use_async = false) {
         BaseType::getDataBuffer().zeroInit();
-        std::size_t index = 0;
-        ComplexT value(1.0, 0.0);
+        constexpr std::size_t index = 0;    // index passed to setBasisState_
+        constexpr ComplexT value(1.0, 0.0); // presumably amplitude 1 + 0i
         setBasisState_(value, index, use_async);
     };
 
@@ -200,14 +201,14 @@ class StateVectorCudaManaged
      * @brief Set values for a batch of elements of the state-vector.
      *
      * @param state_ptr Pointer to the initial state data.
-     * @param num_states Length of the initial state data.
+     * @param state_size Length of the initial state data.
      * @param wires Wires.
      * @param use_async Use an asynchronous memory copy. Default is false.
      */
-    void setStateVector(const ComplexT *state_ptr, const std::size_t num_states,
+    void setStateVector(const ComplexT *state_ptr, const std::size_t state_size,
                         const std::vector<std::size_t> &wires,
                         bool use_async = false) {
-        PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()),
+        PL_ABORT_IF_NOT(state_size == Pennylane::Util::exp2(wires.size()),
                         "Inconsistent state and wires dimensions.");
 
         const auto num_qubits = BaseType::getNumQubits();
@@ -222,21 +223,45 @@ class StateVectorCudaManaged
             typename std::conditional<std::is_same<PrecisionT, float>::value,
                                       int32_t, int64_t>::type;
 
-        // Calculate the indices of the state-vector to be set.
-        // TODO: Could move to GPU calculation if the state size is large.
-        std::vector<index_type> indices(num_states);
-        const std::size_t num_wires = wires.size();
-        constexpr std::size_t one{1U};
-        for (std::size_t i = 0; i < num_states; i++) {
-            std::size_t index{0U};
-            for (std::size_t j = 0; j < num_wires; j++) {
-                const std::size_t bit = (i & (one << j)) >> j;
-                index |= bit << (num_qubits - 1 - wires[num_wires - 1 - j]);
+        const bool is_wires_sorted_contiguous =
+            std::is_sorted(wires.begin(), wires.end()) &&
+            wires.front() + wires.size() - 1 == wires.back();
+
+        const bool is_left_significant = wires.front() == 0;
+        const bool is_side_significant =
+            is_left_significant || wires.back() == num_qubits - 1;
+
+        if (is_wires_sorted_contiguous && is_side_significant) {
+            // Set most common case: contiguous wires
+            setSortedContiguousStateVector_<index_type>(
+                state_size, state_ptr, wires, is_left_significant, use_async);
+        } else {
+            // Set the state-vector for non-contiguous wires
+            std::vector<index_type> indices(state_size);
+
+            // Calculate the indices of the state-vector to be set.
+            // TODO: Could move to GPU calculation if the state size is large.
+#pragma omp parallel shared(state_size, num_qubits, indices, wires)
+            {
+                const std::size_t num_wires = wires.size();
+                auto local_wires = wires;
+
+#pragma omp for
+                for (std::size_t i = 0; i < state_size; i++) {
+                    constexpr std::size_t one{1U};
+                    std::size_t index{0U};
+                    for (std::size_t j = 0; j < num_wires; j++) {
+                        const std::size_t bit = (i & (one << j)) >> j;
+                        index |= bit << (num_qubits - 1 -
+                                         local_wires[num_wires - 1 - j]);
+                    }
+                    indices[i] = static_cast<index_type>(index);
+                }
             }
-            indices[i] = static_cast<index_type>(index);
+            // set the state-vector
+            setStateVector_<index_type>(state_size, state_ptr, indices.data(),
+                                        use_async);
         }
-        setStateVector_<index_type>(num_states, state_ptr, indices.data(),
-                                    use_async);
     }
 
     /**
@@ -2128,6 +2153,40 @@ class StateVectorCudaManaged
                            stream_id);
     }
 
+    /**
+     * @brief Set the state-vector from host values whose target wires are
+     * sorted, contiguous and include the first or the last qubit. The buffer
+     * is zeroed, then filled with a (strided) host-to-device memory copy.
+     *
+     * @tparam index_type Integer value type.
+     *
+     * @param num_indices Number of elements to be passed to the state vector.
+     * @param values Pointer to values to be set for the target elements.
+     * @param wires Wires of the target elements.
+     * @param is_left_significant If true, the target wires start from zero.
+     * Otherwise, the last target wire matches the last qubit.
+     * @param async Use an asynchronous memory copy.
+     */
+    template <class index_type>
+    void setSortedContiguousStateVector_(const index_type num_indices,
+                                         const std::complex<PrecisionT> *values,
+                                         const std::vector<std::size_t> &wires,
+                                         const bool is_left_significant = false,
+                                         const bool async = false) {
+        BaseType::getDataBuffer().zeroInit();
+        // A zero shift means every qubit is targeted: use the linear copy.
+        const std::size_t shift =
+            is_left_significant ? BaseType::getNumQubits() - wires.size() : 0;
+        if (shift > 0) {
+            BaseType::getDataBuffer().CopyHostDataToGpuWithStride(
+                values, num_indices, std::size_t{1} << shift, async);
+        } else {
+            BaseType::getDataBuffer().CopyHostDataToGpu(values, num_indices,
+                                                        std::size_t(0), async);
+        }
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+    }
+
     /**
      * @brief Set values for a batch of elements of the state-vector. This
      * method is implemented by the customized CUDA kernel defined in the
@@ -2140,7 +2199,7 @@ class StateVectorCudaManaged
      */
     template <class index_type, std::size_t thread_per_block = 256>
     void setStateVector_(const index_type num_indices,
-                         const std::complex<Precision> *values,
+                         const std::complex<PrecisionT> *values,
                          const index_type *indices, const bool async = false) {
         BaseType::getDataBuffer().zeroInit();
 

diff --git a/...ng/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp b/...ng/core/src/simulators/lightning_gpu/gates/tests/Test_StateVectorCudaManaged_NonParam.cpp
@@ -1108,11 +1108,50 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetStateVector",
             "the host") {
         auto init_state =
             createRandomStateVectorData<PrecisionT>(re, num_qubits);
-        auto expected_state = init_state;
 
+        StateVectorCudaManaged<TestType> sv{num_qubits};
+
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
+
+        sv.setStateVector(values.data(), values.size(),
+                          std::vector<std::size_t>{0, 1, 2});
+        CHECK(init_state == Pennylane::Util::approx(sv.getDataVector()));
+    }
+
+    SECTION("Set state vector with values and their corresponding indices on "
+            "the host for a subset of wires right significant") {
+        auto init_state =
+            createRandomStateVectorData<PrecisionT>(re, num_qubits - 1);
+
+        std::vector<std::complex<PrecisionT>> expected_state(
+            Pennylane::Util::exp2(num_qubits), {0, 0});
+
+        std::copy(init_state.begin(), init_state.end(), expected_state.begin());
+
+        StateVectorCudaManaged<TestType> sv{num_qubits};
+
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
+
+        sv.setStateVector(values.data(), values.size(),
+                          std::vector<std::size_t>{1, 2});
+        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
+    }
+
+    SECTION("Set state vector with values and their corresponding indices on "
+            "the host for a subset of wires left significant") {
+        auto init_state =
+            createRandomStateVectorData<PrecisionT>(re, num_qubits - 1);
+
+        std::vector<std::complex<PrecisionT>> expected_state(
+            Pennylane::Util::exp2(num_qubits), {0, 0});
+
+        // Distributing along the base vector with a stride.
+        // Stride is 2**(n_qubits - n_target_wires)
         for (std::size_t i = 0; i < Pennylane::Util::exp2(num_qubits - 1);
              i++) {
-            std::swap(expected_state[i * 2], expected_state[i * 2 + 1]);
+            expected_state[i * 2] = init_state[i];
         }
 
         StateVectorCudaManaged<TestType> sv{num_qubits};
@@ -1121,8 +1160,53 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::SetStateVector",
                                                      init_state.end());
 
         sv.setStateVector(values.data(), values.size(),
-                          std::vector<std::size_t>{0, 1, 2});
-        CHECK(init_state == Pennylane::Util::approx(sv.getDataVector()));
+                          std::vector<std::size_t>{0, 1});
+        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
+    }
+    SECTION("Set state vector with values and their corresponding indices on "
+            "the host for a subset of wires non-consecutive") {
+        auto init_state =
+            createRandomStateVectorData<PrecisionT>(re, num_qubits - 1);
+
+        std::vector<std::complex<PrecisionT>> expected_state(
+            Pennylane::Util::exp2(num_qubits), {0, 0});
+
+        expected_state[0] = init_state[0];
+        expected_state[1] = init_state[1];
+        expected_state[4] = init_state[2];
+        expected_state[5] = init_state[3];
+
+        StateVectorCudaManaged<TestType> sv{num_qubits};
+
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
+
+        sv.setStateVector(values.data(), values.size(),
+                          std::vector<std::size_t>{0, 2});
+        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
+    }
+    SECTION("Set state vector with values and their corresponding indices on "
+            "the host for a subset of wires consecutive and not significant") {
+        std::size_t num_qubits_local = 4;
+        auto init_state =
+            createRandomStateVectorData<PrecisionT>(re, num_qubits_local - 2);
+
+        std::vector<std::complex<PrecisionT>> expected_state(
+            Pennylane::Util::exp2(num_qubits_local), {0, 0});
+
+        expected_state[0] = init_state[0];
+        expected_state[2] = init_state[1];
+        expected_state[4] = init_state[2];
+        expected_state[6] = init_state[3];
+
+        StateVectorCudaManaged<TestType> sv{num_qubits_local};
+
+        std::vector<std::complex<PrecisionT>> values(init_state.begin(),
+                                                     init_state.end());
+
+        sv.setStateVector(values.data(), values.size(),
+                          std::vector<std::size_t>{1, 2});
+        CHECK(expected_state == Pennylane::Util::approx(sv.getDataVector()));
     }
 }
 

diff --git a/pennylane_lightning/core/src/utils/cuda_utils/DataBuffer.hpp b/pennylane_lightning/core/src/utils/cuda_utils/DataBuffer.hpp
@@ -213,6 +213,73 @@
         }
     }
 
+    /**
+     * @brief Explicitly copy data from host memory to GPU device with an
+     * offset. Aborts if the copy would overrun the GPU buffer.
+     *
+     * @tparam HostDataT Host data type.
+     *
+     * @param host_in Host data buffer.
+     * @param length Number of elements to copy.
+     * @param offset Offset (in GPUDataT elements) into the GPU buffer.
+     * @param async Asynchronous copy flag.
+     *
+     */
+    template <class HostDataT = GPUDataT>
+    void CopyHostDataToGpu(const HostDataT *host_in, std::size_t length,
+                           std::size_t offset, bool async = false) {
+        PL_ABORT_IF(
+            (getLength() * sizeof(GPUDataT)) <
+                ((offset + length) * sizeof(HostDataT)),
+            "Sizes do not match for host & GPU data. Please ensure the source "
+            "buffer plus offset fits within the destination buffer");
+        // Byte count follows the host element type, matching the check above.
+        if (async) {
+            PL_CUDA_IS_SUCCESS(cudaMemcpyAsync(
+                getData() + offset, host_in, sizeof(HostDataT) * length,
+                cudaMemcpyHostToDevice, getStream()));
+        } else {
+            PL_CUDA_IS_SUCCESS(cudaMemcpy(getData() + offset, host_in,
+                                          sizeof(HostDataT) * length,
+                                          cudaMemcpyDefault));
+        }
+    }
+
+    /**
+     * @brief Explicitly copy data from host memory to GPU device with a stride.
+     * Host element i is written to GPU element i * stride.
+     *
+     * @tparam HostDataT Host data type.
+     *
+     * @param host_in Host data buffer.
+     * @param length Number of elements to copy.
+     * @param stride Stride (in elements) in the GPU buffer.
+     * @param async Asynchronous copy flag.
+     */
+    template <class HostDataT = GPUDataT>
+    void CopyHostDataToGpuWithStride(const HostDataT *host_in,
+                                     std::size_t length, std::size_t stride,
+                                     bool async = false) {
+        PL_ABORT_IF(
+            (getLength() * sizeof(GPUDataT)) <
+                ((stride * length) * sizeof(HostDataT)),
+            "Sizes do not match for host & GPU data. Please ensure the source "
+            "buffer fits within the destination buffer and the stride is not "
+            "too large");
+        // One element per row: the destination pitch realises the stride.
+        if (async) {
+            PL_CUDA_IS_SUCCESS(
+                cudaMemcpy2DAsync(getData(), sizeof(GPUDataT) * stride, host_in,
+                                  sizeof(HostDataT), sizeof(HostDataT), length,
+                                  cudaMemcpyHostToDevice, getStream()));
+        } else {
+            PL_CUDA_IS_SUCCESS(
+                cudaMemcpy2D(getData(), sizeof(GPUDataT) * stride, host_in,
+                             sizeof(HostDataT), sizeof(HostDataT), length,
+                             cudaMemcpyHostToDevice));
+        }
+    }
+
     /**
      * @brief Explicitly copy data from GPU device to host memory.
      *

diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -214,10 +214,10 @@
             # state = state_data
 
         state = self._asarray(state, dtype=self.dtype)  # this operation on host
-        output_shape = [2] * self._num_local_wires
 
         if len(device_wires) == self.num_wires and Wires(sorted(device_wires)) == device_wires:
             # Initialize the entire device state with the input state
+            output_shape = [2] * self._num_local_wires
             if self.num_wires == self._num_local_wires:
                 self.syncH2D(np.reshape(state, output_shape))
                 return