PennyLaneAI · vincentmr · Jul 19, 2024 · Jul 18, 2024 · Jul 18, 2024 · Jul 18, 2024
diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
@@ -15,6 +15,9 @@
 
 ### Improvements
 
+* Optimize the OpenMP parallelization of Lightning-Qubit's `probs` for any number of targets.
+  [(#807)](https://github.com/PennyLaneAI/pennylane-lightning/pull/807)
+
 * Add GPU device compute capability check for Lightning-Tensor.
   [(#803)](https://github.com/PennyLaneAI/pennylane-lightning/pull/803)
 

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.38.0-dev11"
+__version__ = "0.38.0-dev12"
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
@@ -105,7 +105,12 @@ class Measurements final
           [[maybe_unused]] const std::vector<std::size_t> &device_wires = {})
         -> std::vector<PrecisionT> {
         const std::size_t n_wires = wires.size();
+        if (n_wires == 0) {
+            return {1.0};
+        }
         const std::size_t num_qubits = this->_statevector.getNumQubits();
+        // is_equal_to_all_wires is true if `wires` includes all wires in order
+        // and false otherwise.
         bool is_equal_to_all_wires = n_wires == num_qubits;
         for (std::size_t k = 0; k < n_wires; k++) {
             if (!is_equal_to_all_wires) {
@@ -119,7 +124,7 @@ class Measurements final
 
         const ComplexT *arr_data = this->_statevector.getData();
 
-        // Templated 1-4 qubit cases
+        // Templated 1-4 wire cases; each returns the probabilities.
         PROBS_SPECIAL_CASE(1);
         PROBS_SPECIAL_CASE(2);
         PROBS_SPECIAL_CASE(3);
@@ -132,15 +137,18 @@ class Measurements final
         const std::size_t n_probs = PUtil::exp2(n_wires);
         std::vector<PrecisionT> probabilities(n_probs, 0);
         auto *probs = probabilities.data();
-        std::size_t ind_probs = 0;
-        for (auto index : all_indices) {
+        // For 5 wires and more, there are at least 32 probs entries to
+        // parallelize over. This scheme was found most favorable in terms of
+        // memory accesses and it prevents the stack overflow caused by
+        // `reduction(+ : probs[ : n_probs])` when n_probs approaches 2**20.
 #if defined PL_LQ_KERNEL_OMP && defined _OPENMP
-#pragma omp parallel for reduction(+ : probs[ : n_probs])
+#pragma omp parallel for collapse(1)
 #endif
+        for (std::size_t ind_probs = 0; ind_probs < n_probs; ind_probs++) {
             for (auto offset : all_offsets) {
-                probs[ind_probs] += std::norm(arr_data[index + offset]);
+                probs[ind_probs] +=
+                    std::norm(arr_data[all_indices[ind_probs] + offset]);
             }
-            ind_probs++;
         }
         return probabilities;
     }