From 4a6ee275201c21fdff8f33ba0fdfa7f532f4c43e Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Fri, 19 Jul 2024 18:19:45 +0000
Subject: [PATCH 01/18] Put alias method code in own class. Add bindings for
 generate_sample(wires, num_shots).

---
 .../core/src/bindings/Bindings.hpp            |  34 ++++--
 .../measurements/MeasurementKernels.hpp       |  86 ++++++++++++++
 .../measurements/MeasurementsLQubit.hpp       | 111 ++++++------------
 .../lightning_qubit/_measurements.py          |   2 +-
 .../lightning_qubit/_state_vector.py          |   2 +-
 5 files changed, 148 insertions(+), 87 deletions(-)

diff --git a/pennylane_lightning/core/src/bindings/Bindings.hpp b/pennylane_lightning/core/src/bindings/Bindings.hpp
index fe72f1bb86..6ab4da7472 100644
--- a/pennylane_lightning/core/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/core/src/bindings/Bindings.hpp
@@ -189,7 +189,7 @@ auto alignedNumpyArray(CPUMemoryModel memory_model, std::size_t size,
  * @param size Size of the array to create
  * @param dt Pybind11's datatype object
  */
-auto allocateAlignedArray(size_t size, const py::dtype &dt,
+auto allocateAlignedArray(std::size_t size, const py::dtype &dt,
                           bool zeroInit = false) -> py::array {
     // TODO: Move memset operations to here to reduce zeroInit pass-throughs.
     auto memory_model = bestCPUMemoryModel();
@@ -474,13 +474,33 @@ void registerBackendAgnosticMeasurements(PyClass &pyclass) {
                 return M.var(*ob);
             },
             "Variance of an observable object.")
+        .def("generate_samples",
+             [](Measurements<StateVectorT> &M, std::size_t num_wires,
+                std::size_t num_shots) {
+                 auto &&result = M.generate_samples(num_shots);
+                 const std::size_t ndim = 2;
+                 const std::vector<std::size_t> shape{num_shots, num_wires};
+                 constexpr auto sz = sizeof(std::size_t);
+                 const std::vector<std::size_t> strides{sz * num_wires, sz};
+                 // return 2-D NumPy array
+                 return py::array(py::buffer_info(
+                     result.data(), /* data as contiguous array  */
+                     sz,            /* size of one scalar        */
+                     py::format_descriptor<std::size_t>::format(), /* data type
+                                                                    */
+                     ndim,   /* number of dimensions      */
+                     shape,  /* shape of the matrix       */
+                     strides /* strides for each axis     */
+                     ));
+             })
         .def("generate_samples", [](Measurements<StateVectorT> &M,
-                                    std::size_t num_wires,
-                                    std::size_t num_shots) {
-            auto &&result = M.generate_samples(num_shots);
+                                    const std::vector<std::size_t> &wires,
+                                    const std::size_t num_shots) {
+            const std::size_t num_wires = wires.size();
+            auto &&result = M.generate_samples(wires, num_shots);
             const std::size_t ndim = 2;
             const std::vector<std::size_t> shape{num_shots, num_wires};
-            constexpr auto sz = sizeof(size_t);
+            constexpr auto sz = sizeof(std::size_t);
             const std::vector<std::size_t> strides{sz * num_wires, sz};
             // return 2-D NumPy array
             return py::array(py::buffer_info(
@@ -559,7 +579,7 @@ void registerBackendAgnosticAlgorithms(py::module_ &m) {
         .def("__repr__", [](const OpsData<StateVectorT> &ops) {
             using namespace Pennylane::Util;
             std::ostringstream ops_stream;
-            for (size_t op = 0; op < ops.getSize(); op++) {
+            for (std::size_t op = 0; op < ops.getSize(); op++) {
                 ops_stream << "{'name': " << ops.getOpsName()[op];
                 ops_stream << ", 'params': " << ops.getOpsParams()[op];
                 ops_stream << ", 'inv': " << ops.getOpsInverses()[op];
@@ -591,7 +611,7 @@ void registerBackendAgnosticAlgorithms(py::module_ &m) {
            const std::vector<std::vector<bool>> &ops_controlled_values) {
             std::vector<std::vector<ComplexT>> conv_matrices(
                 ops_matrices.size());
-            for (size_t op = 0; op < ops_name.size(); op++) {
+            for (std::size_t op = 0; op < ops_name.size(); op++) {
                 const auto m_buffer = ops_matrices[op].request();
                 if (m_buffer.size) {
                     const auto m_ptr =
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
index 066db5ad31..9b33d0451b 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
@@ -18,7 +18,13 @@
  */
 #pragma once
 
+// #include <algorithm>
+// #include <functional>
+// #include <limits>
+// #include <random>
+
 #include <complex>
+#include <stack>
 #include <vector>
 
 #include "BitUtil.hpp"
@@ -296,6 +302,86 @@ namespace PUtil = Pennylane::Util;
 
 namespace Pennylane::LightningQubit::Measures {
 
+/**
+ * @brief Generate samples using the alias method.
+ *
+ * @tparam PrecisionT Precision data type
+ */
+template <typename PrecisionT> class discrete_random_variable {
+  private:
+    const std::vector<std::pair<double, std::size_t>> bucket_partners_;
+    std::mt19937 &gen_;
+    const std::size_t n_probs;
+    mutable std::uniform_real_distribution<PrecisionT> distribution{0.0, 1.0};
+
+  public:
+    /**
+     * @brief Create a discrete_random_variable object.
+     *
+     * @param gen Random number generator reference.
+     * @param probs Probabilities for values 0 up to N - 1, where N =
+     * probs.size().
+     */
+    discrete_random_variable(std::mt19937 &gen,
+                             const std::vector<PrecisionT> &probs)
+        : bucket_partners_(init_bucket_partners_(probs)), gen_{gen},
+          n_probs{probs.size()} {}
+
+    /**
+     * @brief Return a discrete random value.
+     */
+    std::size_t operator()() const {
+        const std::size_t idx =
+            static_cast<std::size_t>(distribution(gen_) * n_probs);
+        if (distribution(gen_) >= bucket_partners_[idx].first and
+            bucket_partners_[idx].second !=
+                std::numeric_limits<std::size_t>::max()) {
+            return bucket_partners_[idx].second;
+        } else {
+            return idx;
+        }
+    }
+
+  private:
+    /**
+     * @brief Initialize the probability table of the alias method.
+     */
+    std::vector<std::pair<double, std::size_t>>
+    init_bucket_partners_(const std::vector<PrecisionT> &probs) {
+        const std::size_t n_probs = probs.size();
+        std::vector<std::pair<double, std::size_t>> bucket_partners(
+            n_probs, {0.0, std::numeric_limits<std::size_t>::max()});
+        std::stack<std::size_t> underfull_bucket_ids;
+        std::stack<std::size_t> overfull_bucket_ids;
+
+        for (std::size_t i = 0; i != n_probs; ++i) {
+            bucket_partners[i].first = n_probs * probs[i];
+            if (bucket_partners[i].first < 1.0) {
+                underfull_bucket_ids.push(i);
+            } else {
+                overfull_bucket_ids.push(i);
+            }
+        }
+
+        while (not(underfull_bucket_ids.empty()) and
+               not(overfull_bucket_ids.empty())) {
+            auto i = overfull_bucket_ids.top();
+            auto j = underfull_bucket_ids.top();
+            underfull_bucket_ids.pop(), overfull_bucket_ids.pop();
+            bucket_partners[j].second = i;
+            bucket_partners[i].first -= (1.0 - bucket_partners[j].first);
+
+            if (bucket_partners[i].first < 1.0) {
+                underfull_bucket_ids.push(i);
+            } else {
+                overfull_bucket_ids.push(i);
+            }
+        }
+
+        return bucket_partners;
+    }
+};
+
 /**
  * @brief Probabilities for a subset of the full system.
  *
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
index 19c0ba7035..b23c75bbc8 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
@@ -167,7 +167,7 @@ class Measurements final
      *
      * @return Floating point std::vector with probabilities.
      */
-    auto probs(size_t num_shots) -> std::vector<PrecisionT> {
+    auto probs(std::size_t num_shots) -> std::vector<PrecisionT> {
         return BaseType::probs(num_shots);
     }
 
@@ -250,7 +250,7 @@ class Measurements final
                 const index_type *entries_ptr, const ComplexT *values_ptr,
                 const index_type numNNZ) -> PrecisionT {
         PL_ABORT_IF(
-            (this->_statevector.getLength() != (size_t(row_map_size) - 1)),
+            (this->_statevector.getLength() != (std::size_t(row_map_size) - 1)),
             "Statevector and Hamiltonian have incompatible sizes.");
         auto operator_vector = Util::apply_Sparse_Matrix(
             this->_statevector.getData(),
@@ -281,7 +281,7 @@ class Measurements final
             "The lengths of the list of operations and wires do not match.");
         std::vector<PrecisionT> expected_value_list;
 
-        for (size_t index = 0; index < operations_list.size(); index++) {
+        for (std::size_t index = 0; index < operations_list.size(); index++) {
             expected_value_list.emplace_back(
                 expval(operations_list[index], wires_list[index]));
         }
@@ -453,7 +453,7 @@ class Measurements final
 
         std::vector<PrecisionT> expected_value_list;
 
-        for (size_t index = 0; index < operations_list.size(); index++) {
+        for (std::size_t index = 0; index < operations_list.size(); index++) {
             expected_value_list.emplace_back(
                 var(operations_list[index], wires_list[index]));
         }
@@ -480,7 +480,7 @@ class Measurements final
         std::size_t num_qubits = this->_statevector.getNumQubits();
         std::uniform_real_distribution<PrecisionT> distrib(0.0, 1.0);
         std::vector<std::size_t> samples(num_samples * num_qubits, 0);
-        std::unordered_map<size_t, std::size_t> cache;
+        std::unordered_map<std::size_t, std::size_t> cache;
         this->setRandomSeed();
 
         TransitionKernelType transition_kernel = TransitionKernelType::Local;
@@ -494,13 +494,13 @@ class Measurements final
         std::size_t idx = 0;
 
         // Burn In
-        for (size_t i = 0; i < num_burnin; i++) {
+        for (std::size_t i = 0; i < num_burnin; i++) {
             idx = metropolis_step(this->_statevector, tk, this->rng, distrib,
                                   idx); // Burn-in.
         }
 
         // Sample
-        for (size_t i = 0; i < num_samples; i++) {
+        for (std::size_t i = 0; i < num_samples; i++) {
             idx = metropolis_step(this->_statevector, tk, this->rng, distrib,
                                   idx);
 
@@ -513,7 +513,7 @@ class Measurements final
 
             // If not cached, compute
             else {
-                for (size_t j = 0; j < num_qubits; j++) {
+                for (std::size_t j = 0; j < num_qubits; j++) {
                     samples[i * num_qubits + (num_qubits - 1 - j)] =
                         (idx >> j) & 1U;
                 }
@@ -543,7 +543,7 @@ class Measurements final
                    const index_type *entries_ptr, const ComplexT *values_ptr,
                    const index_type numNNZ) {
         PL_ABORT_IF(
-            (this->_statevector.getLength() != (size_t(row_map_size) - 1)),
+            (this->_statevector.getLength() != (std::size_t(row_map_size) - 1)),
             "Statevector and Hamiltonian have incompatible sizes.");
         auto operator_vector = Util::apply_Sparse_Matrix(
             this->_statevector.getData(),
@@ -569,79 +569,34 @@ class Measurements final
      * @return 1-D vector of samples in binary, each sample is
      * separated by a stride equal to the number of qubits.
      */
-    std::vector<std::size_t> generate_samples(size_t num_samples) {
+    std::vector<std::size_t> generate_samples(const std::size_t num_samples) {
         const std::size_t num_qubits = this->_statevector.getNumQubits();
-        auto &&probabilities = probs();
+        std::vector<std::size_t> wires(num_qubits);
+        std::iota(wires.begin(), wires.end(), 0);
+        return generate_samples(wires, num_samples);
+    }
 
-        std::vector<std::size_t> samples(num_samples * num_qubits, 0);
-        std::uniform_real_distribution<PrecisionT> distribution(0.0, 1.0);
-        std::unordered_map<size_t, std::size_t> cache;
+    /**
+     * @brief Generate samples.
+     *
+     * @param wires Sample are generated for the specified wires.
+     * @param num_samples The number of samples to generate.
+     * @return 1-D vector of samples in binary, each sample is
+     * separated by a stride equal to the number of qubits.
+     */
+    std::vector<std::size_t>
+    generate_samples(const std::vector<std::size_t> &wires,
+                     const std::size_t num_samples) {
+        const std::size_t num_wires = wires.size();
+        const std::vector<PrecisionT> probabilities = probs(wires);
         this->setRandomSeed();
+        discrete_random_variable<PrecisionT> drv{this->rng, probabilities};
 
-        const std::size_t N = probabilities.size();
-        std::vector<double> bucket(N);
-        std::vector<std::size_t> bucket_partner(N);
-        std::stack<std::size_t> overfull_bucket_ids;
-        std::stack<std::size_t> underfull_bucket_ids;
-
-        for (size_t i = 0; i < N; i++) {
-            bucket[i] = N * probabilities[i];
-            bucket_partner[i] = i;
-            if (bucket[i] > 1.0) {
-                overfull_bucket_ids.push(i);
-            }
-            if (bucket[i] < 1.0) {
-                underfull_bucket_ids.push(i);
-            }
-        }
-
-        // Run alias algorithm
-        while (!underfull_bucket_ids.empty() && !overfull_bucket_ids.empty()) {
-            // get an overfull bucket
-            std::size_t i = overfull_bucket_ids.top();
-
-            // get an underfull bucket
-            std::size_t j = underfull_bucket_ids.top();
-            underfull_bucket_ids.pop();
-
-            // underfull bucket is partned with an overfull bucket
-            bucket_partner[j] = i;
-            bucket[i] = bucket[i] + bucket[j] - 1;
-
-            // if overfull bucket is now underfull
-            // put in underfull stack
-            if (bucket[i] < 1) {
-                overfull_bucket_ids.pop();
-                underfull_bucket_ids.push(i);
-            }
-
-            // if overfull bucket is full -> remove
-            else if (bucket[i] == 1.0) {
-                overfull_bucket_ids.pop();
-            }
-        }
-
-        // Pick samples
-        for (size_t i = 0; i < num_samples; i++) {
-            PrecisionT pct = distribution(this->rng) * N;
-            auto idx = static_cast<std::size_t>(pct);
-            if (pct - idx > bucket[idx]) {
-                idx = bucket_partner[idx];
-            }
-            // If cached, retrieve sample from cache
-            if (cache.contains(idx)) {
-                std::size_t cache_id = cache[idx];
-                auto it_temp = samples.begin() + cache_id * num_qubits;
-                std::copy(it_temp, it_temp + num_qubits,
-                          samples.begin() + i * num_qubits);
-            }
-            // If not cached, compute
-            else {
-                for (size_t j = 0; j < num_qubits; j++) {
-                    samples[i * num_qubits + (num_qubits - 1 - j)] =
-                        (idx >> j) & 1U;
-                }
-                cache[idx] = i;
+        std::vector<std::size_t> samples(num_samples * num_wires, 0);
+        for (std::size_t i = 0; i < num_samples; i++) {
+            const auto idx = drv();
+            for (std::size_t j = 0; j < num_wires; j++) {
+                samples[i * num_wires + (num_wires - 1 - j)] = (idx >> j) & 1U;
             }
         }
         return samples;
diff --git a/pennylane_lightning/lightning_qubit/_measurements.py b/pennylane_lightning/lightning_qubit/_measurements.py
index 31a9f18179..ab78942006 100644
--- a/pennylane_lightning/lightning_qubit/_measurements.py
+++ b/pennylane_lightning/lightning_qubit/_measurements.py
@@ -411,7 +411,7 @@ def _process_single_shot(samples):
                         ).astype(int, copy=False)
                     else:
                         samples = self._measurement_lightning.generate_samples(
-                            len(wires), s
+                            list(wires), s
                         ).astype(int, copy=False)
                 except ValueError as e:
                     if str(e) != "probabilities contain NaN":
diff --git a/pennylane_lightning/lightning_qubit/_state_vector.py b/pennylane_lightning/lightning_qubit/_state_vector.py
index 46e22b0ca5..7307b35d0e 100644
--- a/pennylane_lightning/lightning_qubit/_state_vector.py
+++ b/pennylane_lightning/lightning_qubit/_state_vector.py
@@ -267,10 +267,10 @@ def _apply_lightning_midmeasure(
         """
         wires = self.wires.indices(operation.wires)
         wire = list(wires)[0]
-        circuit = QuantumScript([], [qml.sample(wires=operation.wires)], shots=1)
         if postselect_mode == "fill-shots" and operation.postselect is not None:
             sample = operation.postselect
         else:
+            circuit = QuantumScript([], [qml.sample(wires=operation.wires)], shots=1)
             sample = LightningMeasurements(self).measure_final_state(circuit)
             sample = np.squeeze(sample)
         mid_measurements[operation] = sample

From d98a90a12e699cb4fac335f79f6128cf1b375541 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Fri, 19 Jul 2024 18:55:41 +0000
Subject: [PATCH 02/18] Generate counts first, then samples.

---
 .../measurements/MeasurementsLQubit.hpp       | 42 +++++++++---
 .../tests/Test_MeasurementsLQubit.cpp         | 64 +------------------
 2 files changed, 34 insertions(+), 72 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
index b23c75bbc8..d66276a874 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
@@ -587,19 +587,43 @@ class Measurements final
     std::vector<std::size_t>
     generate_samples(const std::vector<std::size_t> &wires,
                      const std::size_t num_samples) {
-        const std::size_t num_wires = wires.size();
-        const std::vector<PrecisionT> probabilities = probs(wires);
+        const std::size_t n_wires = wires.size();
+        const std::vector<std::size_t> counts =
+            generate_counts(wires, num_samples);
+        std::vector<std::size_t> samples(num_samples * n_wires);
+        std::size_t cum_count{0};
+        for (std::size_t idx = 0; idx < counts.size(); idx++) {
+            const std::size_t count = counts[idx];
+            if (count == 0) {
+                continue;
+            }
+            for (std::size_t j = 0; j < n_wires; j++) {
+                samples[cum_count * n_wires + (n_wires - 1 - j)] =
+                    (idx >> j) & 1U;
+            }
+            const auto iterator = samples.begin() + cum_count * n_wires;
+            cum_count += 1;
+            for (std::size_t c = 1; c < count; c++) {
+                std::copy(iterator, iterator + n_wires,
+                          samples.begin() + cum_count * n_wires);
+                cum_count += 1;
+            }
+        }
+        return samples;
+    }
+
+    std::vector<std::size_t>
+    generate_counts(const std::vector<std::size_t> &wires,
+                    const std::size_t num_samples) {
+        const std::size_t n_wires = wires.size();
         this->setRandomSeed();
-        discrete_random_variable<PrecisionT> drv{this->rng, probabilities};
+        discrete_random_variable<PrecisionT> drv{this->rng, probs(wires)};
 
-        std::vector<std::size_t> samples(num_samples * num_wires, 0);
+        std::vector<std::size_t> counts(PUtil::exp2(n_wires), 0UL);
         for (std::size_t i = 0; i < num_samples; i++) {
-            const auto idx = drv();
-            for (std::size_t j = 0; j < num_wires; j++) {
-                samples[i * num_wires + (num_wires - 1 - j)] = (idx >> j) & 1U;
-            }
+            counts[drv()] += 1;
         }
-        return samples;
+        return counts;
     }
 
   private:
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/tests/Test_MeasurementsLQubit.cpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/tests/Test_MeasurementsLQubit.cpp
index 58abb81b51..7ffdd28a48 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/tests/Test_MeasurementsLQubit.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/tests/Test_MeasurementsLQubit.cpp
@@ -635,69 +635,7 @@ TEMPLATE_PRODUCT_TEST_CASE("Sample with Metropolis (Local Kernel)",
     std::size_t num_samples = 100000;
     std::size_t num_burnin = 1000;
 
-    std::string kernel = "Local";
-    auto &&samples =
-        Measurer.generate_samples_metropolis(kernel, num_burnin, num_samples);
-
-    std::vector<std::size_t> counts(N, 0);
-    std::vector<std::size_t> samples_decimal(num_samples, 0);
-
-    // convert samples to decimal and then bin them in counts
-    for (size_t i = 0; i < num_samples; i++) {
-        for (size_t j = 0; j < num_qubits; j++) {
-            if (samples[i * num_qubits + j] != 0) {
-                samples_decimal[i] += twos[(num_qubits - 1 - j)];
-            }
-        }
-        counts[samples_decimal[i]] += 1;
-    }
-
-    // compute estimated probabilities from histogram
-    std::vector<PrecisionT> probabilities(counts.size());
-    for (size_t i = 0; i < counts.size(); i++) {
-        probabilities[i] = counts[i] / (PrecisionT)num_samples;
-    }
-
-    // compare estimated probabilities to real probabilities
-    SECTION("No wires provided:") {
-        CHECK_THAT(probabilities,
-                   Catch::Approx(expected_probabilities).margin(.05));
-    }
-}
-
-TEMPLATE_PRODUCT_TEST_CASE("Sample with Metropolis (NonZeroRandom Kernel)",
-                           "[Measurements][MCMC]",
-                           (StateVectorLQubitManaged, StateVectorLQubitRaw),
-                           (float, double)) {
-    using StateVectorT = TestType;
-    using PrecisionT = typename StateVectorT::PrecisionT;
-
-    constexpr uint32_t twos[] = {
-        1U << 0U,  1U << 1U,  1U << 2U,  1U << 3U,  1U << 4U,  1U << 5U,
-        1U << 6U,  1U << 7U,  1U << 8U,  1U << 9U,  1U << 10U, 1U << 11U,
-        1U << 12U, 1U << 13U, 1U << 14U, 1U << 15U, 1U << 16U, 1U << 17U,
-        1U << 18U, 1U << 19U, 1U << 20U, 1U << 21U, 1U << 22U, 1U << 23U,
-        1U << 24U, 1U << 25U, 1U << 26U, 1U << 27U, 1U << 28U, 1U << 29U,
-        1U << 30U, 1U << 31U};
-
-    // Defining the State Vector that will be measured.
-    auto statevector_data = createNonTrivialState<StateVectorT>();
-    StateVectorT statevector(statevector_data.data(), statevector_data.size());
-
-    // Initializing the measurements class.
-    // This object attaches to the statevector allowing several measurements.
-    Measurements<StateVectorT> Measurer(statevector);
-
-    std::vector<PrecisionT> expected_probabilities = {
-        0.67078706, 0.03062806, 0.0870997,  0.00397696,
-        0.17564072, 0.00801973, 0.02280642, 0.00104134};
-
-    std::size_t num_qubits = 3;
-    std::size_t N = std::pow(2, num_qubits);
-    std::size_t num_samples = 100000;
-    std::size_t num_burnin = 1000;
-
-    const std::string kernel = "NonZeroRandom";
+    const std::string kernel = GENERATE("Local", "NonZeroRandom");
     auto &&samples =
         Measurer.generate_samples_metropolis(kernel, num_burnin, num_samples);
 

From 33d2d7a2628c01fbfe39fa65cc912f3063bcd9c8 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Fri, 19 Jul 2024 19:48:19 +0000
Subject: [PATCH 03/18] Add python tests.

---
 tests/test_measurements.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/tests/test_measurements.py b/tests/test_measurements.py
index 927354dd19..04972d9780 100644
--- a/tests/test_measurements.py
+++ b/tests/test_measurements.py
@@ -665,6 +665,37 @@ def test_sample_values(self, qubit_device, tol):
         # they square to 1
         assert np.allclose(s1**2, 1, atol=tol, rtol=0)
 
+    @pytest.mark.parametrize("seed", range(0, 10))
+    @pytest.mark.parametrize("nwires", range(1, 11))
+    def test_sample_variations(self, qubit_device, nwires, seed):
+        """Tests if `sample(wires)` returns correct statistics."""
+        shots = 100000
+        n_qubits = max(5, nwires + 1)
+        np.random.seed(seed)
+        wires = qml.wires.Wires(np.random.permutation(nwires))
+        state = np.random.rand(2**n_qubits) + 1j * np.random.rand(2**n_qubits)
+        state[np.random.randint(0, 2**n_qubits, 1)] += state.size / 10
+        state /= np.linalg.norm(state)
+        ops = [qml.StatePrep(state, wires=range(n_qubits))]
+        tape = qml.tape.QuantumScript(ops, [qml.sample(wires=wires)], shots=shots)
+
+        def reshape_samples(samples):
+            return np.atleast_3d(samples) if len(wires) == 1 else np.atleast_2d(samples)
+
+        dev = qubit_device(wires=n_qubits, shots=shots)
+        samples = dev.execute(tape)
+        probs = qml.measurements.ProbabilityMP(wires=wires).process_samples(
+            reshape_samples(samples), wire_order=wires
+        )
+
+        dev = qml.device("default.qubit", wires=n_qubits, shots=shots)
+        samples = dev.execute(tape)
+        ref = qml.measurements.ProbabilityMP(wires=wires).process_samples(
+            reshape_samples(samples), wire_order=wires
+        )
+
+        assert np.allclose(probs, ref, atol=1.0e-2, rtol=1.0e-4)
+
 
 @pytest.mark.skipif(
     device_name == "lightning.tensor",

From 3b3a7cd56b7441873794e3d6ff6f215ca2672717 Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Fri, 19 Jul 2024 19:49:23 +0000
Subject: [PATCH 04/18] Auto update version from '0.38.0-dev11' to
 '0.38.0-dev12'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 2aa597931d..52575c8282 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.38.0-dev11"
+__version__ = "0.38.0-dev12"

From 38ad51c3639015d79097adc447e96762df060d8b Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Fri, 19 Jul 2024 20:21:32 +0000
Subject: [PATCH 05/18] Update changelog and fix tidy errors.

---
 .github/CHANGELOG.md                          |  3 +++
 .../core/src/bindings/Bindings.hpp            | 21 +------------------
 .../bindings/LQubitBindings.hpp               | 21 +++++++++++++++++++
 .../measurements/MeasurementKernels.hpp       | 13 ++++++------
 .../measurements/MeasurementsLQubit.hpp       | 16 ++++++++++++--
 .../lightning_qubit/_measurements.py          |  2 +-
 6 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 4ee8cd92eb..df77415acf 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -15,6 +15,9 @@
 
 ### Improvements
 
+* Add `sample(wires)` support in LightningQubit.
+  [(#809)](https://github.com/PennyLaneAI/pennylane-lightning/pull/809)
+
 * Add GPU device compute capability check for Lightning-Tensor.
   [(#803)](https://github.com/PennyLaneAI/pennylane-lightning/pull/803)
 
diff --git a/pennylane_lightning/core/src/bindings/Bindings.hpp b/pennylane_lightning/core/src/bindings/Bindings.hpp
index 6ab4da7472..5f0786a31c 100644
--- a/pennylane_lightning/core/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/core/src/bindings/Bindings.hpp
@@ -492,26 +492,7 @@ void registerBackendAgnosticMeasurements(PyClass &pyclass) {
                      shape,  /* shape of the matrix       */
                      strides /* strides for each axis     */
                      ));
-             })
-        .def("generate_samples", [](Measurements<StateVectorT> &M,
-                                    const std::vector<std::size_t> &wires,
-                                    const std::size_t num_shots) {
-            const std::size_t num_wires = wires.size();
-            auto &&result = M.generate_samples(wires, num_shots);
-            const std::size_t ndim = 2;
-            const std::vector<std::size_t> shape{num_shots, num_wires};
-            constexpr auto sz = sizeof(std::size_t);
-            const std::vector<std::size_t> strides{sz * num_wires, sz};
-            // return 2-D NumPy array
-            return py::array(py::buffer_info(
-                result.data(), /* data as contiguous array  */
-                sz,            /* size of one scalar        */
-                py::format_descriptor<std::size_t>::format(), /* data type */
-                ndim,   /* number of dimensions      */
-                shape,  /* shape of the matrix       */
-                strides /* strides for each axis     */
-                ));
-        });
+             });
 }
 
 /**
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/bindings/LQubitBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/bindings/LQubitBindings.hpp
index 9bf0f659c8..f6bf6c0dbf 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/bindings/LQubitBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/bindings/LQubitBindings.hpp
@@ -292,6 +292,27 @@ void registerBackendSpecificMeasurements(PyClass &pyclass) {
                     static_cast<sparse_index_type>(values.request().size));
             },
             "Variance of a sparse Hamiltonian.")
+        .def("generate_samples",
+             [](Measurements<StateVectorT> &M,
+                const std::vector<std::size_t> &wires,
+                const std::size_t num_shots) {
+                 const std::size_t num_wires = wires.size();
+                 auto &&result = M.generate_samples(wires, num_shots);
+                 const std::size_t ndim = 2;
+                 const std::vector<std::size_t> shape{num_shots, num_wires};
+                 constexpr auto sz = sizeof(std::size_t);
+                 const std::vector<std::size_t> strides{sz * num_wires, sz};
+                 // return 2-D NumPy array
+                 return py::array(py::buffer_info(
+                     result.data(), /* data as contiguous array  */
+                     sz,            /* size of one scalar        */
+                     py::format_descriptor<std::size_t>::format(), /* data type
+                                                                    */
+                     ndim,   /* number of dimensions      */
+                     shape,  /* shape of the matrix       */
+                     strides /* strides for each axis     */
+                     ));
+             })
         .def("generate_mcmc_samples", [](Measurements<StateVectorT> &M,
                                          std::size_t num_wires,
                                          const std::string &kernelname,
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
index 9b33d0451b..fa4ff45d31 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
@@ -309,6 +309,8 @@ namespace Pennylane::LightningQubit::Measures {
  */
 template <typename PrecisionT> class discrete_random_variable {
   private:
+    static constexpr std::size_t default_index =
+        std::numeric_limits<std::size_t>::max();
     const std::vector<std::pair<double, std::size_t>> bucket_partners_;
     std::mt19937 &gen_;
     const std::size_t n_probs;
@@ -331,15 +333,12 @@ template <typename PrecisionT> class discrete_random_variable {
      * @brief Return a discrete random value.
      */
     std::size_t operator()() const {
-        const std::size_t idx =
-            static_cast<std::size_t>(distribution(gen_) * n_probs);
+        const auto idx = static_cast<std::size_t>(distribution(gen_) * n_probs);
         if (distribution(gen_) >= bucket_partners_[idx].first and
-            bucket_partners_[idx].second !=
-                std::numeric_limits<std::size_t>::max()) {
+            bucket_partners_[idx].second != default_index) {
             return bucket_partners_[idx].second;
-        } else {
-            return idx;
         }
+        return idx;
     }
 
   private:
@@ -350,7 +349,7 @@ template <typename PrecisionT> class discrete_random_variable {
     init_bucket_partners_(const std::vector<PrecisionT> &probs) {
         const std::size_t n_probs = probs.size();
         std::vector<std::pair<double, std::size_t>> bucket_partners(
-            n_probs, {0.0, std::numeric_limits<std::size_t>::max()});
+            n_probs, {0.0, default_index});
         std::stack<std::size_t> underfull_bucket_ids;
         std::stack<std::size_t> overfull_bucket_ids;
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
index d66276a874..f89212a3ba 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
@@ -588,11 +588,23 @@ class Measurements final
     generate_samples(const std::vector<std::size_t> &wires,
                      const std::size_t num_samples) {
         const std::size_t n_wires = wires.size();
+        const std::size_t n_counts = PUtil::exp2(n_wires);
+        std::vector<std::size_t> samples(num_samples * n_wires);
+        if (n_counts > num_samples) {
+            this->setRandomSeed();
+            discrete_random_variable<PrecisionT> drv{this->rng, probs(wires)};
+            for (std::size_t s = 0; s < num_samples; s++) {
+                const std::size_t idx = drv();
+                for (std::size_t j = 0; j < n_wires; j++) {
+                    samples[s * n_wires + (n_wires - 1 - j)] = (idx >> j) & 1U;
+                }
+            }
+            return samples;
+        }
         const std::vector<std::size_t> counts =
             generate_counts(wires, num_samples);
-        std::vector<std::size_t> samples(num_samples * n_wires);
         std::size_t cum_count{0};
-        for (std::size_t idx = 0; idx < counts.size(); idx++) {
+        for (std::size_t idx = 0; idx < n_counts; idx++) {
             const std::size_t count = counts[idx];
             if (count == 0) {
                 continue;
diff --git a/pennylane_lightning/lightning_qubit/_measurements.py b/pennylane_lightning/lightning_qubit/_measurements.py
index ab78942006..644783c926 100644
--- a/pennylane_lightning/lightning_qubit/_measurements.py
+++ b/pennylane_lightning/lightning_qubit/_measurements.py
@@ -429,7 +429,7 @@ def _process_single_shot(samples):
                 ).astype(int, copy=False)
             else:
                 samples = self._measurement_lightning.generate_samples(
-                    len(wires), shots.total_shots
+                    list(wires), shots.total_shots
                 ).astype(int, copy=False)
         except ValueError as e:
             if str(e) != "probabilities contain NaN":

From e2b97a2bafd0fc29cc7e7f7d9cef5a7fb1ac7728 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Fri, 19 Jul 2024 20:52:28 +0000
Subject: [PATCH 06/18] revert bindings.hpp

---
 .../core/src/bindings/Bindings.hpp            | 43 +++++++++----------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/pennylane_lightning/core/src/bindings/Bindings.hpp b/pennylane_lightning/core/src/bindings/Bindings.hpp
index 5f0786a31c..fe72f1bb86 100644
--- a/pennylane_lightning/core/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/core/src/bindings/Bindings.hpp
@@ -189,7 +189,7 @@ auto alignedNumpyArray(CPUMemoryModel memory_model, std::size_t size,
  * @param size Size of the array to create
  * @param dt Pybind11's datatype object
  */
-auto allocateAlignedArray(std::size_t size, const py::dtype &dt,
+auto allocateAlignedArray(size_t size, const py::dtype &dt,
                           bool zeroInit = false) -> py::array {
     // TODO: Move memset operations to here to reduce zeroInit pass-throughs.
     auto memory_model = bestCPUMemoryModel();
@@ -474,25 +474,24 @@ void registerBackendAgnosticMeasurements(PyClass &pyclass) {
                 return M.var(*ob);
             },
             "Variance of an observable object.")
-        .def("generate_samples",
-             [](Measurements<StateVectorT> &M, std::size_t num_wires,
-                std::size_t num_shots) {
-                 auto &&result = M.generate_samples(num_shots);
-                 const std::size_t ndim = 2;
-                 const std::vector<std::size_t> shape{num_shots, num_wires};
-                 constexpr auto sz = sizeof(std::size_t);
-                 const std::vector<std::size_t> strides{sz * num_wires, sz};
-                 // return 2-D NumPy array
-                 return py::array(py::buffer_info(
-                     result.data(), /* data as contiguous array  */
-                     sz,            /* size of one scalar        */
-                     py::format_descriptor<std::size_t>::format(), /* data type
-                                                                    */
-                     ndim,   /* number of dimensions      */
-                     shape,  /* shape of the matrix       */
-                     strides /* strides for each axis     */
-                     ));
-             });
+        .def("generate_samples", [](Measurements<StateVectorT> &M,
+                                    std::size_t num_wires,
+                                    std::size_t num_shots) {
+            auto &&result = M.generate_samples(num_shots);
+            const std::size_t ndim = 2;
+            const std::vector<std::size_t> shape{num_shots, num_wires};
+            constexpr auto sz = sizeof(size_t);
+            const std::vector<std::size_t> strides{sz * num_wires, sz};
+            // return 2-D NumPy array
+            return py::array(py::buffer_info(
+                result.data(), /* data as contiguous array  */
+                sz,            /* size of one scalar        */
+                py::format_descriptor<std::size_t>::format(), /* data type */
+                ndim,   /* number of dimensions      */
+                shape,  /* shape of the matrix       */
+                strides /* strides for each axis     */
+                ));
+        });
 }
 
 /**
@@ -560,7 +559,7 @@ void registerBackendAgnosticAlgorithms(py::module_ &m) {
         .def("__repr__", [](const OpsData<StateVectorT> &ops) {
             using namespace Pennylane::Util;
             std::ostringstream ops_stream;
-            for (std::size_t op = 0; op < ops.getSize(); op++) {
+            for (size_t op = 0; op < ops.getSize(); op++) {
                 ops_stream << "{'name': " << ops.getOpsName()[op];
                 ops_stream << ", 'params': " << ops.getOpsParams()[op];
                 ops_stream << ", 'inv': " << ops.getOpsInverses()[op];
@@ -592,7 +591,7 @@ void registerBackendAgnosticAlgorithms(py::module_ &m) {
            const std::vector<std::vector<bool>> &ops_controlled_values) {
             std::vector<std::vector<ComplexT>> conv_matrices(
                 ops_matrices.size());
-            for (std::size_t op = 0; op < ops_name.size(); op++) {
+            for (size_t op = 0; op < ops_name.size(); op++) {
                 const auto m_buffer = ops_matrices[op].request();
                 if (m_buffer.size) {
                     const auto m_ptr =

From 1fdd6f52db8c63798e1403ba395f6d2fbe3ade4f Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Fri, 19 Jul 2024 20:57:59 +0000
Subject: [PATCH 07/18] Auto update version from '0.38.0-dev12' to
 '0.38.0-dev13'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 52575c8282..c1ad83f4ad 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.38.0-dev12"
+__version__ = "0.38.0-dev13"

From 71636abaa391b83e0ef9fb828636f99d7328e881 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Mon, 22 Jul 2024 19:35:24 +0000
Subject: [PATCH 08/18] Simplify generate_samples

---
 .../measurements/MeasurementsLQubit.hpp       | 36 ++++---------------
 1 file changed, 6 insertions(+), 30 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
index b85176e1bd..af67ad0f5b 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
@@ -124,7 +124,7 @@ class Measurements final
 
         const ComplexT *arr_data = this->_statevector.getData();
 
-        // Templated 1-4 wire cases; return probs 
+        // Templated 1-4 wire cases; return probs
         PROBS_SPECIAL_CASE(1);
         PROBS_SPECIAL_CASE(2);
         PROBS_SPECIAL_CASE(3);
@@ -596,37 +596,13 @@ class Measurements final
     generate_samples(const std::vector<std::size_t> &wires,
                      const std::size_t num_samples) {
         const std::size_t n_wires = wires.size();
-        const std::size_t n_counts = PUtil::exp2(n_wires);
         std::vector<std::size_t> samples(num_samples * n_wires);
-        if (n_counts > num_samples) {
-            this->setRandomSeed();
-            discrete_random_variable<PrecisionT> drv{this->rng, probs(wires)};
-            for (std::size_t s = 0; s < num_samples; s++) {
-                const std::size_t idx = drv();
-                for (std::size_t j = 0; j < n_wires; j++) {
-                    samples[s * n_wires + (n_wires - 1 - j)] = (idx >> j) & 1U;
-                }
-            }
-            return samples;
-        }
-        const std::vector<std::size_t> counts =
-            generate_counts(wires, num_samples);
-        std::size_t cum_count{0};
-        for (std::size_t idx = 0; idx < n_counts; idx++) {
-            const std::size_t count = counts[idx];
-            if (count == 0) {
-                continue;
-            }
+        this->setRandomSeed();
+        discrete_random_variable<PrecisionT> drv{this->rng, probs(wires)};
+        for (std::size_t s = 0; s < num_samples; s++) {
+            const std::size_t idx = drv();
             for (std::size_t j = 0; j < n_wires; j++) {
-                samples[cum_count * n_wires + (n_wires - 1 - j)] =
-                    (idx >> j) & 1U;
-            }
-            const auto iterator = samples.begin() + cum_count * n_wires;
-            cum_count += 1;
-            for (std::size_t c = 1; c < count; c++) {
-                std::copy(iterator, iterator + n_wires,
-                          samples.begin() + cum_count * n_wires);
-                cum_count += 1;
+                samples[s * n_wires + (n_wires - 1 - j)] = (idx >> j) & 1U;
             }
         }
         return samples;

From 6825720851c3eef1d10a7ab473072a457f59a094 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Mon, 22 Jul 2024 20:01:32 +0000
Subject: [PATCH 09/18] Paralellize samples with OpenMP

---
 .../lightning_qubit/measurements/MeasurementsLQubit.hpp        | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
index af67ad0f5b..d04660f909 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
@@ -599,6 +599,9 @@ class Measurements final
         std::vector<std::size_t> samples(num_samples * n_wires);
         this->setRandomSeed();
         discrete_random_variable<PrecisionT> drv{this->rng, probs(wires)};
+#if defined PL_LQ_KERNEL_OMP && defined _OPENMP
+#pragma omp parallel for
+#endif
         for (std::size_t s = 0; s < num_samples; s++) {
             const std::size_t idx = drv();
             for (std::size_t j = 0; j < n_wires; j++) {

From 376968c95a89cecdc6d5a03e65e576d95ed547fc Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Mon, 22 Jul 2024 21:24:48 +0000
Subject: [PATCH 10/18] Remove generate_counts

---
 .../measurements/MeasurementsLQubit.hpp            | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
index d04660f909..fa03d9f899 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
@@ -611,20 +611,6 @@ class Measurements final
         return samples;
     }
 
-    std::vector<std::size_t>
-    generate_counts(const std::vector<std::size_t> &wires,
-                    const std::size_t num_samples) {
-        const std::size_t n_wires = wires.size();
-        this->setRandomSeed();
-        discrete_random_variable<PrecisionT> drv{this->rng, probs(wires)};
-
-        std::vector<std::size_t> counts(PUtil::exp2(n_wires), 0UL);
-        for (std::size_t i = 0; i < num_samples; i++) {
-            counts[drv()] += 1;
-        }
-        return counts;
-    }
-
   private:
     /**
      * @brief Support function that calculates <bra|obs|ket> to obtain the

From 7e7f19bdd6a146c74ea838177be21be3581356b0 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Tue, 23 Jul 2024 13:05:39 +0000
Subject: [PATCH 11/18] Couple fixes. [skip ci]

---
 .../measurements/MeasurementKernels.hpp       | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
index fa4ff45d31..b7c6377b71 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
@@ -334,7 +334,7 @@ template <typename PrecisionT> class discrete_random_variable {
      */
     std::size_t operator()() const {
         const auto idx = static_cast<std::size_t>(distribution(gen_) * n_probs);
-        if (distribution(gen_) >= bucket_partners_[idx].first and
+        if (distribution(gen_) >= bucket_partners_[idx].first &&
             bucket_partners_[idx].second != default_index) {
             return bucket_partners_[idx].second;
         }
@@ -353,8 +353,14 @@ template <typename PrecisionT> class discrete_random_variable {
         std::stack<std::size_t> underfull_bucket_ids;
         std::stack<std::size_t> overfull_bucket_ids;
 
-        for (std::size_t i = 0; i != n_probs; ++i) {
+#if defined PL_LQ_KERNEL_OMP && defined _OPENMP
+#pragma omp parallel for
+#endif
+        for (std::size_t i = 0; i < n_probs; ++i) {
             bucket_partners[i].first = n_probs * probs[i];
+        }
+
+        for (std::size_t i = 0; i < n_probs; ++i) {
             if (bucket_partners[i].first < 1.0) {
                 underfull_bucket_ids.push(i);
             } else {
@@ -362,13 +368,14 @@ template <typename PrecisionT> class discrete_random_variable {
             }
         }
 
-        while (not(underfull_bucket_ids.empty()) and
-               not(overfull_bucket_ids.empty())) {
+        while (!underfull_bucket_ids.empty() && !overfull_bucket_ids.empty()) {
             auto i = overfull_bucket_ids.top();
+            overfull_bucket_ids.pop();
             auto j = underfull_bucket_ids.top();
-            underfull_bucket_ids.pop(), overfull_bucket_ids.pop();
+            underfull_bucket_ids.pop();
+
             bucket_partners[j].second = i;
-            bucket_partners[i].first -= (1.0 - bucket_partners[j].first);
+            bucket_partners[i].first += bucket_partners[j].first - 1.0;
 
             if (bucket_partners[i].first < 1.0) {
                 underfull_bucket_ids.push(i);

From 04f1390bf38690e60901d77fbabdcdbc11f84e0f Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Tue, 23 Jul 2024 14:45:05 +0000
Subject: [PATCH 12/18] Remove OpenMP from sample loop.

---
 .../measurements/MeasurementKernels.hpp               | 11 -----------
 .../measurements/MeasurementsLQubit.hpp               |  3 ---
 2 files changed, 14 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
index b7c6377b71..495adf4706 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
@@ -18,11 +18,6 @@
  */
 #pragma once
 
-// #include <algorithm>
-// #include <functional>
-// #include <limits>
-// #include <random>
-
 #include <complex>
 #include <stack>
 #include <vector>
@@ -353,14 +348,8 @@ template <typename PrecisionT> class discrete_random_variable {
         std::stack<std::size_t> underfull_bucket_ids;
         std::stack<std::size_t> overfull_bucket_ids;
 
-#if defined PL_LQ_KERNEL_OMP && defined _OPENMP
-#pragma omp parallel for
-#endif
         for (std::size_t i = 0; i < n_probs; ++i) {
             bucket_partners[i].first = n_probs * probs[i];
-        }
-
-        for (std::size_t i = 0; i < n_probs; ++i) {
             if (bucket_partners[i].first < 1.0) {
                 underfull_bucket_ids.push(i);
             } else {
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
index fa03d9f899..a2773be2ff 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
@@ -599,9 +599,6 @@ class Measurements final
         std::vector<std::size_t> samples(num_samples * n_wires);
         this->setRandomSeed();
         discrete_random_variable<PrecisionT> drv{this->rng, probs(wires)};
-#if defined PL_LQ_KERNEL_OMP && defined _OPENMP
-#pragma omp parallel for
-#endif
         for (std::size_t s = 0; s < num_samples; s++) {
             const std::size_t idx = drv();
             for (std::size_t j = 0; j < n_wires; j++) {

From 843356b07b09ecfd6a5c7f5432d3d38bb8b39391 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
Date: Tue, 23 Jul 2024 18:00:33 -0400
Subject: [PATCH 13/18] Apply suggestions from code review

Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
---
 .github/CHANGELOG.md                                 |  2 +-
 .../measurements/MeasurementKernels.hpp              | 12 ++++++------
 .../measurements/MeasurementsLQubit.hpp              |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 4157e0d314..7f3a673a27 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -15,7 +15,7 @@
 
 ### Improvements
 
-* Add `sample(wires)` support in LightningQubit.
+* Add `generate_samples(wires)` support in LightningQubit.
   [(#809)](https://github.com/PennyLaneAI/pennylane-lightning/pull/809)
   
 * Optimize the OpenMP parallelization of Lightning-Qubit's `probs` for all number of targets.
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
index 495adf4706..c451af1554 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
@@ -302,24 +302,24 @@ namespace Pennylane::LightningQubit::Measures {
  *
  * @tparam PrecisionT Precision data type
  */
-template <typename PrecisionT> class discrete_random_variable {
+template <typename PrecisionT> class DiscreteRandomVariable {
   private:
     static constexpr std::size_t default_index =
         std::numeric_limits<std::size_t>::max();
     const std::vector<std::pair<double, std::size_t>> bucket_partners_;
     std::mt19937 &gen_;
-    const std::size_t n_probs;
-    mutable std::uniform_real_distribution<PrecisionT> distribution{0.0, 1.0};
+    const std::size_t _n_probs;
+    mutable std::uniform_real_distribution<PrecisionT> _distribution{0.0, 1.0};
 
   public:
     /**
-     * @brief Create a discrete_random_variable object.
+     * @brief Create a DiscreteRandomVariable object.
      *
      * @param gen Random number generator reference.
      * @param probs Probabilities for values 0 up to N - 1, where N =
      * probs.size().
      */
-    discrete_random_variable(std::mt19937 &gen,
+    DiscreteRandomVariable(std::mt19937 &gen,
                              const std::vector<PrecisionT> &probs)
         : bucket_partners_(init_bucket_partners_(probs)), gen_{gen},
           n_probs{probs.size()} {}
@@ -348,7 +348,7 @@ template <typename PrecisionT> class discrete_random_variable {
         std::stack<std::size_t> underfull_bucket_ids;
         std::stack<std::size_t> overfull_bucket_ids;
 
-        for (std::size_t i = 0; i < n_probs; ++i) {
+        for (std::size_t i = 0; i < n_probs; i++) {
             bucket_partners[i].first = n_probs * probs[i];
             if (bucket_partners[i].first < 1.0) {
                 underfull_bucket_ids.push(i);
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
index a2773be2ff..463577d7e0 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
@@ -598,7 +598,7 @@ class Measurements final
         const std::size_t n_wires = wires.size();
         std::vector<std::size_t> samples(num_samples * n_wires);
         this->setRandomSeed();
-        discrete_random_variable<PrecisionT> drv{this->rng, probs(wires)};
+        DiscreteRandomVariable<PrecisionT> drv{this->rng, probs(wires)};
         for (std::size_t s = 0; s < num_samples; s++) {
             const std::size_t idx = drv();
             for (std::size_t j = 0; j < n_wires; j++) {

From 5be3e2a74fa672c267bfcc898eca2cf5f93a0354 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Tue, 23 Jul 2024 22:46:40 +0000
Subject: [PATCH 14/18] Fix variable names.

---
 .../measurements/MeasurementKernels.hpp       | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
index c451af1554..8d9a2157fc 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
@@ -306,10 +306,10 @@ template <typename PrecisionT> class DiscreteRandomVariable {
   private:
     static constexpr std::size_t default_index =
         std::numeric_limits<std::size_t>::max();
-    const std::vector<std::pair<double, std::size_t>> bucket_partners_;
     std::mt19937 &gen_;
-    const std::size_t _n_probs;
-    mutable std::uniform_real_distribution<PrecisionT> _distribution{0.0, 1.0};
+    const std::size_t n_probs_;
+    const std::vector<std::pair<double, std::size_t>> bucket_partners_;
+    mutable std::uniform_real_distribution<PrecisionT> distribution_{0.0, 1.0};
 
   public:
     /**
@@ -320,16 +320,17 @@ template <typename PrecisionT> class DiscreteRandomVariable {
      * probs.size().
      */
     DiscreteRandomVariable(std::mt19937 &gen,
-                             const std::vector<PrecisionT> &probs)
-        : bucket_partners_(init_bucket_partners_(probs)), gen_{gen},
-          n_probs{probs.size()} {}
+                           const std::vector<PrecisionT> &probs)
+        : gen_{gen}, n_probs_{probs.size()},
+          bucket_partners_(init_bucket_partners_(probs)) {}
 
     /**
      * @brief Return a discrete random value.
      */
     std::size_t operator()() const {
-        const auto idx = static_cast<std::size_t>(distribution(gen_) * n_probs);
-        if (distribution(gen_) >= bucket_partners_[idx].first &&
+        const auto idx =
+            static_cast<std::size_t>(distribution_(gen_) * n_probs_);
+        if (distribution_(gen_) >= bucket_partners_[idx].first &&
             bucket_partners_[idx].second != default_index) {
             return bucket_partners_[idx].second;
         }
@@ -342,14 +343,13 @@ template <typename PrecisionT> class DiscreteRandomVariable {
      */
     std::vector<std::pair<double, std::size_t>>
     init_bucket_partners_(const std::vector<PrecisionT> &probs) {
-        const std::size_t n_probs = probs.size();
         std::vector<std::pair<double, std::size_t>> bucket_partners(
-            n_probs, {0.0, default_index});
+            n_probs_, {0.0, default_index});
         std::stack<std::size_t> underfull_bucket_ids;
         std::stack<std::size_t> overfull_bucket_ids;
 
-        for (std::size_t i = 0; i < n_probs; i++) {
-            bucket_partners[i].first = n_probs * probs[i];
+        for (std::size_t i = 0; i < n_probs_; ++i) {
+            bucket_partners[i].first = n_probs_ * probs[i];
             if (bucket_partners[i].first < 1.0) {
                 underfull_bucket_ids.push(i);
             } else {

From 87e43323bd887d9e85b074cf5691ea8d534a1e84 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Wed, 24 Jul 2024 12:24:02 +0000
Subject: [PATCH 15/18] Remove stack

---
 .../lightning_qubit/measurements/MeasurementsLQubit.hpp          | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
index 463577d7e0..b7a0b65db4 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
@@ -24,7 +24,6 @@
 #include <complex>
 #include <cstdio>
 #include <random>
-#include <stack>
 #include <type_traits>
 #include <unordered_map>
 #include <vector>

From 7e4071f045766cf76cbb01bfef683fd259cd2e60 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Wed, 24 Jul 2024 12:25:27 +0000
Subject: [PATCH 16/18] post-increment

---
 .../lightning_qubit/measurements/MeasurementKernels.hpp         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
index 8d9a2157fc..a22bc928b5 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
@@ -348,7 +348,7 @@ template <typename PrecisionT> class DiscreteRandomVariable {
         std::stack<std::size_t> underfull_bucket_ids;
         std::stack<std::size_t> overfull_bucket_ids;
 
-        for (std::size_t i = 0; i < n_probs_; ++i) {
+        for (std::size_t i = 0; i < n_probs_; i++) {
             bucket_partners[i].first = n_probs_ * probs[i];
             if (bucket_partners[i].first < 1.0) {
                 underfull_bucket_ids.push(i);

From 7fbc261e659de23693f70b6cb42f47d78840ddc7 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Wed, 24 Jul 2024 13:00:59 +0000
Subject: [PATCH 17/18] Add headers and comments.

---
 .../lightning_qubit/measurements/MeasurementKernels.hpp       | 2 ++
 .../lightning_qubit/measurements/MeasurementsLQubit.hpp       | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
index a22bc928b5..b401ee4234 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementKernels.hpp
@@ -19,7 +19,9 @@
 #pragma once
 
 #include <complex>
+#include <random>
 #include <stack>
+#include <utility>
 #include <vector>
 
 #include "BitUtil.hpp"
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
index b7a0b65db4..f01c2a3269 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/measurements/MeasurementsLQubit.hpp
@@ -598,6 +598,10 @@ class Measurements final
         std::vector<std::size_t> samples(num_samples * n_wires);
         this->setRandomSeed();
         DiscreteRandomVariable<PrecisionT> drv{this->rng, probs(wires)};
+        // The Python layer expects a 2D array with dimensions (n_samples x
+        // n_wires) and hence the linear index is `s * n_wires + (n_wires - 1 -
+        // j)` `s` being the "slow" row index and `j` being the "fast" column
+        // index
         for (std::size_t s = 0; s < num_samples; s++) {
             const std::size_t idx = drv();
             for (std::size_t j = 0; j < n_wires; j++) {

From 3a2a34132844717f41726de414d3b14ad52d0cf0 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Wed, 24 Jul 2024 14:32:27 +0000
Subject: [PATCH 18/18] Lower shots, increase tol.

---
 tests/test_measurements.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_measurements.py b/tests/test_measurements.py
index 04972d9780..68ce20e723 100644
--- a/tests/test_measurements.py
+++ b/tests/test_measurements.py
@@ -669,7 +669,7 @@ def test_sample_values(self, qubit_device, tol):
     @pytest.mark.parametrize("nwires", range(1, 11))
     def test_sample_variations(self, qubit_device, nwires, seed):
         """Tests if `sample(wires)` returns correct statistics."""
-        shots = 100000
+        shots = 20000
         n_qubits = max(5, nwires + 1)
         np.random.seed(seed)
         wires = qml.wires.Wires(np.random.permutation(nwires))
@@ -694,7 +694,7 @@ def reshape_samples(samples):
             reshape_samples(samples), wire_order=wires
         )
 
-        assert np.allclose(probs, ref, atol=1.0e-2, rtol=1.0e-4)
+        assert np.allclose(probs, ref, atol=2.0e-2, rtol=1.0e-4)
 
 
 @pytest.mark.skipif(