From 9a6722c643057965d7aa911d6779199781acfcbb Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sat, 26 Feb 2022 22:45:36 -0500
Subject: [PATCH 01/94] Managed to CPU

---
 Makefile                                      |   2 +-
 pennylane_lightning/src/.clang-tidy           |   6 +-
 .../src/algorithms/AdjointDiff.hpp            |  52 +--
 .../src/examples/CMakeLists.txt               |  11 +-
 .../src/examples/benchmark_gate_list.cpp      | 223 ++++++++++
 .../src/examples/benchmark_multi_rz.cpp       |   4 +-
 .../src/examples/benchmark_operation.cpp      | 210 +++++++++
 .../src/examples/gate_benchmark.cpp           |   2 -
 .../src/examples/run_gate_benchmark.sh        |   2 +-
 .../src/gates/AvailableKernels.hpp            |   8 +-
 pennylane_lightning/src/gates/Gates.hpp       |   2 +-
 .../GateImplementationsLM.hpp                 | 194 +++++++--
 .../GateImplementationsPI.hpp                 |  84 +++-
 .../{ => cpu_kernels}/PauliGenerator.hpp      |   0
 .../DefaultKernelsForStateVector.hpp          | 400 ++++++++++++++++++
 .../src/simulator/DispatchKeys.hpp            |  87 ++++
 .../src/simulator/DynamicDispatcher.cpp       |   8 +-
 .../src/simulator/DynamicDispatcher.hpp       | 147 ++-----
 .../src/simulator/Measures.hpp                |  10 +-
 .../src/simulator/StateVectorBase.hpp         |  48 ++-
 .../src/simulator/StateVectorCPU.hpp          | 166 ++++++++
 .../src/simulator/StateVectorManaged.hpp      | 104 -----
 pennylane_lightning/src/tests/.clang-tidy     |   4 +-
 pennylane_lightning/src/tests/CMakeLists.txt  |   7 +-
 .../src/tests/CreateAllWires.cpp              |  31 ++
 .../src/tests/CreateAllWires.hpp              |  92 ++++
 pennylane_lightning/src/tests/TestHelpers.hpp |  77 ++--
 pennylane_lightning/src/tests/TestKernels.hpp |  12 +-
 .../src/tests/Test_AdjDiff.cpp                |  37 +-
 .../Test_DefaultKernelsForStateVector.cpp     |  32 ++
 .../src/tests/Test_DynamicDispatcher.cpp      |   8 +-
 ...est_GateImplementations_CompareKernels.cpp | 185 ++++++++
 .../Test_GateImplementations_Generator.cpp    |   2 +-
 .../Test_GateImplementations_Inverse.cpp      |   2 +-
 .../Test_GateImplementations_Nonparam.cpp     | 117 ++---
 .../tests/Test_GateImplementations_Param.cpp  |  18 +-
 .../src/tests/Test_Internal.cpp               |  83 +++-
 .../src/tests/Test_Measures.cpp               |  22 +-
 .../src/tests/Test_StateVectorCPU.cpp         |  48 +++
 .../src/tests/Test_StateVectorManaged.cpp     |  50 ---
 pennylane_lightning/src/tests/Test_Util.cpp   |  15 +
 pennylane_lightning/src/util/BitUtil.hpp      |  27 +-
 .../src/util/LinearAlgebra.hpp                |  29 +-
 pennylane_lightning/src/util/Macros.hpp       |  70 ++-
 pennylane_lightning/src/util/Memory.hpp       | 106 +++++
 pennylane_lightning/src/util/TypeList.hpp     |  34 +-
 pennylane_lightning/src/util/Util.hpp         |  35 ++
 47 files changed, 2381 insertions(+), 532 deletions(-)
 create mode 100644 pennylane_lightning/src/examples/benchmark_gate_list.cpp
 create mode 100644 pennylane_lightning/src/examples/benchmark_operation.cpp
 rename pennylane_lightning/src/gates/{ => cpu_kernels}/GateImplementationsLM.hpp (87%)
 rename pennylane_lightning/src/gates/{ => cpu_kernels}/GateImplementationsPI.hpp (91%)
 rename pennylane_lightning/src/gates/{ => cpu_kernels}/PauliGenerator.hpp (100%)
 create mode 100644 pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
 create mode 100644 pennylane_lightning/src/simulator/DispatchKeys.hpp
 create mode 100644 pennylane_lightning/src/simulator/StateVectorCPU.hpp
 delete mode 100644 pennylane_lightning/src/simulator/StateVectorManaged.hpp
 create mode 100644 pennylane_lightning/src/tests/CreateAllWires.cpp
 create mode 100644 pennylane_lightning/src/tests/CreateAllWires.hpp
 create mode 100644 pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp
 create mode 100644 pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
 create mode 100644 pennylane_lightning/src/tests/Test_StateVectorCPU.cpp
 delete mode 100644 pennylane_lightning/src/tests/Test_StateVectorManaged.cpp
 create mode 100644 pennylane_lightning/src/util/Memory.hpp

diff --git a/Makefile b/Makefile
index 02556dc3e0..edef79bab5 100644
--- a/Makefile
+++ b/Makefile
@@ -75,7 +75,7 @@ coverage:
 
 test-cpp:
 	rm -rf ./BuildTests
-	cmake $(LIGHTNING_CPP_DIR) -BBuildTests -DBUILD_TESTS=ON
+	cmake $(LIGHTNING_CPP_DIR) -BBuildTests -DBUILD_TESTS=ON -DENABLE_OPENMP=OFF
 	cmake --build ./BuildTests --target runner
 	cmake --build ./BuildTests --target test
 
diff --git a/pennylane_lightning/src/.clang-tidy b/pennylane_lightning/src/.clang-tidy
index f015b16a1d..e1fce11707 100644
--- a/pennylane_lightning/src/.clang-tidy
+++ b/pennylane_lightning/src/.clang-tidy
@@ -1,5 +1,5 @@
 ---
-Checks:          'clang-diagnostic-*,clang-analyzer-*,-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,hicpp-*,-hicpp-no-array-decay,bugprone-suspicious-*,llvm-namespace-comment,'
+Checks:          'clang-diagnostic-*,clang-analyzer-*,-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,hicpp-*,-hicpp-avoid-c-arrays,-hicpp-no-array-decay,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-avoid-non-const-global-variables,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions'
 WarningsAsErrors: '*'
 HeaderFilterRegex: '.*'
 AnalyzeTemporaryDtors: false
@@ -25,8 +25,6 @@ CheckOptions:
     value:           'false'
   - key:             readability-magic-numbers.IgnoredIntegerValues
     value:           '1;2;3;4;'
-  - key:             readability-magic-numbers.IgnorePowersOf2IntegerValues
-    value:           true
   - key:             modernize-use-default-member-init.UseAssignment
     value:           'false'
   - key:             readability-function-size.NestingThreshold
@@ -218,7 +216,7 @@ CheckOptions:
   - key:             modernize-use-auto.RemoveStars
     value:           'false'
   - key:             readability-magic-numbers.IgnorePowersOf2IntegerValues
-    value:           'false'
+    value:           'true'
   - key:             portability-simd-intrinsics.Std
     value:           ''
   - key:             readability-redundant-member-init.IgnoreBaseInCopyConstructors
diff --git a/pennylane_lightning/src/algorithms/AdjointDiff.hpp b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
index 1d84d139b6..9b69139260 100644
--- a/pennylane_lightning/src/algorithms/AdjointDiff.hpp
+++ b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
@@ -22,11 +22,12 @@
 #include <variant>
 #include <vector>
 
+#include "DispatchKeys.hpp"
 #include "DynamicDispatcher.hpp"
 #include "Error.hpp"
 #include "JacobianTape.hpp"
 #include "LinearAlgebra.hpp"
-#include "StateVectorManaged.hpp"
+#include "StateVectorCPU.hpp"
 
 #include <iostream>
 
@@ -48,7 +49,7 @@ namespace Pennylane::Algorithms {
  */
 template <class T = double> class AdjointJacobian {
   private:
-    using GeneratorFunc = void (*)(StateVectorManaged<T> &,
+    using GeneratorFunc = void (*)(StateVectorCPU<T> &,
                                    const std::vector<size_t> &,
                                    const bool); // function pointer type
 
@@ -63,25 +64,26 @@ template <class T = double> class AdjointJacobian {
      * @param obs_index Observable index position of Jacobian to update.
      * @param param_index Parameter index position of Jacobian to update.
      */
-    inline void updateJacobian(const StateVectorManaged<T> &sv1,
-                               const StateVectorManaged<T> &sv2,
+    inline void updateJacobian(const StateVectorCPU<T> &sv1,
+                               const StateVectorCPU<T> &sv2,
                                std::vector<std::vector<T>> &jac,
                                T scaling_coeff, size_t obs_index,
                                size_t param_index) {
         jac[obs_index][param_index] =
             -2 * scaling_coeff *
-            std::imag(innerProdC(sv1.getDataVector(), sv2.getDataVector()));
+            std::imag(
+                innerProdC(sv1.getData(), sv2.getData(), sv1.getLength()));
     }
 
     /**
      * @brief Utility method to apply all operations from given `%OpsData<T>`
-     * object to `%StateVectorManaged<T>`
+     * object to `%StateVectorCPU<T>`
      *
      * @param state Statevector to be updated.
      * @param operations Operations to apply.
      * @param adj Take the adjoint of the given operations.
      */
-    inline void applyOperations(StateVectorManaged<T> &state,
+    inline void applyOperations(StateVectorCPU<T> &state,
                                 const OpsData<T> &operations,
                                 bool adj = false) {
         for (size_t op_idx = 0; op_idx < operations.getOpsName().size();
@@ -94,13 +96,13 @@ template <class T = double> class AdjointJacobian {
     }
     /**
      * @brief Utility method to apply the adjoint indexed operation from
-     * `%OpsData<T>` object to `%StateVectorManaged<T>`.
+     * `%OpsData<T>` object to `%StateVectorCPU<T>`.
      *
      * @param state Statevector to be updated.
      * @param operations Operations to apply.
      * @param op_idx Adjointed operation index to apply.
      */
-    inline void applyOperationAdj(StateVectorManaged<T> &state,
+    inline void applyOperationAdj(StateVectorCPU<T> &state,
                                   const OpsData<T> &operations, size_t op_idx) {
         state.applyOperation(operations.getOpsName()[op_idx],
                              operations.getOpsWires()[op_idx],
@@ -110,12 +112,12 @@ template <class T = double> class AdjointJacobian {
 
     /**
      * @brief Utility method to apply a given operations from given
-     * `%ObsDatum<T>` object to `%StateVectorManaged<T>`
+     * `%ObsDatum<T>` object to `%StateVectorCPU<T>`
      *
      * @param state Statevector to be updated.
      * @param observable Observable to apply.
      */
-    inline void applyObservable(StateVectorManaged<T> &state,
+    inline void applyObservable(StateVectorCPU<T> &state,
                                 const ObsDatum<T> &observable) {
         using namespace Pennylane::Util;
         for (size_t j = 0; j < observable.getSize(); j++) {
@@ -157,8 +159,8 @@ template <class T = double> class AdjointJacobian {
      * @param reference_state Reference statevector
      * @param observables Vector of observables to apply to each statevector.
      */
-    inline void applyObservables(std::vector<StateVectorManaged<T>> &states,
-                                 const StateVectorManaged<T> &reference_state,
+    inline void applyObservables(std::vector<StateVectorCPU<T>> &states,
+                                 const StateVectorCPU<T> &reference_state,
                                  const std::vector<ObsDatum<T>> &observables) {
         // clang-format off
         // Globally scoped exception value to be captured within OpenMP block.
@@ -174,7 +176,7 @@ template <class T = double> class AdjointJacobian {
         #endif
             for (size_t h_i = 0; h_i < num_observables; h_i++) {
                 try {
-                    states[h_i].updateData(reference_state.getDataVector());
+                    states[h_i].updateData(reference_state.getData());
                     applyObservable(states[h_i], observables[h_i]);
                 } catch (...) {
                     #if defined(_OPENMP)
@@ -207,7 +209,7 @@ template <class T = double> class AdjointJacobian {
      * @param op_idx Index of given operation within operations list to take
      * adjoint of.
      */
-    inline void applyOperationsAdj(std::vector<StateVectorManaged<T>> &states,
+    inline void applyOperationsAdj(std::vector<StateVectorCPU<T>> &states,
                                    const OpsData<T> &operations,
                                    size_t op_idx) {
         // clang-format off
@@ -298,7 +300,7 @@ template <class T = double> class AdjointJacobian {
      * of parametric gates.
      *
      * For the statevector data associated with `psi` of length `num_elements`,
-     * we make internal copies to a `%StateVectorManaged<T>` object, with one
+     * we make internal copies to a `%StateVectorCPU<T>` object, with one
      * per required observable. The `operations` will be applied to the internal
      * statevector copies, with the operation indices participating in the
      * gradient calculations given in `trainableParams`, and the overall number
@@ -333,7 +335,7 @@ template <class T = double> class AdjointJacobian {
             num_param_ops - 1; // total number of parametric ops
 
         // Create $U_{1:p}\vert \lambda \rangle$
-        StateVectorManaged<T> lambda(jd.getPtrStateVec(), jd.getSizeStateVec());
+        StateVectorCPU<T> lambda(jd.getPtrStateVec(), jd.getSizeStateVec());
 
         // Apply given operations to statevector if requested
         if (apply_operations) {
@@ -343,12 +345,14 @@ template <class T = double> class AdjointJacobian {
         const auto tp_begin = tp.begin();
         auto tp_it = tp.end();
 
+        StateVectorCPU<T> sv{lambda.getNumQubits(), Threading::SingleThread};
         // Create observable-applied state-vectors
-        std::vector<StateVectorManaged<T>> H_lambda(
-            num_observables, StateVectorManaged<T>{lambda.getNumQubits()});
+        std::vector<StateVectorCPU<T>> H_lambda(
+            num_observables,
+            StateVectorCPU<T>{lambda.getNumQubits(), Threading::SingleThread});
         applyObservables(H_lambda, lambda, obs);
 
-        StateVectorManaged<T> mu(lambda.getNumQubits());
+        StateVectorCPU<T> mu(lambda.getNumQubits());
 
         for (int op_idx = static_cast<int>(ops_name.size() - 1); op_idx >= 0;
              op_idx--) {
@@ -357,7 +361,7 @@ template <class T = double> class AdjointJacobian {
                         "differentiation method");
             if ((ops_name[op_idx] != "QubitStateVector") &&
                 (ops_name[op_idx] != "BasisState")) {
-                mu.updateData(lambda.getDataVector());
+                mu.updateData(lambda.getData());
                 applyOperationAdj(lambda, ops, op_idx);
 
                 if (ops.hasParams(op_idx)) {
@@ -387,9 +391,9 @@ template <class T = double> class AdjointJacobian {
                              obs_idx++) {
                             jac[mat_row_idx + obs_idx] =
                                 -2 * scalingFactor *
-                                std::imag(innerProdC(
-                                    H_lambda[obs_idx].getDataVector(),
-                                    mu.getDataVector()));
+                                std::imag(
+                                    innerProdC(H_lambda[obs_idx].getData(),
+                                               mu.getData(), mu.getLength()));
                         }
                         trainableParamNumber--;
                         std::advance(tp_it, -1);
diff --git a/pennylane_lightning/src/examples/CMakeLists.txt b/pennylane_lightning/src/examples/CMakeLists.txt
index 21bbe56c63..d58bcce5ba 100644
--- a/pennylane_lightning/src/examples/CMakeLists.txt
+++ b/pennylane_lightning/src/examples/CMakeLists.txt
@@ -21,14 +21,19 @@ target_link_libraries(lightning_examples INTERFACE lightning_compile_options
                                                    lightning_simulator
                                                    lightning_utils)
 
-add_executable(gate_benchmark_oplist gate_benchmark_oplist.cpp)
-target_link_libraries(gate_benchmark_oplist PRIVATE lightning_examples)
+add_executable(benchmark_operation benchmark_operation.cpp)
+target_link_libraries(benchmark_operation PRIVATE lightning_examples)
+
+add_executable(benchmark_operation_float benchmark_operation.cpp)
+target_compile_options(benchmark_operation_float PRIVATE "-DUSE_SINGLE_PRECISION")
+target_link_libraries(benchmark_operation_float PRIVATE lightning_examples)
+
 add_executable(benchmark_multi_rz benchmark_multi_rz.cpp)
 target_link_libraries(benchmark_multi_rz PRIVATE lightning_examples)
 
 configure_file("compiler_info.in" "compiler_info.txt")
 
-add_custom_command(TARGET gate_benchmark_oplist POST_BUILD 
+add_custom_command(TARGET benchmark_operation POST_BUILD 
                    COMMAND ${CMAKE_COMMAND} -E copy
                            ${PROJECT_SOURCE_DIR}/run_gate_benchmark.sh
                            ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/run_gate_benchmark.sh
diff --git a/pennylane_lightning/src/examples/benchmark_gate_list.cpp b/pennylane_lightning/src/examples/benchmark_gate_list.cpp
new file mode 100644
index 0000000000..5910ad0884
--- /dev/null
+++ b/pennylane_lightning/src/examples/benchmark_gate_list.cpp
@@ -0,0 +1,223 @@
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <iostream>
+#include <map>
+#include <random>
+#include <stdexcept>
+#include <string>
+
+#include "Constant.hpp"
+#include "ExampleUtil.hpp"
+#include "StateVectorManaged.hpp"
+
+using namespace Pennylane;
+using namespace Pennylane::Gates;
+using namespace Pennylane::Util;
+
+std::string_view strip(std::string_view str) {
+    // Trim spaces/tabs; all-blank input yields "" (substr(npos) would throw).
+    str.remove_prefix(std::min(str.find_first_not_of(" \t"), str.size()));
+    return str.substr(0, str.find_last_not_of(" \t") + 1); // npos + 1 == 0
+}
+
+struct GateDesc { // per-gate metadata collected by parseGateLists
+    size_t n_wires;  // number of wires the gate applies to
+    size_t n_params; // number of parameters the gate requires
+};
+
+std::vector<std::pair<std::string, GateDesc>>
+parseGateLists(std::string_view arg) {
+    // Parses "[A, B]" or "A, B" into (gate name, GateDesc) pairs. Returns an
+    // empty vector for empty input; throws std::invalid_argument when a '['
+    // is not followed by a closing ']' or a gate name is not in the table.
+    namespace Constant = Gates::Constant;
+    if (arg.empty()) {
+        return {};
+    }
+    // Table of selectable gates, keyed by gate name
+    std::map<std::string, GateDesc> available_gates_wires;
+    for (const auto &[gate_op, gate_name] : Constant::gate_names) {
+        if (!array_has_elt(Constant::multi_qubit_gates, gate_op)) {
+            // We do not support multi qubit gates yet
+            size_t n_wires = Util::lookup(Constant::gate_wires, gate_op);
+            size_t n_params = Util::lookup(Constant::gate_num_params, gate_op);
+            available_gates_wires.emplace(gate_name,
+                                          GateDesc{n_wires, n_params});
+        }
+    }
+    if (auto pos = arg.find_first_of('['); pos != std::string_view::npos) {
+        // arg is a list "[...]": keep only the text between the brackets
+        const auto start = pos + 1;
+        const auto end = arg.find_last_of(']');
+        if (end == std::string_view::npos || end < start) {
+            throw std::invalid_argument(
+                "Argument must contain operators within square brackets [].");
+        }
+        arg = arg.substr(start, end - start);
+    }
+
+    std::vector<std::pair<std::string, GateDesc>> ops;
+    size_t start = 0;
+    size_t end = 0;
+    while ((start = arg.find_first_not_of(',', end)) != std::string::npos) {
+        end = arg.find(',', start);
+        const auto token = arg.substr(start, end - start);
+        if (token.find_first_not_of(" \t") == std::string_view::npos) {
+            continue; // blank entry such as a trailing ", ": nothing to add
+        }
+        const auto op_name = strip(token);
+        const auto iter = available_gates_wires.find(std::string(op_name));
+        if (iter == available_gates_wires.end()) {
+            throw std::invalid_argument("Given gate " + std::string(op_name) +
+                                        " is not available");
+        }
+        ops.emplace_back(*iter);
+    }
+    return ops;
+}
+
+/**
+ * @brief Benchmark Pennylane-Lightning for a given gate set
+ *
+ * Usage: <binary> num_gate_reps num_qubits kernel [gate_lists]
+ *
+ *     $ <binary> 100 20 PI [PauliX, CNOT] # Benchmark using 100 random gates
+ * (where each gate is PauliX or CNOT) for 20 qubits with the PI kernel
+ *
+ * A non-empty gate list is required. The whole supported gates are PauliX,
+ * PauliY, PauliZ, Hadamard, S, T, RX, RY, RZ, Rot, PhaseShift, CNOT, SWAP,
+ * ControlledPhaseShift, CRX, CRY, CRZ, CRot, Toffoli and CSWAP.
+ *
+ * @param argc Number of arguments
+ * @param argv Command line arguments
+ * @return 0 if completed successfully, -1 on a wrong argument count or an
+ * invalid num_gate_reps/num_qubits, 1 on an unknown kernel or bad gate list
+ */
+int main(int argc, char *argv[]) {
+    using TestType = double;
+
+    // Handle input
+    if (argc < 4) {
+        std::cerr << "Wrong number of inputs. User provided " << argc - 1
+                  << " inputs. "
+                  << "Usage: " + std::string(argv[0]) +
+                         " num_gate_reps num_qubits kernel [gate_lists]\n"
+                         "\tExample: "
+                  << argv[0] << " 1000 10 PI [PauliX, CNOT]"
+                  << std::endl; // Change to std::format in C++20
+        return -1;
+    }
+
+    size_t num_gate_reps = 0;
+    size_t num_qubits = 0;
+    try {
+        // Parse as signed first: storing a negative std::stoi result straight
+        // into size_t would silently wrap to a huge count.
+        const int reps = std::stoi(argv[1]);
+        const int qubits = std::stoi(argv[2]);
+        if (reps <= 0 || qubits <= 0) {
+            throw std::invalid_argument("counts must be positive");
+        }
+        num_gate_reps = static_cast<size_t>(reps);
+        num_qubits = static_cast<size_t>(qubits);
+    } catch (const std::exception &) {
+        std::cerr << "Arguments num_gate_reps and num_qubits must be positive"
+                  << " integers." << std::endl;
+        return -1;
+    }
+
+    std::string_view kernel_name = argv[3];
+    KernelType kernel = string_to_kernel(kernel_name);
+    if (kernel == KernelType::None) {
+        std::cerr << "Kernel " << kernel_name << " is unknown." << std::endl;
+        return 1;
+    }
+
+    // Re-join the gate list arguments, appending one space after each
+    std::string op_list_s;
+    for (int idx = 4; idx < argc; idx++) {
+        op_list_s += argv[idx];
+        op_list_s += ' ';
+    }
+
+    std::vector<std::pair<std::string, GateDesc>> op_list;
+    try {
+        op_list = parseGateLists(op_list_s);
+    } catch (const std::exception &e) {
+        std::cerr << e.what() << std::endl;
+        return 1;
+    }
+
+    if (op_list.empty()) {
+        std::cerr << "Please provide a gate list." << std::endl;
+        return 1;
+    }
+
+    // Generate random gate sequences
+    std::random_device rd;
+    std::mt19937 re(rd());
+
+    std::vector<std::string_view> random_gate_names;
+    std::vector<std::vector<size_t>> random_gate_wires;
+    std::vector<bool> random_inverses;
+    std::vector<std::vector<TestType>> random_gate_parameters;
+
+    std::uniform_int_distribution<size_t> gate_dist(0, op_list.size() - 1);
+    std::uniform_int_distribution<size_t> inverse_dist(0, 1);
+    std::uniform_real_distribution<TestType> param_dist(0.0, 2 * M_PI);
+    auto gen_param = [&param_dist, &re]() { return param_dist(re); };
+
+    for (size_t k = 0; k < num_gate_reps; k++) {
+        const auto &[op_name, gate_desc] = op_list[gate_dist(re)];
+
+        std::vector<TestType> gate_params(gate_desc.n_params, 0.0);
+        std::generate(gate_params.begin(), gate_params.end(), gen_param);
+
+        random_gate_names.emplace_back(op_name);
+        random_inverses.emplace_back(static_cast<bool>(inverse_dist(re)));
+        // random_gate_wires.emplace_back(generateDistinctWires(re, num_qubits,
+        // gate_desc.n_wires));
+        random_gate_wires.emplace_back(
+            generateNeighboringWires(re, num_qubits, gate_desc.n_wires));
+        random_gate_parameters.emplace_back(std::move(gate_params));
+    }
+
+    // Log generated sequence if LOG is turned on
+    const char *env_p = std::getenv("LOG");
+    try {
+        if (env_p != nullptr && std::stoi(env_p) != 0) {
+            for (size_t gate_rep = 0; gate_rep < num_gate_reps; gate_rep++) {
+                std::cerr << random_gate_names[gate_rep] << ", "
+                          << random_gate_wires[gate_rep] << ", "
+                          << random_gate_parameters[gate_rep] << std::endl;
+            }
+        }
+    } catch (const std::exception &) {
+        // Just do not print log
+    }
+
+    // Run benchmark. Total num_gate_reps number of gates is used.
+    // TODO(review): StateVectorManaged is replaced by StateVectorCPU elsewhere
+    // in this change; confirm this benchmark still builds against it.
+    Pennylane::StateVectorManaged<TestType> svdat{num_qubits};
+    const auto t_start = std::chrono::high_resolution_clock::now();
+
+    for (size_t gate_rep = 0; gate_rep < num_gate_reps; gate_rep++) {
+        svdat.applyOperation(kernel, std::string(random_gate_names[gate_rep]),
+                             random_gate_wires[gate_rep],
+                             random_inverses[gate_rep],
+                             random_gate_parameters[gate_rep]);
+    }
+
+    const auto t_end = std::chrono::high_resolution_clock::now();
+
+    // Output walltime in csv format (Num Qubits, Time (milliseconds))
+    const auto walltime =
+        0.001 * ((std::chrono::duration_cast<std::chrono::microseconds>(
+                      t_end - t_start))
+                     .count());
+    std::cout << num_qubits << ", "
+              << walltime / static_cast<double>(num_gate_reps) << std::endl;
+    return 0;
+}
diff --git a/pennylane_lightning/src/examples/benchmark_multi_rz.cpp b/pennylane_lightning/src/examples/benchmark_multi_rz.cpp
index 180e93ba9a..49bac2ead2 100644
--- a/pennylane_lightning/src/examples/benchmark_multi_rz.cpp
+++ b/pennylane_lightning/src/examples/benchmark_multi_rz.cpp
@@ -1,5 +1,5 @@
 #include "ExampleUtil.hpp"
-#include "StateVectorManaged.hpp"
+#include "StateVectorCPU.hpp"
 
 #include <chrono>
 #include <cstdio>
@@ -54,7 +54,7 @@ int main(int argc, char *argv[]) {
         params.emplace_back(param_dist(re));
     }
 
-    StateVectorManaged<TestType> sv{num_qubits};
+    StateVectorCPU<TestType> sv{num_qubits};
 
     std::chrono::time_point<std::chrono::high_resolution_clock> t_start =
         std::chrono::high_resolution_clock::now();
diff --git a/pennylane_lightning/src/examples/benchmark_operation.cpp b/pennylane_lightning/src/examples/benchmark_operation.cpp
new file mode 100644
index 0000000000..0978a90550
--- /dev/null
+++ b/pennylane_lightning/src/examples/benchmark_operation.cpp
@@ -0,0 +1,210 @@
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <iostream>
+#include <map>
+#include <random>
+#include <stdexcept>
+#include <string>
+
+#include "Constant.hpp"
+#include "ExampleUtil.hpp"
+#include "StateVectorCPU.hpp"
+
+#ifdef USE_SINGLE_PRECISION
+using PrecisionT = float;
+#pragma message "Using single precision"
+#else
+using PrecisionT = double;
+#endif
+
+using namespace Pennylane;
+using namespace Pennylane::Gates;
+using namespace Pennylane::Util;
+
+std::string_view strip(std::string_view str) {
+    // Trim spaces/tabs; all-blank input yields "" (substr(npos) would throw).
+    str.remove_prefix(std::min(str.find_first_not_of(" \t"), str.size()));
+    return str.substr(0, str.find_last_not_of(" \t") + 1); // npos + 1 == 0
+}
+
+template <class RandomEngine>
+double benchmark_gate(RandomEngine &re, KernelType kernel,
+                      const std::string &gate_name, const size_t num_reps,
+                      const size_t num_qubits) {
+    // Benchmarks one gate: applies `gate_name` num_reps times through the
+    // selected kernel to a freshly constructed num_qubits statevector, with
+    // random wires, inverse flags and parameters drawn from `re`.
+    // Returns the wall time of the application loop in milliseconds.
+    using Clock = std::chrono::high_resolution_clock;
+
+    const GateOperation gate_op = Util::lookup(
+        Util::reverse_pairs(Constant::gate_names), std::string_view(gate_name));
+    const size_t n_wires = Util::lookup(Constant::gate_wires, gate_op);
+    const size_t n_params = Util::lookup(Constant::gate_num_params, gate_op);
+
+    std::uniform_int_distribution<size_t> coin(0, 1);
+    std::uniform_real_distribution<PrecisionT> angle(0.0, 2 * M_PI);
+    const auto draw_angle = [&angle, &re]() { return angle(re); };
+
+    // Draw every random input up front so the timed loop only applies gates
+    std::vector<std::vector<size_t>> wires_per_rep;
+    std::vector<bool> inverse_per_rep;
+    std::vector<std::vector<PrecisionT>> params_per_rep;
+    wires_per_rep.reserve(num_reps);
+    inverse_per_rep.reserve(num_reps);
+    params_per_rep.reserve(num_reps);
+
+    for (uint32_t rep = 0; rep < num_reps; rep++) {
+        // Draw order per repetition: inverse flag, wires, then parameters
+        inverse_per_rep.emplace_back(static_cast<bool>(coin(re)));
+        wires_per_rep.emplace_back(
+            generateNeighboringWires(re, num_qubits, n_wires));
+        std::vector<PrecisionT> angles(n_params);
+        std::generate(angles.begin(), angles.end(), draw_angle);
+        params_per_rep.emplace_back(std::move(angles));
+    }
+
+    // Echo the generated sequence to stderr when LOG parses to non-zero
+    const char *log_flag = std::getenv("LOG");
+    try {
+        const bool log_on = log_flag != nullptr && std::stoi(log_flag) != 0;
+        for (size_t rep = 0; log_on && rep < num_reps; rep++) {
+            std::cerr << gate_name << ", " << wires_per_rep[rep] << ","
+                      << inverse_per_rep[rep] << "," << params_per_rep[rep]
+                      << std::endl;
+        }
+    } catch (std::exception &) {
+        // Just do not print log
+    }
+
+    // Run benchmark. Total num_reps number of gates is used.
+    StateVectorCPU<PrecisionT> svdat{num_qubits};
+
+    const auto t_start = Clock::now();
+    for (size_t rep = 0; rep < num_reps; rep++) {
+        svdat.applyOperation(kernel, gate_name, wires_per_rep[rep],
+                             inverse_per_rep[rep], params_per_rep[rep]);
+    }
+    const auto t_end = Clock::now();
+
+    const std::chrono::duration<double, std::milli> elapsed = t_end - t_start;
+    return elapsed.count();
+}
+
+template <class RandomEngine>
+double benchmark_generator(RandomEngine &re, KernelType kernel,
+                           const std::string &gntr_name, const size_t num_reps,
+                           const size_t num_qubits) {
+    // Times num_reps applications of a generator through the selected kernel
+    // on a fresh num_qubits statevector; returns milliseconds. gntr_name must
+    // start with "Generator" (e.g. "GeneratorCRX"): the table lookup uses the
+    // full name, applyGenerator the name with that prefix removed.
+    constexpr std::string_view prefix = "Generator";
+    const auto gntr_name_without_prefix = gntr_name.substr(prefix.size());
+    const GeneratorOperation gntr_op =
+        Util::lookup(Util::reverse_pairs(Constant::generator_names),
+                     std::string_view(gntr_name));
+    const size_t num_wires = Util::lookup(Constant::generator_wires, gntr_op);
+
+    // Generate random generator sequences
+    std::vector<std::vector<size_t>> random_wires;
+    std::vector<bool> random_inverses;
+    random_wires.reserve(num_reps);
+    random_inverses.reserve(num_reps);
+    std::uniform_int_distribution<size_t> inverse_dist(0, 1);
+    for (size_t k = 0; k < num_reps; k++) {
+        random_inverses.emplace_back(static_cast<bool>(inverse_dist(re)));
+        random_wires.emplace_back(
+            generateNeighboringWires(re, num_qubits, num_wires));
+    }
+
+    // Log generated sequence if LOG is turned on
+    const char *env_p = std::getenv("LOG");
+    try {
+        if (env_p != nullptr && std::stoi(env_p) != 0) {
+            for (size_t gate_rep = 0; gate_rep < num_reps; gate_rep++) {
+                std::cerr << gntr_name << ", " << random_wires[gate_rep] << ","
+                          << random_inverses[gate_rep] << std::endl;
+            }
+        }
+    } catch (const std::exception &) {
+        // Just do not print log
+    }
+
+    // Run benchmark. Total num_reps number of gates is used.
+    StateVectorCPU<PrecisionT> svdat{num_qubits};
+    const auto t_start = std::chrono::high_resolution_clock::now();
+    for (size_t gate_rep = 0; gate_rep < num_reps; gate_rep++) {
+        [[maybe_unused]] auto scale = svdat.applyGenerator(
+            kernel, gntr_name_without_prefix, random_wires[gate_rep],
+            random_inverses[gate_rep]);
+    }
+    const auto t_end = std::chrono::high_resolution_clock::now();
+
+    return std::chrono::duration<double, std::milli>(t_end - t_start).count();
+}
+
+/**
+ * @brief Benchmark Pennylane-Lightning for a given gate or generator
+ *
+ * @param argc Number of arguments
+ * @param argv Command line arguments
+ * @return 0 if completed successfully, -1 on bad arguments, 1 on unknown kernel
+ */
+int main(int argc, char *argv[]) {
+    // Handle input
+    if (argc < 5) { // NOLINT(readability-magic-numbers)
+        std::cerr << "Wrong number of inputs. User provided " << argc - 1
+                  << " inputs. \n"
+                  << "Usage: " + std::string(argv[0]) +
+                         " num_reps num_qubits kernel [generator|gate]\n"
+                         "Examples: \n"
+                         "\t"
+                  << argv[0] << " 1000 10 PI GeneratorCRX\n"
+                  << "\t" << argv[0] << " 1000 10 LM CRX"
+                  << std::endl; // Change to std::format in C++20
+        return -1;
+    }
+
+    size_t num_reps = 0;
+    size_t num_qubits = 0;
+    try {
+        // Parse as signed: a negative std::stoi result would wrap in size_t
+        const int reps = std::stoi(argv[1]);
+        const int qubits = std::stoi(argv[2]);
+        if (reps <= 0 || qubits <= 0) {
+            throw std::invalid_argument("counts must be positive");
+        }
+        num_reps = static_cast<size_t>(reps);
+        num_qubits = static_cast<size_t>(qubits);
+    } catch (const std::exception &) {
+        std::cerr << "Arguments num_reps and num_qubits must be positive"
+                  << " integers." << std::endl;
+        return -1;
+    }
+
+    std::string_view kernel_name = argv[3];
+    KernelType kernel = string_to_kernel(kernel_name);
+    if (kernel == KernelType::None) {
+        std::cerr << "Kernel " << kernel_name << " is unknown." << std::endl;
+        return 1;
+    }
+
+    // Names starting with "Generator" are benchmarked as generators
+    const std::string_view gate_or_gntr_name = argv[4];
+    const std::string_view generator_prefix = "Generator";
+    const bool is_gntr = gate_or_gntr_name.rfind(generator_prefix, 0) == 0;
+
+    std::random_device rd;
+    std::mt19937 re(rd());
+    const std::string op_name(gate_or_gntr_name);
+    const double walltime =
+        is_gntr ? benchmark_generator(re, kernel, op_name, num_reps, num_qubits)
+                : benchmark_gate(re, kernel, op_name, num_reps, num_qubits);
+
+    // Output walltime in csv format (Num Qubits, Time (milliseconds))
+    std::cout << num_qubits << ", " << walltime / static_cast<double>(num_reps)
+              << std::endl;
+    return 0;
+}
diff --git a/pennylane_lightning/src/examples/gate_benchmark.cpp b/pennylane_lightning/src/examples/gate_benchmark.cpp
index a8c8745a25..0f2a12d21c 100644
--- a/pennylane_lightning/src/examples/gate_benchmark.cpp
+++ b/pennylane_lightning/src/examples/gate_benchmark.cpp
@@ -6,7 +6,6 @@
 #include <stdexcept>
 #include <string>
 
-#include "IndicesUtil.hpp"
 #include "StateVectorManaged.hpp"
 
 /**
@@ -18,7 +17,6 @@
  */
 int main(int argc, char *argv[]) {
     using TestType = double;
-    namespace IndicesUtil = Pennylane::IndicesUtil;
 
     // Handle input
     try {
diff --git a/pennylane_lightning/src/examples/run_gate_benchmark.sh b/pennylane_lightning/src/examples/run_gate_benchmark.sh
index 3e310e0f88..315c3ebdda 100755
--- a/pennylane_lightning/src/examples/run_gate_benchmark.sh
+++ b/pennylane_lightning/src/examples/run_gate_benchmark.sh
@@ -19,7 +19,7 @@ compiler_info=$(<compiler_info.txt)
 
 if [[ "$gate" != "MultiRZ" ]]; then
 	# Creating data file
-	binary_name="./gate_benchmark_oplist"
+	binary_name="./benchmark_operation"
 	path_to_binary="$currdir/$binary_name"
 
 	resdir="$currdir/res_${compiler_info}"
diff --git a/pennylane_lightning/src/gates/AvailableKernels.hpp b/pennylane_lightning/src/gates/AvailableKernels.hpp
index af06fa536a..ae18be341b 100644
--- a/pennylane_lightning/src/gates/AvailableKernels.hpp
+++ b/pennylane_lightning/src/gates/AvailableKernels.hpp
@@ -18,9 +18,9 @@
  */
 #pragma once
 
-#include "GateImplementationsLM.hpp"
-#include "GateImplementationsPI.hpp"
 #include "TypeList.hpp"
+#include "cpu_kernels/GateImplementationsLM.hpp"
+#include "cpu_kernels/GateImplementationsPI.hpp"
 
 namespace Pennylane {
 /**
@@ -32,6 +32,6 @@ namespace Pennylane {
  * See :ref:`lightning_add_gate_implementation` for details.
  * @endrst
  */
-using AvailableKernels =
-    Util::TypeList<Gates::GateImplementationsLM, Gates::GateImplementationsPI>;
+using AvailableKernels = Util::TypeList<Gates::GateImplementationsLM,
+                                        Gates::GateImplementationsPI, void>;
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/gates/Gates.hpp b/pennylane_lightning/src/gates/Gates.hpp
index a9141fec48..d12598ccc4 100644
--- a/pennylane_lightning/src/gates/Gates.hpp
+++ b/pennylane_lightning/src/gates/Gates.hpp
@@ -330,7 +330,7 @@ e^{-i(\phi-\omega)/2}\sin(\theta/2) & e^{i(\phi+\omega)/2}\cos(\theta/2)
  * @return const std::vector<std::complex<T>> Return const Rot gate data.
  */
 template <class T, class U = T>
-static auto getRot(U phi, U theta, U omega) -> std::vector<std::complex<T>> {
+static auto getRot(U phi, U theta, U omega) -> std::array<std::complex<T>, 4> {
     using namespace Util;
     const T c = std::cos(theta / 2);
     const T s = std::sin(theta / 2);
diff --git a/pennylane_lightning/src/gates/GateImplementationsLM.hpp b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
similarity index 87%
rename from pennylane_lightning/src/gates/GateImplementationsLM.hpp
rename to pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
index 9f227862b2..6e0060fc54 100644
--- a/pennylane_lightning/src/gates/GateImplementationsLM.hpp
+++ b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
@@ -16,6 +16,7 @@
  * Defines kernel functions with less memory (and fast)
  */
 #pragma once
+#include "PauliGenerator.hpp"
 
 #include "BitUtil.hpp"
 #include "Error.hpp"
@@ -23,7 +24,6 @@
 #include "Gates.hpp"
 #include "KernelType.hpp"
 #include "LinearAlgebra.hpp"
-#include "PauliGenerator.hpp"
 
 #include <complex>
 #include <vector>
@@ -38,10 +38,20 @@ namespace Pennylane::Gates {
  * @tparam PrecisionT Floating point precision of underlying statevector data
  */
 class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
+  private:
+    /* Alias utility functions */
+    static constexpr auto fillLeadingOnes = Util::fillLeadingOnes;
+    static constexpr auto fillTrailingOnes = Util::fillTrailingOnes;
+    static constexpr auto bitswap = Util::bitswap;
+
   public:
     constexpr static KernelType kernel_id = KernelType::LM;
     constexpr static std::string_view name = "LM";
-    constexpr static uint32_t data_alignment_in_bytes = 1;
+    template <typename PrecisionT>
+    constexpr static size_t required_alignment =
+        std::alignment_of_v<PrecisionT>; // alignment of one PrecisionT
+    template <typename PrecisionT>
+    constexpr static size_t packed_bytes = sizeof(PrecisionT); // one scalar
 
     constexpr static std::array implemented_gates = {
         GateOperation::PauliX,  GateOperation::PauliY,
@@ -53,25 +63,26 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
         GateOperation::CZ,      GateOperation::CNOT,
         GateOperation::SWAP,    GateOperation::ControlledPhaseShift,
         GateOperation::CRX,     GateOperation::CRY,
-        GateOperation::CRZ,     GateOperation::IsingXX,
-        GateOperation::IsingYY, GateOperation::IsingZZ,
-        GateOperation::MultiRZ, GateOperation::Matrix};
+        GateOperation::CRZ,     GateOperation::CRot,
+        GateOperation::IsingXX, GateOperation::IsingYY,
+        GateOperation::IsingZZ, GateOperation::MultiRZ,
+        GateOperation::Matrix};
 
     constexpr static std::array implemented_generators = {
-        GeneratorOperation::RX,      GeneratorOperation::RY,
-        GeneratorOperation::RZ,      GeneratorOperation::PhaseShift,
-        GeneratorOperation::CRX,     GeneratorOperation::CRY,
-        GeneratorOperation::CRZ,     GeneratorOperation::IsingXX,
-        GeneratorOperation::IsingYY, GeneratorOperation::IsingZZ,
+        GeneratorOperation::RX,
+        GeneratorOperation::RY,
+        GeneratorOperation::RZ,
+        GeneratorOperation::PhaseShift,
+        GeneratorOperation::CRX,
+        GeneratorOperation::CRY,
+        GeneratorOperation::CRZ,
+        GeneratorOperation::IsingXX,
+        GeneratorOperation::IsingYY,
+        GeneratorOperation::IsingZZ,
+        GeneratorOperation::ControlledPhaseShift,
         GeneratorOperation::MultiRZ,
     };
 
-  private:
-    /* Alias utility functions */
-    static constexpr auto fillLeadingOnes = Util::fillLeadingOnes;
-    static constexpr auto fillTrailingOnes = Util::fillTrailingOnes;
-    static constexpr auto bitswap = Util::bitswap;
-
     /**
      * @brief Apply a single qubit gate to the statevector.
      *
@@ -230,7 +241,6 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
         }
     }
 
-  public:
     template <class PrecisionT>
     static void applyMatrix(std::complex<PrecisionT> *arr, size_t num_qubits,
                             const std::complex<PrecisionT> *matrix,
@@ -257,8 +267,8 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
                     size_t idx = k | inner_idx;
                     size_t n_wires = wires.size();
                     for (size_t pos = 0; pos < n_wires; pos++) {
-                        bitswap(idx, n_wires - pos - 1,
-                                num_qubits - wires[pos] - 1);
+                        idx = bitswap(idx, n_wires - pos - 1,
+                                      num_qubits - wires[pos] - 1);
                     }
                     indices[inner_idx] = idx;
                     coeffs_in[inner_idx] = arr[idx];
@@ -281,9 +291,8 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
                             const size_t num_qubits,
                             const std::vector<size_t> &wires,
                             [[maybe_unused]] bool inverse) {
-        using Util::fillLeadingOnes, Util::fillTrailingOnes;
-
         assert(wires.size() == 1);
+
         const size_t rev_wire = num_qubits - wires[0] - 1;
         const size_t rev_wire_shift = (static_cast<size_t>(1U) << rev_wire);
         const size_t wire_parity = fillTrailingOnes(rev_wire);
@@ -341,10 +350,20 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
                               const std::vector<size_t> &wires,
                               [[maybe_unused]] bool inverse) {
         assert(wires.size() == 1);
-        constexpr auto isqrt2 = Util::INVSQRT2<PrecisionT>();
-        constexpr static std::array<std::complex<PrecisionT>, 4> hadamardMat = {
-            isqrt2, isqrt2, isqrt2, -isqrt2};
-        applySingleQubitOp(arr, num_qubits, hadamardMat.data(), wires[0]);
+        constexpr static auto isqrt2 = Util::INVSQRT2<PrecisionT>();
+        const size_t rev_wire = num_qubits - wires[0] - 1;
+        const size_t rev_wire_shift = (static_cast<size_t>(1U) << rev_wire);
+        const size_t wire_parity = fillTrailingOnes(rev_wire);
+        const size_t wire_parity_inv = fillLeadingOnes(rev_wire + 1);
+
+        for (size_t k = 0; k < Util::exp2(num_qubits - 1); k++) {
+            const size_t i0 = ((k << 1U) & wire_parity_inv) | (wire_parity & k);
+            const size_t i1 = i0 | rev_wire_shift;
+            const std::complex<PrecisionT> v0 = arr[i0];
+            const std::complex<PrecisionT> v1 = arr[i1];
+            arr[i0] = isqrt2 * v0 + isqrt2 * v1;
+            arr[i1] = isqrt2 * v0 - isqrt2 * v1;
+        }
     }
 
     template <class PrecisionT>
@@ -377,11 +396,10 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
         const size_t wire_parity = fillTrailingOnes(rev_wire);
         const size_t wire_parity_inv = fillLeadingOnes(rev_wire + 1);
 
-        const std::complex<PrecisionT> shift =
-            (inverse) ? std::conj(std::exp(std::complex<PrecisionT>(
-                            0, static_cast<PrecisionT>(M_PI / 4))))
-                      : std::exp(std::complex<PrecisionT>(
-                            0, static_cast<PrecisionT>(M_PI / 4)));
+        constexpr static auto isqrt2 = Util::INVSQRT2<PrecisionT>();
+
+        const std::complex<PrecisionT> shift = {isqrt2,
+                                                inverse ? -isqrt2 : isqrt2};
 
         for (size_t k = 0; k < Util::exp2(num_qubits - 1); k++) {
             const size_t i0 = ((k << 1U) & wire_parity_inv) | (wire_parity & k);
@@ -395,8 +413,6 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
                                 const size_t num_qubits,
                                 const std::vector<size_t> &wires, bool inverse,
                                 ParamT angle) {
-        using Util::fillLeadingOnes, Util::fillTrailingOnes;
-
         assert(wires.size() == 1);
         const size_t rev_wire = num_qubits - wires[0] - 1;
         const size_t rev_wire_shift = (static_cast<size_t>(1U) << rev_wire);
@@ -419,14 +435,25 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
                         const std::vector<size_t> &wires, bool inverse,
                         ParamT angle) {
         assert(wires.size() == 1);
+        const size_t rev_wire = num_qubits - wires[0] - 1;
+        const size_t rev_wire_shift = (static_cast<size_t>(1U) << rev_wire);
+        const size_t wire_parity = fillTrailingOnes(rev_wire);
+        const size_t wire_parity_inv = fillLeadingOnes(rev_wire + 1);
 
         const PrecisionT c = std::cos(angle / 2);
         const PrecisionT js =
             (inverse) ? -std::sin(-angle / 2) : std::sin(-angle / 2);
 
-        const std::array<std::complex<PrecisionT>, 4> RXMat = {
-            c, Util::IMAG<PrecisionT>() * js, Util::IMAG<PrecisionT>() * js, c};
-        applySingleQubitOp(arr, num_qubits, RXMat.data(), wires[0]);
+        for (size_t k = 0; k < Util::exp2(num_qubits - 1); k++) {
+            const size_t i0 = ((k << 1U) & wire_parity_inv) | (wire_parity & k);
+            const size_t i1 = i0 | rev_wire_shift;
+            const std::complex<PrecisionT> v0 = arr[i0];
+            const std::complex<PrecisionT> v1 = arr[i1];
+            arr[i0] = c * v0 +
+                      std::complex<PrecisionT>{-imag(v1) * js, real(v1) * js};
+            arr[i1] = std::complex<PrecisionT>{-imag(v0) * js, real(v0) * js} +
+                      c * v1;
+        }
     }
 
     template <class PrecisionT, class ParamT = PrecisionT>
@@ -434,13 +461,25 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
                         const std::vector<size_t> &wires, bool inverse,
                         ParamT angle) {
         assert(wires.size() == 1);
+        const size_t rev_wire = num_qubits - wires[0] - 1;
+        const size_t rev_wire_shift = (static_cast<size_t>(1U) << rev_wire);
+        const size_t wire_parity = fillTrailingOnes(rev_wire);
+        const size_t wire_parity_inv = fillLeadingOnes(rev_wire + 1);
 
         const PrecisionT c = std::cos(angle / 2);
         const PrecisionT s =
             (inverse) ? -std::sin(angle / 2) : std::sin(angle / 2);
 
-        const std::array<std::complex<PrecisionT>, 4> RYMat = {c, -s, s, c};
-        applySingleQubitOp(arr, num_qubits, RYMat.data(), wires[0]);
+        for (size_t k = 0; k < Util::exp2(num_qubits - 1); k++) {
+            const size_t i0 = ((k << 1U) & wire_parity_inv) | (wire_parity & k);
+            const size_t i1 = i0 | rev_wire_shift;
+            const std::complex<PrecisionT> v0 = arr[i0];
+            const std::complex<PrecisionT> v1 = arr[i1];
+            arr[i0] = std::complex<PrecisionT>{c * real(v0) - s * real(v1),
+                                               c * imag(v0) - s * imag(v1)};
+            arr[i1] = std::complex<PrecisionT>{s * real(v0) + c * real(v1),
+                                               s * imag(v0) + c * imag(v1)};
+        }
     }
 
     template <class PrecisionT, class ParamT = PrecisionT>
@@ -571,7 +610,6 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
         const size_t parity_middle =
             fillLeadingOnes(rev_wire_min + 1) & fillTrailingOnes(rev_wire_max);
 
-        /* This is faster than iterate over all indices */
         for (size_t k = 0; k < Util::exp2(num_qubits - 2); k++) {
             const size_t i00 = ((k << 2U) & parity_high) |
                                ((k << 1U) & parity_middle) | (k & parity_low);
@@ -580,6 +618,43 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
         }
     }
 
+    template <class PrecisionT, class ParamT = PrecisionT>
+    static void applyCRot(std::complex<PrecisionT> *arr, size_t num_qubits,
+                          const std::vector<size_t> &wires, bool inverse,
+                          ParamT phi, ParamT theta, ParamT omega) {
+        assert(wires.size() == 2);
+        // wires[0] is the control qubit, wires[1] the target qubit.
+        const size_t ctrl_bit = num_qubits - wires[0] - 1;
+        const size_t target_bit = num_qubits - wires[1] - 1;
+        const size_t ctrl_mask = static_cast<size_t>(1U) << ctrl_bit;
+        const size_t target_mask = static_cast<size_t>(1U) << target_bit;
+
+        // NOTE(review): presumably bit ranges below/between/above the wires.
+        const size_t lower_bit = std::min(target_bit, ctrl_bit);
+        const size_t upper_bit = std::max(target_bit, ctrl_bit);
+        const size_t mask_low = fillTrailingOnes(lower_bit);
+        const size_t mask_mid =
+            fillLeadingOnes(lower_bit + 1) & fillTrailingOnes(upper_bit);
+        const size_t mask_high = fillLeadingOnes(upper_bit + 1);
+
+        // The inverse reverses the angle order and negates every angle.
+        const auto rot =
+            (!inverse) ? Gates::getRot<PrecisionT>(phi, theta, omega)
+                       : Gates::getRot<PrecisionT>(-omega, -theta, -phi);
+
+        for (size_t k = 0; k < Util::exp2(num_qubits - 2); ++k) {
+            const size_t base = (k & mask_low) | ((k << 1U) & mask_mid) |
+                                ((k << 2U) & mask_high);
+            const size_t idx_c1t0 = base | ctrl_mask;
+            const size_t idx_c1t1 = idx_c1t0 | target_mask;
+            // Only amplitudes with the control bit set are transformed.
+            const std::complex<PrecisionT> a0 = arr[idx_c1t0];
+            const std::complex<PrecisionT> a1 = arr[idx_c1t1];
+            arr[idx_c1t0] = rot[0] * a0 + rot[1] * a1;
+            arr[idx_c1t1] = rot[2] * a0 + rot[3] * a1;
+        }
+    }
+
     template <class PrecisionT>
     static void applySWAP(std::complex<PrecisionT> *arr, size_t num_qubits,
                           const std::vector<size_t> &wires,
@@ -863,8 +938,10 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
             const std::complex<PrecisionT> v10 = arr[i10];
             const std::complex<PrecisionT> v11 = arr[i11];
 
-            arr[i10] = c * v10 + -s * v11;
-            arr[i11] = s * v10 + c * v11;
+            arr[i10] = std::complex<PrecisionT>{c * real(v10) - s * real(v11),
+                                                c * imag(v10) - s * imag(v11)};
+            arr[i11] = std::complex<PrecisionT>{s * real(v10) + c * real(v11),
+                                                s * imag(v10) + c * imag(v11)};
         }
     }
 
@@ -1025,6 +1102,7 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
         // NOLINTNEXTLINE(readability-magic-numbers)
         return -static_cast<PrecisionT>(0.5);
     }
+
     template <class PrecisionT>
     [[nodiscard]] static auto
     applyGeneratorIsingZZ(std::complex<PrecisionT> *arr, size_t num_qubits,
@@ -1176,6 +1254,42 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
         return -static_cast<PrecisionT>(0.5);
     }
 
+    template <class PrecisionT>
+    [[nodiscard]] static auto applyGeneratorControlledPhaseShift(
+        std::complex<PrecisionT> *arr, size_t num_qubits,
+        const std::vector<size_t> &wires, [[maybe_unused]] bool adj)
+        -> PrecisionT {
+        assert(wires.size() == 2);
+        // wires[0] is the control qubit, wires[1] the target qubit.
+        const size_t ctrl_bit = num_qubits - wires[0] - 1;
+        const size_t target_bit = num_qubits - wires[1] - 1;
+        const size_t ctrl_mask = static_cast<size_t>(1U) << ctrl_bit;
+        const size_t target_mask = static_cast<size_t>(1U) << target_bit;
+
+        const size_t lower_bit = std::min(target_bit, ctrl_bit);
+        const size_t upper_bit = std::max(target_bit, ctrl_bit);
+        const size_t mask_low = fillTrailingOnes(lower_bit);
+        const size_t mask_mid =
+            fillLeadingOnes(lower_bit + 1) & fillTrailingOnes(upper_bit);
+        const size_t mask_high = fillLeadingOnes(upper_bit + 1);
+
+        // Three of the four amplitudes of each group are zeroed;
+        // arr[base | ctrl_mask | target_mask] is never written.
+        const std::complex<PrecisionT> zero{};
+        for (size_t k = 0; k < Util::exp2(num_qubits - 2); ++k) {
+            const size_t base = (k & mask_low) | ((k << 1U) & mask_mid) |
+                                ((k << 2U) & mask_high);
+            arr[base] = zero;
+            arr[base | target_mask] = zero;
+            arr[base | ctrl_mask] = zero;
+        }
+
+        // The returned value is always 1 (presumably the generator's
+        // scaling factor, as with the sibling generators -- confirm).
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        return static_cast<PrecisionT>(1);
+    }
+
     template <class PrecisionT>
     [[nodiscard]] static auto
     applyGeneratorMultiRZ(std::complex<PrecisionT> *arr, size_t num_qubits,
diff --git a/pennylane_lightning/src/gates/GateImplementationsPI.hpp b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp
similarity index 91%
rename from pennylane_lightning/src/gates/GateImplementationsPI.hpp
rename to pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp
index 69a5826efc..82a0edf924 100644
--- a/pennylane_lightning/src/gates/GateImplementationsPI.hpp
+++ b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp
@@ -24,13 +24,14 @@
 #endif
 /// @endcond
 
+#include "PauliGenerator.hpp"
+
 #include "BitUtil.hpp"
 #include "GateOperation.hpp"
 #include "GateUtil.hpp"
 #include "Gates.hpp"
 #include "KernelType.hpp"
 #include "LinearAlgebra.hpp"
-#include "PauliGenerator.hpp"
 
 #include <complex>
 #include <vector>
@@ -48,7 +49,11 @@ class GateImplementationsPI : public PauliGenerator<GateImplementationsPI> {
   public:
     constexpr static KernelType kernel_id = KernelType::PI;
     constexpr static std::string_view name = "PI";
-    constexpr static uint32_t data_alignment_in_bytes = 1;
+    template <typename PrecisionT>
+    constexpr static size_t required_alignment =
+        std::alignment_of_v<PrecisionT>; // alignment of one PrecisionT
+    template <typename PrecisionT>
+    constexpr static size_t packed_bytes = sizeof(PrecisionT); // as in LM
 
     constexpr static std::array implemented_gates = {
         GateOperation::PauliX,  GateOperation::PauliY,
@@ -66,10 +71,17 @@ class GateImplementationsPI : public PauliGenerator<GateImplementationsPI> {
         GateOperation::CSWAP,   GateOperation::MultiRZ,
         GateOperation::Matrix};
     constexpr static std::array implemented_generators = {
-        GeneratorOperation::RX,  GeneratorOperation::RY,
-        GeneratorOperation::RZ,  GeneratorOperation::PhaseShift,
-        GeneratorOperation::CRX, GeneratorOperation::CRY,
-        GeneratorOperation::CRZ, GeneratorOperation::ControlledPhaseShift};
+        GeneratorOperation::RX,
+        GeneratorOperation::RY,
+        GeneratorOperation::RZ,
+        GeneratorOperation::PhaseShift,
+        GeneratorOperation::IsingXX,
+        GeneratorOperation::IsingYY,
+        GeneratorOperation::IsingZZ,
+        GeneratorOperation::CRX,
+        GeneratorOperation::CRY,
+        GeneratorOperation::CRZ,
+        GeneratorOperation::ControlledPhaseShift};
 
     /**
      * @brief Apply a given matrix directly to the statevector.
@@ -327,8 +339,7 @@ class GateImplementationsPI : public PauliGenerator<GateImplementationsPI> {
         assert(wires.size() == 1);
         const auto [indices, externalIndices] = GateIndices(wires, num_qubits);
 
-        const std::vector<std::complex<PrecisionT>> rot =
-            Gates::getRot<PrecisionT>(phi, theta, omega);
+        const auto rot = Gates::getRot<PrecisionT>(phi, theta, omega);
 
         const std::complex<PrecisionT> t1 =
             (inverse) ? std::conj(rot[0]) : rot[0];
@@ -687,6 +698,63 @@ class GateImplementationsPI : public PauliGenerator<GateImplementationsPI> {
         return -static_cast<PrecisionT>(0.5);
     }
 
+    template <class PrecisionT>
+    [[nodiscard]] static auto
+    applyGeneratorIsingXX(std::complex<PrecisionT> *arr, size_t num_qubits,
+                          const std::vector<size_t> &wires,
+                          [[maybe_unused]] bool adj) -> PrecisionT {
+        assert(wires.size() == 2);
+        const auto [inner, outer] = GateIndices(wires, num_qubits);
+        // For each outer offset, exchange entries 0 <-> 3 and 1 <-> 2.
+        for (const size_t offset : outer) {
+            std::complex<PrecisionT> *const block = arr + offset;
+            std::swap(block[inner[3]], block[inner[0]]);
+            std::swap(block[inner[1]], block[inner[2]]);
+        }
+        // Always returns -0.5.
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        return -static_cast<PrecisionT>(0.5);
+    }
+
+    template <class PrecisionT>
+    [[nodiscard]] static auto
+    applyGeneratorIsingYY(std::complex<PrecisionT> *arr, size_t num_qubits,
+                          const std::vector<size_t> &wires,
+                          [[maybe_unused]] bool adj) -> PrecisionT {
+        assert(wires.size() == 2);
+        const auto [inner, outer] = GateIndices(wires, num_qubits);
+        // Entries 0 and 3 trade places with a sign flip; 1 and 2 just swap.
+        for (const size_t offset : outer) {
+            std::complex<PrecisionT> *const block = arr + offset;
+            const std::complex<PrecisionT> first = block[inner[0]];
+            block[inner[0]] = -block[inner[3]];
+            block[inner[3]] = -first;
+            std::swap(block[inner[1]], block[inner[2]]);
+        }
+        // Always returns -0.5.
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        return -static_cast<PrecisionT>(0.5);
+    }
+
+    template <class PrecisionT>
+    [[nodiscard]] static auto
+    applyGeneratorIsingZZ(std::complex<PrecisionT> *arr, size_t num_qubits,
+                          const std::vector<size_t> &wires,
+                          [[maybe_unused]] bool adj) -> PrecisionT {
+        assert(wires.size() == 2);
+        const auto [inner, outer] = GateIndices(wires, num_qubits);
+
+        // Entries 1 and 2 of every block are multiplied by -1; entries 0
+        // and 3 are left as they are.
+        for (const size_t offset : outer) {
+            std::complex<PrecisionT> *const block = arr + offset;
+            block[inner[1]] *= -1;
+            block[inner[2]] *= -1;
+        }
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        return -static_cast<PrecisionT>(0.5);
+    }
+
     template <class PrecisionT>
     [[nodiscard]] static auto
     applyGeneratorCRY(std::complex<PrecisionT> *arr, size_t num_qubits,
diff --git a/pennylane_lightning/src/gates/PauliGenerator.hpp b/pennylane_lightning/src/gates/cpu_kernels/PauliGenerator.hpp
similarity index 100%
rename from pennylane_lightning/src/gates/PauliGenerator.hpp
rename to pennylane_lightning/src/gates/cpu_kernels/PauliGenerator.hpp
diff --git a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
new file mode 100644
index 0000000000..72613bc386
--- /dev/null
+++ b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
@@ -0,0 +1,400 @@
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/** @file */
+#pragma once
+#include "DispatchKeys.hpp"
+#include "GateOperation.hpp"
+#include "KernelType.hpp"
+
+#include <algorithm>
+#include <functional>
+#include <stdexcept>
+#include <unordered_map>
+namespace Pennylane {
+
+// Factories for predicates on the number of qubits; every returned lambda
+// keeps its own copy of the bound(s) it was created with.
+inline auto larger_than(size_t size) {
+    return [size](size_t n) { return size < n; };
+}
+inline auto larger_than_equal_to(size_t size) {
+    return [size](size_t n) { return size <= n; };
+}
+inline auto less_than(size_t size) {
+    return [size](size_t n) { return size > n; };
+}
+inline auto less_than_equal_to(size_t size) {
+    return [size](size_t n) { return size >= n; };
+}
+inline auto in_between_closed(size_t l1, size_t l2) {
+    return [l1, l2](size_t n) { return (l1 <= n) && (n <= l2); };
+}
+
+class DefaultKernelsForStateVector {
+  private:
+    const static inline std::unordered_map<CPUMemoryModel,
+                                           std::vector<Gates::KernelType>>
+        allowed_kernels{
+            {CPUMemoryModel::Unaligned,
+             {Gates::KernelType::LM, Gates::KernelType::PI}},
+            {CPUMemoryModel::Aligned256,
+             {Gates::KernelType::LM, Gates::KernelType::PI}},
+            {CPUMemoryModel::Aligned512,
+             {Gates::KernelType::LM, Gates::KernelType::PI}},
+        };
+    // Per gate: (dispatch key, qubit-count predicate, kernel) candidates.
+    std::unordered_map<
+        Gates::GateOperation,
+        std::vector<std::tuple<uint32_t, std::function<bool(size_t)>,
+                               Gates::KernelType>>>
+        gate_kernel_map_;
+    // Per generator: (dispatch key, qubit-count predicate, kernel) candidates.
+    std::unordered_map<
+        Gates::GeneratorOperation,
+        std::vector<std::tuple<uint32_t, std::function<bool(size_t)>,
+                               Gates::KernelType>>>
+        generator_kernel_map_;
+
+    void registerDefaultGates() {
+        using Gates::GateOperation;
+        auto &instance = *this;
+        auto all_qubit_numbers = []([[maybe_unused]] size_t num_qubits) {
+            return true;
+        };
+        /* Single-qubit gates */
+        instance.assignKernelForGate(GateOperation::PauliX, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::PauliY, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::PauliZ, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::Hadamard, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::S, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::T, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::PhaseShift, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::RX, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::RY, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::RZ, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::Rot, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        /* Two-qubit gates */
+        instance.assignKernelForGate(GateOperation::CNOT, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::CY, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::CZ, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::ControlledPhaseShift,
+                                     all_threading, all_memory_model,
+                                     all_qubit_numbers, Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::SWAP, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        // IsingXX: LM below 12 qubits, PI for 12..20 qubits, LM above 20.
+        instance.assignKernelForGate(GateOperation::IsingXX, all_threading,
+                                     all_memory_model, less_than(12),
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(
+            GateOperation::IsingXX, all_threading, all_memory_model,
+            in_between_closed(12, 20), Gates::KernelType::PI);
+        instance.assignKernelForGate(GateOperation::IsingXX, all_threading,
+                                     all_memory_model, larger_than(20),
+                                     Gates::KernelType::LM);
+        // Every other gate keeps a single kernel for all qubit counts.
+        instance.assignKernelForGate(GateOperation::IsingYY, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::IsingZZ, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::CRX, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::CRY, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::CRZ, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::CRot, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(GateOperation::Toffoli, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::PI);
+        instance.assignKernelForGate(GateOperation::CSWAP, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::PI);
+        instance.assignKernelForGate(GateOperation::MultiRZ, all_threading,
+                                     all_memory_model, all_qubit_numbers,
+                                     Gates::KernelType::LM);
+    }
+
+    void registerDefaultGenerators() {
+        using Gates::GeneratorOperation;
+        using Gates::KernelType;
+        auto &instance = *this;
+        auto all_qubit_numbers = []([[maybe_unused]] size_t num_qubits) {
+            return true;
+        };
+        // Every generator is assigned the LM kernel for all qubit counts.
+        instance.assignKernelForGenerator(GeneratorOperation::PhaseShift,
+                                          all_threading, all_memory_model,
+                                          all_qubit_numbers, KernelType::LM);
+        instance.assignKernelForGenerator(GeneratorOperation::RX, all_threading,
+                                          all_memory_model, all_qubit_numbers,
+                                          KernelType::LM);
+        instance.assignKernelForGenerator(GeneratorOperation::RY, all_threading,
+                                          all_memory_model, all_qubit_numbers,
+                                          KernelType::LM);
+        instance.assignKernelForGenerator(GeneratorOperation::RZ, all_threading,
+                                          all_memory_model, all_qubit_numbers,
+                                          KernelType::LM);
+        instance.assignKernelForGenerator(GeneratorOperation::IsingXX,
+                                          all_threading, all_memory_model,
+                                          all_qubit_numbers, KernelType::LM);
+        instance.assignKernelForGenerator(GeneratorOperation::IsingYY,
+                                          all_threading, all_memory_model,
+                                          all_qubit_numbers, KernelType::LM);
+        instance.assignKernelForGenerator(GeneratorOperation::IsingZZ,
+                                          all_threading, all_memory_model,
+                                          all_qubit_numbers, KernelType::LM);
+        instance.assignKernelForGenerator(GeneratorOperation::CRX,
+                                          all_threading, all_memory_model,
+                                          all_qubit_numbers, KernelType::LM);
+        instance.assignKernelForGenerator(GeneratorOperation::CRY,
+                                          all_threading, all_memory_model,
+                                          all_qubit_numbers, KernelType::LM);
+        instance.assignKernelForGenerator(GeneratorOperation::CRZ,
+                                          all_threading, all_memory_model,
+                                          all_qubit_numbers, KernelType::LM);
+        instance.assignKernelForGenerator(
+            GeneratorOperation::ControlledPhaseShift, all_threading,
+            all_memory_model, all_qubit_numbers, KernelType::LM);
+        instance.assignKernelForGenerator(GeneratorOperation::MultiRZ,
+                                          all_threading, all_memory_model,
+                                          all_qubit_numbers, KernelType::LM);
+    }
+
+    DefaultKernelsForStateVector() {
+        registerDefaultGates();
+        registerDefaultGenerators();
+    }
+
+  public:
+    struct AllThreading {}; // tag for overloads using for_each_enum<Threading>
+    // Tag for the overloads that use Util::for_each_enum<CPUMemoryModel>.
+    struct AllMemoryModel {};
+    // Ready-made tag objects to pass to the assignKernelFor* overloads.
+    constexpr static AllThreading all_threading{};
+    constexpr static AllMemoryModel all_memory_model{};
+    // Accessor for the single shared instance (function-local static).
+    static auto getInstance() -> DefaultKernelsForStateVector & {
+        static DefaultKernelsForStateVector instance;
+
+        return instance;
+    }
+
+    void
+    assignKernelForGate(Gates::GateOperation gate_op, Threading threading,
+                        CPUMemoryModel memory_model,
+                        const std::function<bool(size_t)> &num_qubits_criterion,
+                        Gates::KernelType kernel) {
+        // Only kernels listed in allowed_kernels for this model are accepted.
+        const auto &valid = allowed_kernels.at(memory_model);
+        if (std::find(valid.cbegin(), valid.cend(), kernel) == valid.cend()) {
+            throw std::invalid_argument("The given kernel is not allowed for "
+                                        "the given memory model.");
+        }
+        gate_kernel_map_[gate_op].emplace_back(
+            toDispatchKey(threading, memory_model), num_qubits_criterion,
+            kernel);
+    }
+
+    void
+    assignKernelForGate(Gates::GateOperation gate_op,
+                        [[maybe_unused]] AllThreading dummy,
+                        CPUMemoryModel memory_model,
+                        const std::function<bool(size_t)> &num_qubits_criterion,
+                        Gates::KernelType kernel) {
+        Util::for_each_enum<Threading>([=](Threading threading) {
+            assignKernelForGate(gate_op, threading, memory_model,
+                                num_qubits_criterion, kernel);
+        });
+    }
+
+    void
+    assignKernelForGate(Gates::GateOperation gate_op, Threading threading,
+                        [[maybe_unused]] AllMemoryModel dummy,
+                        const std::function<bool(size_t)> &num_qubits_criterion,
+                        Gates::KernelType kernel) {
+        Util::for_each_enum<CPUMemoryModel>([=](CPUMemoryModel memory_model) {
+            assignKernelForGate(gate_op, threading, memory_model,
+                                num_qubits_criterion, kernel);
+        });
+    }
+
+    void
+    assignKernelForGate(Gates::GateOperation gate_op,
+                        [[maybe_unused]] AllThreading dummy1,
+                        [[maybe_unused]] AllMemoryModel dummy2,
+                        const std::function<bool(size_t)> &num_qubits_criterion,
+                        Gates::KernelType kernel) {
+        Util::for_each_enum<Threading, CPUMemoryModel>(
+            [=](Threading threading, CPUMemoryModel memory_model) {
+                assignKernelForGate(gate_op, threading, memory_model,
+                                    num_qubits_criterion, kernel);
+            });
+    }
+
+    void assignKernelForGenerator(
+        Gates::GeneratorOperation gntr_op, Threading threading,
+        CPUMemoryModel memory_model,
+        const std::function<bool(size_t)> &num_qubits_criterion,
+        Gates::KernelType kernel) {
+        // Only kernels listed in allowed_kernels for this model are accepted.
+        const auto &valid = allowed_kernels.at(memory_model);
+        if (std::find(valid.cbegin(), valid.cend(), kernel) == valid.cend()) {
+            throw std::invalid_argument("The given kernel is not allowed for "
+                                        "the given memory model.");
+        }
+        generator_kernel_map_[gntr_op].emplace_back(
+            toDispatchKey(threading, memory_model), num_qubits_criterion,
+            kernel);
+    }
+
+    void assignKernelForGenerator(
+        Gates::GeneratorOperation gntr_op, [[maybe_unused]] AllThreading dummy,
+        CPUMemoryModel memory_model,
+        const std::function<bool(size_t)> &num_qubits_criterion,
+        Gates::KernelType kernel) {
+        Util::for_each_enum<Threading>([=](Threading threading) {
+            assignKernelForGenerator(gntr_op, threading, memory_model,
+                                     num_qubits_criterion, kernel);
+        });
+    }
+
+    void assignKernelForGenerator(
+        Gates::GeneratorOperation gntr_op, Threading threading,
+        [[maybe_unused]] AllMemoryModel dummy,
+        const std::function<bool(size_t)> &num_qubits_criterion,
+        Gates::KernelType kernel) {
+        Util::for_each_enum<CPUMemoryModel>([=](CPUMemoryModel memory_model) {
+            assignKernelForGenerator(gntr_op, threading, memory_model,
+                                     num_qubits_criterion, kernel);
+        });
+    }
+
+    void assignKernelForGenerator(
+        Gates::GeneratorOperation gntr_op, [[maybe_unused]] AllThreading dummy1,
+        [[maybe_unused]] AllMemoryModel dummy2,
+        const std::function<bool(size_t)> &num_qubits_criterion,
+        Gates::KernelType kernel) {
+        Util::for_each_enum<Threading, CPUMemoryModel>(
+            [=](Threading threading, CPUMemoryModel memory_model) {
+                assignKernelForGenerator(gntr_op, threading, memory_model,
+                                         num_qubits_criterion, kernel);
+            });
+    }
+
+    /**
+     * @brief Select one kernel per generator for the given configuration.
+     * @param num_qubits Number of qubits
+     * @param threading Threading context
+     * @param memory_model Memory model of the underlying data
+     * @return Map from each generator to the first registered kernel whose
+     * dispatch key and qubit-count criterion both match.
+     */
+    auto getGeneratorKernelMap(size_t num_qubits, Threading threading,
+                               CPUMemoryModel memory_model) const
+        -> std::unordered_map<Gates::GeneratorOperation, Gates::KernelType> {
+        using Gates::GeneratorOperation;
+        const uint32_t wanted_key = toDispatchKey(threading, memory_model);
+        const auto matches = [wanted_key, num_qubits](const auto &entry) {
+            return (std::get<0>(entry) == wanted_key) &&
+                   std::get<1>(entry)(num_qubits);
+        };
+
+        std::unordered_map<GeneratorOperation, Gates::KernelType> selected;
+        auto op = GeneratorOperation::BEGIN;
+        while (op != GeneratorOperation::END) {
+            // .at() throws std::out_of_range if nothing was registered.
+            const auto &candidates = generator_kernel_map_.at(op);
+            const auto hit = std::find_if(candidates.cbegin(),
+                                          candidates.cend(), matches);
+            if (hit == candidates.cend()) {
+                throw std::range_error("Cannot find registered kernel for a "
+                                       "dispatch key and number of qubits.");
+            }
+            selected.emplace(op, std::get<2>(*hit));
+            const auto next = static_cast<uint32_t>(op) + 1;
+            op = static_cast<GeneratorOperation>(next);
+        }
+        return selected;
+    }
+
+    /** @brief Select one kernel per gate (GateOperation::Matrix is skipped);
+     * the first registered entry matching the configuration wins. */
+    auto getGateKernelMap(size_t num_qubits, Threading threading,
+                          CPUMemoryModel memory_model) const
+        -> std::unordered_map<Gates::GateOperation, Gates::KernelType> {
+        using Gates::GateOperation;
+        const uint32_t wanted_key = toDispatchKey(threading, memory_model);
+        const auto matches = [wanted_key, num_qubits](const auto &entry) {
+            return (std::get<0>(entry) == wanted_key) &&
+                   std::get<1>(entry)(num_qubits);
+        };
+        const auto successor = [](GateOperation op) {
+            return static_cast<GateOperation>(static_cast<uint32_t>(op) + 1);
+        };
+        std::unordered_map<GateOperation, Gates::KernelType> selected;
+        for (auto op = GateOperation::BEGIN; op != GateOperation::END;
+             op = successor(op)) {
+            if (op == GateOperation::Matrix) {
+                continue; // no default kernel is looked up for Matrix
+            }
+            const auto &candidates = gate_kernel_map_.at(op);
+            const auto hit = std::find_if(candidates.cbegin(),
+                                          candidates.cend(), matches);
+            if (hit == candidates.cend()) {
+                throw std::range_error("Cannot find registered kernel for a "
+                                       "dispatch key and number of qubits.");
+            }
+            selected.emplace(op, std::get<2>(*hit));
+        }
+        return selected;
+    }
+};
+} // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/DispatchKeys.hpp b/pennylane_lightning/src/simulator/DispatchKeys.hpp
new file mode 100644
index 0000000000..08265c9c59
--- /dev/null
+++ b/pennylane_lightning/src/simulator/DispatchKeys.hpp
@@ -0,0 +1,87 @@
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/**
+ * @file
+ * Define keys to select kernels
+ */
+#pragma once
+
+#include "Macros.hpp"
+
+#include <cstdint>
+
+#ifdef PL_USE_OMP
+#include <omp.h>
+#endif
+
+namespace Pennylane {
+enum class Threading : uint8_t {
+    SingleThread,
+    MultiThread,
+    END, // sentinel declared after the last real enumerator
+    BEGIN = SingleThread, // alias for the first enumerator
+};
+
+enum class CPUMemoryModel : uint8_t {
+    Unaligned,
+    Aligned256, // returned by getMemoryModel for 32-byte-aligned pointers
+    Aligned512, // returned by getMemoryModel for 64-byte-aligned pointers
+    END, // sentinel declared after the last real enumerator
+    BEGIN = Unaligned, // alias for the first enumerator
+};
+
+constexpr uint32_t toDispatchKey(Threading threading,
+                                 CPUMemoryModel memory_model) {
+    /* Threading is shifted above the low byte holding the memory model */
+    const uint32_t threading_bits = static_cast<uint32_t>(threading) << 8U;
+    return threading_bits | static_cast<uint32_t>(memory_model);
+}
+
+inline auto getMemoryModel(const void *ptr) -> CPUMemoryModel {
+    // Classify the pointer by its address alignment, strictest test first.
+    const auto address = reinterpret_cast<uintptr_t>(ptr);
+    if ((address % 64) == 0) { // multiple of 64 bytes (512 bits)
+        return CPUMemoryModel::Aligned512;
+    }
+    if ((address % 32) == 0) { // multiple of 32 bytes (256 bits)
+        return CPUMemoryModel::Aligned256;
+    }
+    return CPUMemoryModel::Unaligned;
+}
+
+/**
+ * @brief Choose the best threading based on the current context.
+ */
+inline auto bestThreading() -> Threading {
+#ifdef PL_USE_OMP
+    // Already inside of an OpenMP parallel region (e.g. inside adjoint
+    // diff) => SingleThread; otherwise MultiThread.
+    return (omp_in_parallel() != 0) ? Threading::SingleThread
+                                    : Threading::MultiThread;
+#else
+    // PL_USE_OMP is not defined, so <omp.h> is not included in this build.
+    return Threading::SingleThread;
+#endif
+}
+
+constexpr inline auto bestCPUMemoryModel() -> CPUMemoryModel {
+    if constexpr (use_avx512f) { // widest supported alignment is tried first
+        return CPUMemoryModel::Aligned512;
+    } else if constexpr (use_avx2) { // assumes use_avx2 is constexpr as well
+        return CPUMemoryModel::Aligned256;
+    }
+    return CPUMemoryModel::Unaligned;
+}
+
+} // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.cpp b/pennylane_lightning/src/simulator/DynamicDispatcher.cpp
index 315b7a102e..22187d4fcf 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.cpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.cpp
@@ -143,9 +143,7 @@ void registerAllImplementedGateOps() {
     auto registerGateToDispatcher = [&dispatcher](
                                         const auto &gate_op_func_pair) {
         const auto &[gate_op, func] = gate_op_func_pair;
-        std::string op_name =
-            std::string(lookup(Gates::Constant::gate_names, gate_op));
-        dispatcher.registerGateOperation(op_name, GateImplementation::kernel_id,
+        dispatcher.registerGateOperation(gate_op, GateImplementation::kernel_id,
                                          func);
         return gate_op;
     };
@@ -169,10 +167,8 @@ void registerAllImplementedGeneratorOps() {
     auto registerGeneratorToDispatcher =
         [&dispatcher](const auto &gntr_op_func_pair) {
             const auto &[gntr_op, func] = gntr_op_func_pair;
-            std::string op_name =
-                std::string(lookup(Gates::Constant::generator_names, gntr_op));
             dispatcher.registerGeneratorOperation(
-                op_name, GateImplementation::kernel_id, func);
+                gntr_op, GateImplementation::kernel_id, func);
             return gntr_op;
         };
 
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index 83536f9076..284e221002 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -36,10 +36,9 @@
 /// @cond DEV
 namespace Pennylane::Internal {
 struct PairHash {
-    size_t
-    operator()(const std::pair<std::string, Gates::KernelType> &p) const {
-        return std::hash<std::string>()(p.first) ^
-               std::hash<int>()(static_cast<int>(p.second));
+    template <typename T, typename U>
+    size_t operator()(const std::pair<T, U> &p) const {
+        return std::hash<T>()(p.first) ^ std::hash<U>()(p.second);
     }
 };
 /**
@@ -60,12 +59,12 @@ namespace Pennylane {
 template <class PrecisionT, class ParamT> struct registerBeforeMain;
 
 template <> struct registerBeforeMain<float, float> {
-    static inline int dummy =
+    static inline const int dummy =
         Internal::registerAllAvailableKernels<float, float>();
 };
 
 template <> struct registerBeforeMain<double, double> {
-    static inline int dummy =
+    static inline const int dummy =
         Internal::registerAllAvailableKernels<double, double>();
 };
 
@@ -89,28 +88,19 @@ template <typename PrecisionT> class DynamicDispatcher {
                                          bool /*adjoint*/);
 
   private:
-    std::unordered_map<std::string, size_t> gate_wires_;
+    std::unordered_map<std::string, Gates::GateOperation> str_to_gates_;
+    std::unordered_map<std::string, Gates::GeneratorOperation> str_to_gntrs_;
 
-    std::unordered_map<std::string, Gates::KernelType> gate_kernel_map_;
-    std::unordered_map<std::string, Gates::KernelType> generator_kernel_map_;
-
-    std::unordered_map<std::pair<std::string, Gates::KernelType>, GateFunc,
-                       Internal::PairHash>
+    std::unordered_map<std::pair<Gates::GateOperation, Gates::KernelType>,
+                       GateFunc, Internal::PairHash>
         gates_;
 
-    std::unordered_map<std::pair<std::string, Gates::KernelType>, GeneratorFunc,
-                       Internal::PairHash>
+    std::unordered_map<std::pair<Gates::GeneratorOperation, Gates::KernelType>,
+                       GeneratorFunc, Internal::PairHash>
         generators_;
 
-    std::string removeGeneratorPrefix(const std::string &op_name) {
-        constexpr std::string_view prefix = "Generator";
-        // TODO: change to string::starts_with in C++20
-        if (op_name.rfind(prefix) != 0) {
-            return op_name;
-        }
-        return op_name.substr(prefix.size());
-    }
-    std::string_view removeGeneratorPrefix(std::string_view op_name) {
+    constexpr static auto removeGeneratorPrefix(std::string_view op_name)
+        -> std::string_view {
         constexpr std::string_view prefix = "Generator";
         // TODO: change to string::starts_with in C++20
         if (op_name.rfind(prefix) != 0) {
@@ -121,38 +111,13 @@ template <typename PrecisionT> class DynamicDispatcher {
 
     DynamicDispatcher() {
         using Gates::KernelType;
-        for (const auto &[gate_op, n_wires] : Gates::Constant::gate_wires) {
-            gate_wires_.emplace(
-                Util::lookup(Gates::Constant::gate_names, gate_op), n_wires);
-        }
 
         for (const auto &[gate_op, gate_name] : Gates::Constant::gate_names) {
-            KernelType kernel = Util::lookup(
-                Gates::Constant::default_kernel_for_gates, gate_op);
-            const auto implemented_gates = implementedGatesForKernel(kernel);
-            if (std::find(std::cbegin(implemented_gates),
-                          std::cend(implemented_gates),
-                          gate_op) == std::cend(implemented_gates)) {
-                PL_ABORT("Default kernel for " + std::string(gate_name) +
-                         " does not implement the gate.");
-            }
-            gate_kernel_map_.emplace(gate_name, kernel);
+            str_to_gates_.emplace(gate_name, gate_op);
         }
-
         for (const auto &[gntr_op, gntr_name] :
              Gates::Constant::generator_names) {
-            KernelType kernel = Util::lookup(
-                Gates::Constant::default_kernel_for_generators, gntr_op);
-            const auto implemented_generators =
-                implementedGeneratorsForKernel(kernel);
-            if (std::find(std::cbegin(implemented_generators),
-                          std::cend(implemented_generators),
-                          gntr_op) == std::cend(implemented_generators)) {
-                PL_ABORT("Default kernel for " + std::string(gntr_name) +
-                         " does not implement the generator.");
-            }
-            generator_kernel_map_.emplace(removeGeneratorPrefix(gntr_name),
-                                          kernel);
+            str_to_gntrs_.emplace(removeGeneratorPrefix(gntr_name), gntr_op);
         }
     }
 
@@ -162,15 +127,24 @@ template <typename PrecisionT> class DynamicDispatcher {
         return singleton;
     }
 
+    auto strToGateOp(const std::string &gate_name) const
+        -> Gates::GateOperation {
+        return str_to_gates_.at(gate_name);
+    }
+    auto strToGeneratorOp(const std::string &gntr_name) const
+        -> Gates::GeneratorOperation {
+        return str_to_gntrs_.at(gntr_name);
+    }
+
     /**
      * @brief Register a new gate operation for the operation. Can pass a custom
      * kernel
      */
     template <typename FunctionType>
-    void registerGateOperation(const std::string &op_name,
+    void registerGateOperation(Gates::GateOperation gate_op,
                                Gates::KernelType kernel, FunctionType &&func) {
         // TODO: Add mutex when we go to multithreading
-        gates_.emplace(std::make_pair(op_name, kernel),
+        gates_.emplace(std::make_pair(gate_op, kernel),
                        std::forward<FunctionType>(func));
     }
 
@@ -179,13 +153,12 @@ template <typename PrecisionT> class DynamicDispatcher {
      * kernel
      */
     template <typename FunctionType>
-    void registerGeneratorOperation(const std::string &op_name,
+    void registerGeneratorOperation(Gates::GeneratorOperation gntr_op,
                                     Gates::KernelType kernel,
                                     FunctionType &&func) {
         // TODO: Add mutex when we go to multithreading
-        generators_.emplace(
-            std::make_pair(removeGeneratorPrefix(op_name), kernel),
-            std::forward<FunctionType>(func));
+        generators_.emplace(std::make_pair(gntr_op, kernel),
+                            std::forward<FunctionType>(func));
     }
 
     /**
@@ -203,26 +176,19 @@ template <typename PrecisionT> class DynamicDispatcher {
                         size_t num_qubits, const std::string &op_name,
                         const std::vector<size_t> &wires, bool inverse,
                         const std::vector<PrecisionT> &params = {}) const {
-        const auto iter = gates_.find(std::make_pair(op_name, kernel));
+        const auto iter =
+            gates_.find(std::make_pair(strToGateOp(op_name), kernel));
         if (iter == gates_.cend()) {
             throw std::invalid_argument(
                 "Cannot find a gate with a given name \"" + op_name + "\".");
         }
-        const auto gate_wire_iter = gate_wires_.find(op_name);
-        if ((gate_wire_iter != gate_wires_.end()) &&
-            (gate_wire_iter->second != wires.size())) {
-            throw std::invalid_argument(
-                std::string("The supplied gate requires ") +
-                std::to_string(gate_wire_iter->second) + " wires, but " +
-                std::to_string(wires.size()) + " were supplied.");
-            // TODO: change to std::format in C++20
-        }
         (iter->second)(data, num_qubits, wires, inverse, params);
     }
 
     /**
-     * @brief Apply a single gate to the state-vector using a registered kernel
+     * @brief Apply a single gate to the state-vector using the given kernel.
      *
+     * @param kernel Kernel to run the gate operation.
      * @param data Pointer to data.
      * @param num_qubits Number of qubits.
      * @param op_name Gate operation name.
@@ -230,17 +196,19 @@ template <typename PrecisionT> class DynamicDispatcher {
      * @param inverse Indicates whether to use inverse of gate.
      * @param params Optional parameter list for parametric gates.
      */
-    inline void
-    applyOperation(CFP_t *data, size_t num_qubits, const std::string &op_name,
-                   const std::vector<size_t> &wires, bool inverse,
-                   const std::vector<PrecisionT> &params = {}) const {
-        const auto kernel_iter = gate_kernel_map_.find(op_name);
-        if (kernel_iter == gate_kernel_map_.end()) {
-            PL_ABORT("Kernel for gate " + op_name + " is not registered.");
+    void applyOperation(Gates::KernelType kernel, CFP_t *data,
+                        size_t num_qubits, Gates::GateOperation gate_op,
+                        const std::vector<size_t> &wires, bool inverse,
+                        const std::vector<PrecisionT> &params = {}) const {
+        const auto iter = gates_.find(std::make_pair(gate_op, kernel));
+        if (iter == gates_.cend()) {
+            throw std::invalid_argument(
+                std::string("Cannot find a gate with a given name \"") +
+                std::string(
+                    Util::lookup(Gates::Constant::gate_names, gate_op)) +
+                "\".");
         }
-
-        applyOperation(kernel_iter->second, data, num_qubits, op_name, wires,
-                       inverse, params);
+        (iter->second)(data, num_qubits, wires, inverse, params);
     }
 
     /**
@@ -312,34 +280,13 @@ template <typename PrecisionT> class DynamicDispatcher {
                         size_t num_qubits, const std::string &op_name,
                         const std::vector<size_t> &wires, bool adj) const
         -> PrecisionT {
-        const auto iter = generators_.find(std::make_pair(op_name, kernel));
+        const auto iter =
+            generators_.find(std::make_pair(strToGeneratorOp(op_name), kernel));
         if (iter == generators_.cend()) {
             throw std::invalid_argument(
                 "Cannot find a gate with a given name \"" + op_name + "\".");
         }
         return (iter->second)(data, num_qubits, wires, adj);
     }
-
-    /**
-     * @brief Apply a single gate to the state-vector using a registered kernel
-     *
-     * @param data Pointer to data.
-     * @param num_qubits Number of qubits.
-     * @param op_name Gate operation name.
-     * @param wires Wires to apply gate to.
-     * @param adj Indicates whether to use adjoint of gate.
-     */
-    inline auto applyGenerator(CFP_t *data, size_t num_qubits,
-                               const std::string &op_name,
-                               const std::vector<size_t> &wires, bool adj) const
-        -> PrecisionT {
-        const auto kernel_iter = generator_kernel_map_.find(op_name);
-        if (kernel_iter == generator_kernel_map_.end()) {
-            PL_ABORT("Kernel for gate " + op_name + " is not registered.");
-        }
-
-        return applyGenerator(kernel_iter->second, data, num_qubits, op_name,
-                              wires, adj);
-    }
 };
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/Measures.hpp b/pennylane_lightning/src/simulator/Measures.hpp
index 26208b6ba1..f2f1cc010a 100644
--- a/pennylane_lightning/src/simulator/Measures.hpp
+++ b/pennylane_lightning/src/simulator/Measures.hpp
@@ -26,7 +26,7 @@
 #include <vector>
 
 #include "LinearAlgebra.hpp"
-#include "StateVectorManaged.hpp"
+#include "StateVectorCPU.hpp"
 #include "StateVectorRaw.hpp"
 
 namespace Pennylane {
@@ -123,7 +123,7 @@ class Measures {
                 const std::vector<size_t> &wires) {
         // Copying the original state vector, for the application of the
         // observable operator.
-        StateVectorManaged<fp_t> operator_statevector(original_statevector);
+        StateVectorCPU<fp_t> operator_statevector(original_statevector);
 
         operator_statevector.applyMatrix(matrix, wires);
 
@@ -143,7 +143,7 @@ class Measures {
                 const std::vector<size_t> &wires) {
         // Copying the original state vector, for the application of the
         // observable operator.
-        StateVectorManaged<fp_t> operator_statevector(original_statevector);
+        StateVectorCPU<fp_t> operator_statevector(original_statevector);
 
         operator_statevector.applyOperation(operation, wires);
 
@@ -190,7 +190,7 @@ class Measures {
     fp_t var(const std::string &operation, const std::vector<size_t> &wires) {
         // Copying the original state vector, for the application of the
         // observable operator.
-        StateVectorManaged<fp_t> operator_statevector(original_statevector);
+        StateVectorCPU<fp_t> operator_statevector(original_statevector);
 
         operator_statevector.applyOperation(operation, wires);
 
@@ -216,7 +216,7 @@ class Measures {
              const std::vector<size_t> &wires) {
         // Copying the original state vector, for the application of the
         // observable operator.
-        StateVectorManaged<fp_t> operator_statevector(original_statevector);
+        StateVectorCPU<fp_t> operator_statevector(original_statevector);
 
         operator_statevector.applyMatrix(matrix, wires);
 
diff --git a/pennylane_lightning/src/simulator/StateVectorBase.hpp b/pennylane_lightning/src/simulator/StateVectorBase.hpp
index dec223408c..e2b3ac32e8 100644
--- a/pennylane_lightning/src/simulator/StateVectorBase.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorBase.hpp
@@ -141,6 +141,18 @@ template <class PrecisionT, class Derived> class StateVectorBase {
         return static_cast<const Derived *>(this)->getData();
     }
 
+    [[nodiscard]] inline auto
+    getKernelForGate(Gates::GateOperation gate_op) const -> Gates::KernelType {
+        return static_cast<const Derived *>(this)->getKernelForGate(gate_op);
+    }
+
+    [[nodiscard]] inline auto
+    getKernelForGenerator(Gates::GeneratorOperation gntr_op) const
+        -> Gates::KernelType {
+        return static_cast<const Derived *>(this)->getKernelForGenerator(
+            gntr_op);
+    }
+
     /**
      * @brief Compare two statevectors.
      *
@@ -192,8 +204,10 @@ template <class PrecisionT, class Derived> class StateVectorBase {
                         const std::vector<size_t> &wires, bool inverse = false,
                         const std::vector<PrecisionT> &params = {}) {
         auto *arr = getData();
-        DynamicDispatcher<PrecisionT>::getInstance().applyOperation(
-            arr, num_qubits_, opName, wires, inverse, params);
+        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
+        const auto gate_op = dispatcher.strToGateOp(opName);
+        dispatcher.applyOperation(getKernelForGate(gate_op), arr, num_qubits_,
+                                  gate_op, wires, inverse, params);
     }
 
     /**
@@ -208,9 +222,15 @@ template <class PrecisionT, class Derived> class StateVectorBase {
                          const std::vector<std::vector<size_t>> &wires,
                          const std::vector<bool> &inverse,
                          const std::vector<std::vector<PrecisionT>> &params) {
-        auto *arr = getData();
-        DynamicDispatcher<PrecisionT>::getInstance().applyOperations(
-            arr, num_qubits_, ops, wires, inverse, params);
+        if (ops.size() != wires.size() || ops.size() != inverse.size() ||
+            ops.size() != params.size()) { // all lists are indexed together
+            throw std::invalid_argument(
+                "Invalid arguments: number of operations, wires, and "
+                "parameters must all be equal");
+        }
+        for (size_t i = 0; i < ops.size(); i++) {
+            applyOperation(ops[i], wires[i], inverse[i], params[i]);
+        }
     }
 
     /**
@@ -223,9 +243,15 @@ template <class PrecisionT, class Derived> class StateVectorBase {
     void applyOperations(const std::vector<std::string> &ops,
                          const std::vector<std::vector<size_t>> &wires,
                          const std::vector<bool> &inverse) {
-        auto *arr = getData();
-        DynamicDispatcher<PrecisionT>::getInstance().applyOperations(
-            arr, num_qubits_, ops, wires, inverse);
+        const size_t numOperations = ops.size();
+        if (numOperations != wires.size() || numOperations != inverse.size()) {
+            throw std::invalid_argument(
+                "Invalid arguments: number of operations, wires, and "
+                "parameters must all be equal");
+        }
+        for (size_t i = 0; i < numOperations; i++) { // inverse[i] is now safe
+            applyOperation(ops[i], wires[i], inverse[i], {});
+        }
     }
 
     /**
@@ -256,8 +282,10 @@ template <class PrecisionT, class Derived> class StateVectorBase {
                                       const std::vector<size_t> &wires,
                                       bool adj = false) -> PrecisionT {
         auto *arr = getData();
-        return DynamicDispatcher<PrecisionT>::getInstance().applyGenerator(
-            arr, num_qubits_, opName, wires, adj);
+        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
+        return dispatcher.applyGenerator(
+            getKernelForGenerator(dispatcher.strToGeneratorOp(opName)), arr,
+            num_qubits_, opName, wires, adj);
     }
 
     /**
diff --git a/pennylane_lightning/src/simulator/StateVectorCPU.hpp b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
new file mode 100644
index 0000000000..a003adf1ba
--- /dev/null
+++ b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
@@ -0,0 +1,266 @@
+// Copyright 2021 Xanadu Quantum Technologies Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "BitUtil.hpp"
+#include "DefaultKernelsForStateVector.hpp"
+#include "DispatchKeys.hpp"
+#include "Gates.hpp"
+#include "KernelType.hpp"
+#include "Memory.hpp"
+#include "StateVectorBase.hpp"
+#include "Util.hpp"
+
+#include <algorithm>
+#include <complex>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+namespace Pennylane {
+
+/**
+ * @brief StateVector class where data resides in CPU memory. Memory ownership
+ * resides within class.
+ *
+ * We currently use std::unique_ptr to C-style array as we want to choose
+ * allocator in runtime. This is impossible with std::vector.
+ *
+ * The buffer comes from the aligned form of operator new[] and is released
+ * by a custom deleter calling the matching aligned operator delete[].
+ *
+ * @tparam PrecisionT Floating point type of the real and imaginary parts.
+ */
+template <class PrecisionT = double>
+class StateVectorCPU
+    : public StateVectorBase<PrecisionT, StateVectorCPU<PrecisionT>> {
+  public:
+    using ComplexPrecisionT = std::complex<PrecisionT>;
+
+  private:
+    using BaseType = StateVectorBase<PrecisionT, StateVectorCPU>;
+
+    /// Alignment in bytes requested for the state-vector buffer.
+    static constexpr std::align_val_t data_alignment_{64};
+
+    /**
+     * @brief Deleter that frees a buffer created by allocateData().
+     */
+    struct AlignedArrayDeleter {
+        void operator()(ComplexPrecisionT *ptr) const noexcept {
+            // Elements are trivially destructible (checked in allocateData),
+            // so only the storage is released. The aligned overload is
+            // required to match the aligned allocation.
+            ::operator delete[](ptr, data_alignment_);
+        }
+    };
+    // NOLINTNEXTLINE(modernize-avoid-c-arrays)
+    using DataPtr = std::unique_ptr<ComplexPrecisionT[], AlignedArrayDeleter>;
+
+    Threading threading_;
+    CPUMemoryModel memory_model_;
+
+    std::unordered_map<Gates::GateOperation, Gates::KernelType>
+        kernel_for_gates_;
+    std::unordered_map<Gates::GeneratorOperation, Gates::KernelType>
+        kernel_for_generators_;
+    DataPtr data_;
+
+    /**
+     * @brief Allocate an aligned buffer of default-constructed elements.
+     *
+     * @param length Number of elements to allocate.
+     * @return Owning pointer whose deleter matches the allocation.
+     */
+    static auto allocateData(size_t length) -> DataPtr {
+        static_assert(std::is_trivially_destructible_v<ComplexPrecisionT>,
+                      "AlignedArrayDeleter does not run destructors.");
+        DataPtr buffer{static_cast<ComplexPrecisionT *>(::operator new[](
+            length * sizeof(ComplexPrecisionT), data_alignment_))};
+        std::uninitialized_default_construct_n(buffer.get(), length);
+        return buffer;
+    }
+
+    /**
+     * @brief Select the kernels used for each gate and generator.
+     *
+     * The maps are obtained from DefaultKernelsForStateVector.
+     *
+     * @param num_qubits Number of qubits of the state-vector.
+     * @param threading Threading option passed to the kernel lookup.
+     * @param memory_model Memory model passed to the kernel lookup.
+     */
+    void setKernels(size_t num_qubits, Threading threading,
+                    CPUMemoryModel memory_model) {
+        auto &default_kernels = DefaultKernelsForStateVector::getInstance();
+        kernel_for_gates_ = default_kernels.getGateKernelMap(
+            num_qubits, threading, memory_model);
+        kernel_for_generators_ = default_kernels.getGeneratorKernelMap(
+            num_qubits, threading, memory_model);
+    }
+
+  public:
+    /**
+     * @brief Create a state-vector whose first element is 1 and all other
+     * elements are 0.
+     *
+     * @param num_qubits Number of qubits.
+     * @param threading Threading option used to select kernels.
+     * @param memory_model Memory model used to select kernels.
+     */
+    explicit StateVectorCPU(size_t num_qubits,
+                            Threading threading = bestThreading(),
+                            CPUMemoryModel memory_model = bestCPUMemoryModel())
+        : BaseType(num_qubits), threading_{threading}, memory_model_{
+                                                           memory_model} {
+
+        setKernels(num_qubits, threading, memory_model);
+
+        size_t length = BaseType::getLength();
+        data_ = allocateData(length);
+        std::fill(data_.get(), data_.get() + length,
+                  ComplexPrecisionT{0.0, 0.0});
+        data_[0] = {1, 0};
+    }
+
+    /**
+     * @brief Create a state-vector by copying the data of another one.
+     *
+     * @tparam OtherDerived Concrete type of the other state-vector.
+     * @param other State-vector to copy the data from.
+     * @param threading Threading option used to select kernels.
+     * @param memory_model Memory model used to select kernels.
+     */
+    template <class OtherDerived>
+    explicit StateVectorCPU(
+        const StateVectorBase<PrecisionT, OtherDerived> &other,
+        Threading threading = bestThreading(),
+        CPUMemoryModel memory_model = bestCPUMemoryModel())
+        : BaseType(other.getNumQubits()), threading_{threading},
+          memory_model_{memory_model} {
+
+        size_t length = BaseType::getLength();
+        data_ = allocateData(length);
+
+        std::copy(other.getData(), other.getData() + length, data_.get());
+
+        setKernels(BaseType::getNumQubits(), threading, memory_model);
+    }
+
+    /**
+     * @brief Create a state-vector by copying a raw array.
+     *
+     * @param other_data Pointer to the data to copy.
+     * @param other_size Number of elements; must be a power of 2.
+     * @param threading Threading option used to select kernels.
+     * @param memory_model Memory model used to select kernels.
+     */
+    StateVectorCPU(const ComplexPrecisionT *other_data, size_t other_size,
+                   Threading threading = bestThreading(),
+                   CPUMemoryModel memory_model = bestCPUMemoryModel())
+        : BaseType(Util::log2PerfectPower(other_size)), threading_{threading},
+          memory_model_{memory_model} {
+        PL_ABORT_IF_NOT(Util::isPerfectPowerOf2(other_size),
+                        "The size of provided data must be a power of 2.");
+        data_ = allocateData(other_size);
+        setKernels(BaseType::getNumQubits(), threading, memory_model);
+
+        updateData(other_data);
+    }
+
+    /**
+     * @brief Create a state-vector by copying the elements of a std::vector.
+     *
+     * @tparam Alloc Allocator type of the vector.
+     * @param rhs Vector to copy; its size must be a power of 2.
+     * @param threading Threading option used to select kernels.
+     * @param memory_model Memory model used to select kernels.
+     */
+    template <class Alloc>
+    explicit StateVectorCPU(
+        const std::vector<std::complex<PrecisionT>, Alloc> &rhs,
+        Threading threading = bestThreading(),
+        CPUMemoryModel memory_model = bestCPUMemoryModel())
+        : StateVectorCPU(rhs.data(), rhs.size(), threading,
+                         memory_model) // NOLINT(hicpp-member-init)
+                                       // this is false positive for delegating
+                                       // constructor from clang-tidy
+    {}
+
+    /**
+     * @brief Copy constructor. Copies the data and the threading and memory
+     * model options of rhs.
+     */
+    StateVectorCPU(const StateVectorCPU &rhs)
+        : BaseType(rhs.getNumQubits()), threading_{rhs.threading_},
+          memory_model_{rhs.memory_model_} {
+        setKernels(BaseType::getNumQubits(), threading_, memory_model_);
+
+        size_t length = BaseType::getLength();
+        data_ = allocateData(length);
+        std::copy(rhs.getData(), rhs.getData() + length, data_.get());
+    }
+
+    StateVectorCPU(StateVectorCPU &&) noexcept = default;
+
+    StateVectorCPU &operator=(const StateVectorCPU &) = delete;
+    StateVectorCPU &operator=(StateVectorCPU &&) noexcept = default;
+
+    ~StateVectorCPU() = default;
+
+    /**
+     * @brief Get a pointer to the underlying data.
+     */
+    [[nodiscard]] auto getData() -> ComplexPrecisionT * { return data_.get(); }
+
+    /**
+     * @brief Get a read-only pointer to the underlying data.
+     */
+    [[nodiscard]] auto getData() const -> const ComplexPrecisionT * {
+        return data_.get();
+    }
+
+    /**
+     * @brief Get the kernel selected for a gate operation.
+     *
+     * @param gate_op Gate operation to look up.
+     */
+    [[nodiscard]] inline auto
+    getKernelForGate(Gates::GateOperation gate_op) const -> Gates::KernelType {
+        return kernel_for_gates_.at(gate_op);
+    }
+
+    /**
+     * @brief Get the kernel selected for a generator operation.
+     *
+     * @param gntr_op Generator operation to look up.
+     */
+    [[nodiscard]] inline auto
+    getKernelForGenerator(Gates::GeneratorOperation gntr_op) const
+        -> Gates::KernelType {
+        return kernel_for_generators_.at(gntr_op);
+    }
+
+    /**
+     * @brief Overwrite the state-vector with the values pointed to by data.
+     *
+     * @param data Pointer to the new data. BaseType::getLength() elements
+     * are copied from it.
+     */
+    void updateData(const ComplexPrecisionT *data) {
+        std::copy(data, data + BaseType::getLength(), data_.get());
+    }
+};
+
+} // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/StateVectorManaged.hpp b/pennylane_lightning/src/simulator/StateVectorManaged.hpp
deleted file mode 100644
index a1317e75d5..0000000000
--- a/pennylane_lightning/src/simulator/StateVectorManaged.hpp
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright 2021 Xanadu Quantum Technologies Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//     http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include "BitUtil.hpp"
-#include "StateVectorBase.hpp"
-#include "Util.hpp"
-
-namespace Pennylane {
-
-/**
- * @brief Managed memory version of StateVector class. Memory ownership resides
- * within class.
- *
- * This class is only internally used in C++ code.
- *
- * @tparam PrecisionT
- */
-template <class PrecisionT = double>
-class StateVectorManaged
-    : public StateVectorBase<PrecisionT, StateVectorManaged<PrecisionT>> {
-  public:
-    using ComplexPrecisionT = std::complex<PrecisionT>;
-
-  private:
-    using BaseType = StateVectorBase<PrecisionT, StateVectorManaged>;
-
-    std::vector<ComplexPrecisionT> data_;
-
-  public:
-    StateVectorManaged() : StateVectorBase<PrecisionT, StateVectorManaged>() {}
-
-    explicit StateVectorManaged(size_t num_qubits)
-        : BaseType(num_qubits),
-          data_(static_cast<size_t>(Util::exp2(num_qubits)),
-                ComplexPrecisionT{0, 0}) {
-        data_[0] = {1, 0};
-    }
-
-    template <class OtherDerived>
-    explicit StateVectorManaged(
-        const StateVectorBase<PrecisionT, OtherDerived> &other)
-        : BaseType(other.getNumQubits()), data_{other.getData(),
-                                                other.getData() +
-                                                    other.getLength()} {}
-
-    explicit StateVectorManaged(
-        const std::vector<ComplexPrecisionT> &other_data)
-        : BaseType(Util::log2(other_data.size())), data_{other_data} {
-        PL_ABORT_IF_NOT(Util::isPerfectPowerOf2(other_data.size()),
-                        "The size of provided data must be a power of 2.");
-    }
-
-    StateVectorManaged(const ComplexPrecisionT *other_data, size_t other_size)
-        : BaseType(Util::log2(other_size)), data_{other_data,
-                                                  other_data + other_size} {
-        PL_ABORT_IF_NOT(Util::isPerfectPowerOf2(other_size),
-                        "The size of provided data must be a power of 2.");
-    }
-
-    StateVectorManaged(const StateVectorManaged<PrecisionT> &other) = default;
-    StateVectorManaged(StateVectorManaged<PrecisionT> &&other) noexcept =
-        default;
-
-    ~StateVectorManaged() = default;
-
-    auto operator=(const StateVectorManaged<PrecisionT> &other)
-        -> StateVectorManaged<PrecisionT> & = default;
-    auto operator=(StateVectorManaged<PrecisionT> &&other) noexcept
-        -> StateVectorManaged<PrecisionT> & = default;
-
-    auto getDataVector() -> std::vector<ComplexPrecisionT> & { return data_; }
-    [[nodiscard]] auto getDataVector() const
-        -> const std::vector<ComplexPrecisionT> & {
-        return data_;
-    }
-
-    [[nodiscard]] auto getData() -> ComplexPrecisionT * { return data_.data(); }
-
-    [[nodiscard]] auto getData() const -> const ComplexPrecisionT * {
-        return data_.data();
-    }
-
-    /**
-     * @brief Update data of the class to new_data
-     *
-     * @param new_data std::vector contains data.
-     */
-    void updateData(const std::vector<ComplexPrecisionT> &new_data) {
-        PL_ABORT_IF_NOT(data_.size() == new_data.size(),
-                        "New data must be the same size as old data.")
-        std::copy(new_data.begin(), new_data.end(), data_.begin());
-    }
-};
-
-} // namespace Pennylane
diff --git a/pennylane_lightning/src/tests/.clang-tidy b/pennylane_lightning/src/tests/.clang-tidy
index 3ed93f21bf..0a70c347b0 100644
--- a/pennylane_lightning/src/tests/.clang-tidy
+++ b/pennylane_lightning/src/tests/.clang-tidy
@@ -1,5 +1,5 @@
 ---
-Checks:          'clang-diagnostic-*,clang-analyzer-*,-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,-readability-magic-numbers,-modernize-avoid-c-arrays'
+Checks:          'clang-diagnostic-*,clang-analyzer-*,-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,-readability-magic-numbers,hicpp-*,-hicpp-no-array-decay,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-avoid-non-const-global-variables,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions'
 WarningsAsErrors: '*'
 HeaderFilterRegex: '.*'
 AnalyzeTemporaryDtors: false
@@ -216,7 +216,7 @@ CheckOptions:
   - key:             modernize-use-auto.RemoveStars
     value:           'false'
   - key:             readability-magic-numbers.IgnorePowersOf2IntegerValues
-    value:           'false'
+    value:           'true'
   - key:             portability-simd-intrinsics.Std
     value:           ''
   - key:             readability-redundant-member-init.IgnoreBaseInCopyConstructors
diff --git a/pennylane_lightning/src/tests/CMakeLists.txt b/pennylane_lightning/src/tests/CMakeLists.txt
index c507f938bf..fbe9b621ea 100644
--- a/pennylane_lightning/src/tests/CMakeLists.txt
+++ b/pennylane_lightning/src/tests/CMakeLists.txt
@@ -68,9 +68,12 @@ endif()
 add_executable(compile_time_tests compile_time_tests.cpp)
 target_link_libraries(compile_time_tests lightning_gates lightning_utils)
 
-set(TEST_SOURCES Test_AdjDiff.cpp
+set(TEST_SOURCES CreateAllWires.cpp
+                 Test_AdjDiff.cpp
 #                 Test_Bindings.cpp
                  Test_DynamicDispatcher.cpp
+                 Test_DefaultKernelsForStateVector.cpp
+                 Test_GateImplementations_CompareKernels.cpp
                  Test_GateImplementations_Generator.cpp
                  Test_GateImplementations_Inverse.cpp
                  Test_GateImplementations_Matrix.cpp
@@ -80,7 +83,7 @@ set(TEST_SOURCES Test_AdjDiff.cpp
                  Test_Internal.cpp
                  Test_Measures.cpp
                  Test_OpToMemberFuncPtr.cpp
-                 Test_StateVectorManaged.cpp
+                 Test_StateVectorCPU.cpp
                  Test_StateVectorRaw.cpp
                  Test_Util.cpp
                  Test_VectorJacobianProduct.cpp)
diff --git a/pennylane_lightning/src/tests/CreateAllWires.cpp b/pennylane_lightning/src/tests/CreateAllWires.cpp
new file mode 100644
index 0000000000..43a7e80ce4
--- /dev/null
+++ b/pennylane_lightning/src/tests/CreateAllWires.cpp
@@ -0,0 +1,31 @@
+#include "CreateAllWires.hpp"
+namespace Pennylane {
+auto crateAllWires(size_t n_qubits, Gates::GateOperation gate_op, bool order)
+    -> std::vector<std::vector<size_t>> {
+
+    if (Util::array_has_elt(Gates::Constant::multi_qubit_gates, gate_op)) {
+        // For gates listed in multi_qubit_gates, enumerate all 2^N - 1
+        // non-empty wire subsets; bit i of the mask k selects wire i.
+        const size_t n_subsets = (size_t{1U} << n_qubits) - 1;
+        std::vector<std::vector<size_t>> res;
+        res.reserve(n_subsets);
+        for (size_t k = 1; k <= n_subsets; k++) {
+            // Build the wire list in place to avoid copying it into res.
+            auto &wires = res.emplace_back();
+            wires.reserve(Util::popcount(k));
+
+            for (size_t i = 0; i < n_qubits; i++) {
+                if (((k >> i) & 1U) == 1U) {
+                    wires.emplace_back(i);
+                }
+            }
+        }
+        return res;
+    } // else
+    const size_t n_wires = Util::lookup(Gates::Constant::gate_wires, gate_op);
+    if (order) {
+        return PermutationGenerator(n_qubits, n_wires).all_perms();
+    } // else
+    return CombinationGenerator(n_qubits, n_wires).all_perms();
+}
+} // namespace Pennylane
diff --git a/pennylane_lightning/src/tests/CreateAllWires.hpp b/pennylane_lightning/src/tests/CreateAllWires.hpp
new file mode 100644
index 0000000000..54d3cd9e9a
--- /dev/null
+++ b/pennylane_lightning/src/tests/CreateAllWires.hpp
@@ -0,0 +1,150 @@
+#pragma once
+#include "BitUtil.hpp"
+#include "Constant.hpp"
+#include "ConstantUtil.hpp"
+#include "GateOperation.hpp"
+
+#include <cstdlib>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+namespace Pennylane {
+
+/**
+ * @brief Interface of helpers that enumerate lists of wire indices.
+ */
+class WiresGenerator {
+  public:
+    WiresGenerator() = default;
+    WiresGenerator(const WiresGenerator &) = default;
+    WiresGenerator(WiresGenerator &&) noexcept = default;
+    WiresGenerator &operator=(const WiresGenerator &) = default;
+    WiresGenerator &operator=(WiresGenerator &&) noexcept = default;
+    // Virtual so that a generator can be destroyed through this interface.
+    virtual ~WiresGenerator() = default;
+
+    /**
+     * @brief Get every wire list produced by the generator.
+     */
+    [[nodiscard]] virtual auto all_perms() const
+        -> const std::vector<std::vector<size_t>> & = 0;
+};
+
+/**
+ * @brief Enumerates all r-element combinations of the indices 0, ..., n-1.
+ *
+ * Each combination is stored in increasing order.
+ */
+class CombinationGenerator : public WiresGenerator {
+  private:
+    std::vector<size_t> v_;
+    std::vector<std::vector<size_t>> all_perms_;
+
+  public:
+    /**
+     * @brief Recursively fill the first r slots of the working buffer with
+     * indices smaller than n and record every completed combination.
+     *
+     * @param n Only indices smaller than n may still be chosen.
+     * @param r Number of slots that remain to be filled.
+     */
+    void comb(size_t n, size_t r) {
+        if (r == 0) {
+            all_perms_.push_back(v_);
+            return;
+        }
+        if (n < r) {
+            // Not enough indices left to fill the remaining slots.
+            return;
+        }
+
+        // Either index n - 1 goes to slot r - 1 ...
+        v_[r - 1] = n - 1;
+        comb(n - 1, r - 1);
+
+        // ... or index n - 1 is not used at all.
+        comb(n - 1, r);
+    }
+
+    /**
+     * @brief Generate all combinations of r indices out of n.
+     *
+     * @param n Number of available indices.
+     * @param r Number of indices per combination.
+     */
+    CombinationGenerator(size_t n, size_t r) {
+        v_.resize(r);
+        comb(n, r);
+    }
+
+    [[nodiscard]] auto all_perms() const
+        -> const std::vector<std::vector<size_t>> & override {
+        return all_perms_;
+    }
+};
+
+/**
+ * @brief Enumerates all ordered selections of r distinct indices out of
+ * 0, ..., n-1.
+ */
+class PermutationGenerator : public WiresGenerator {
+  private:
+    std::vector<std::vector<size_t>> all_perms_;
+    std::vector<size_t> available_elts_;
+    std::vector<size_t> v_;
+
+  public:
+    /**
+     * @brief Recursively fill the first r slots of the working buffer using
+     * the first n entries of the pool of available indices.
+     *
+     * @param n Number of pool entries that may still be chosen.
+     * @param r Number of slots that remain to be filled.
+     */
+    void perm(size_t n, size_t r) {
+        if (r == 0) {
+            all_perms_.push_back(v_);
+            return;
+        }
+        for (size_t i = 0; i < n; i++) {
+            v_[r - 1] = available_elts_[i];
+            // Move the chosen index out of the first n - 1 pool entries
+            // while recursing, then restore the pool.
+            std::swap(available_elts_[n - 1], available_elts_[i]);
+            perm(n - 1, r - 1);
+            std::swap(available_elts_[n - 1], available_elts_[i]);
+        }
+    }
+
+    /**
+     * @brief Generate all ordered selections of r indices out of n.
+     *
+     * @param n Number of available indices.
+     * @param r Number of indices per selection.
+     */
+    PermutationGenerator(size_t n, size_t r) {
+        v_.resize(r);
+
+        available_elts_.resize(n);
+        std::iota(available_elts_.begin(), available_elts_.end(), 0);
+        perm(n, r);
+    }
+
+    [[nodiscard]] auto all_perms() const
+        -> const std::vector<std::vector<size_t>> & override {
+        return all_perms_;
+    }
+};
+
+/**
+ * @brief Create all possible combination of wires
+ * for a given number of qubits and gate operation
+ *
+ * @param n_qubits Number of qubits
+ * @param gate_op Gate operation
+ * @param order Whether the ordering matters (if true, permutation is used)
+ */
+auto crateAllWires(size_t n_qubits, Gates::GateOperation gate_op, bool order)
+    -> std::vector<std::vector<size_t>>;
+} // namespace Pennylane
diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp
index 74faeeb5ce..723d03b10a 100644
--- a/pennylane_lightning/src/tests/TestHelpers.hpp
+++ b/pennylane_lightning/src/tests/TestHelpers.hpp
@@ -1,17 +1,20 @@
-#include <algorithm>
-#include <complex>
-#include <random>
-#include <string>
-#include <type_traits>
-#include <vector>
-
 #include "Constant.hpp"
 #include "ConstantUtil.hpp"
 #include "Error.hpp"
 #include "GateOperation.hpp"
 #include "LinearAlgebra.hpp"
+#include "Macros.hpp"
+#include "Memory.hpp"
+#include "TestKernels.hpp"
 #include "Util.hpp"
 
+#include <algorithm>
+#include <complex>
+#include <random>
+#include <string>
+#include <type_traits>
+#include <vector>
+
 #include <catch2/catch.hpp>
 
 namespace Pennylane {
@@ -94,6 +97,20 @@ bool operator!=(const std::vector<T, AllocA> &lhs,
     return !rhs.compare(lhs);
 }
 
+template <class T, class AllocA, class AllocB>
+bool operator==(const std::vector<T, AllocA> &lhs,
+                const std::vector<T, AllocB> &rhs) {
+    if (lhs.size() != rhs.size()) {
+        return false;
+    }
+    for (size_t idx = 0; idx < lhs.size(); idx++) {
+        if (lhs[idx] != rhs[idx]) {
+            return false;
+        }
+    }
+    return true;
+}
+
 /**
  * @brief Utility function to compare complex statevector data.
  *
@@ -132,6 +149,11 @@ isApproxEqual(const Data_t &data1, const Data_t &data2,
              data1.imag() != Approx(data2.imag()).epsilon(eps));
 }
 
+template <typename T>
+using TestVector = std::vector<
+    T,
+    PLAllocator<T, Util::common_alignment_v<remove_complex_t<T>, TestKernels>>>;
+
 /**
  * @brief Multiplies every value in a dataset by a given complex scalar value.
  *
@@ -140,8 +162,8 @@ isApproxEqual(const Data_t &data1, const Data_t &data2,
  * @param data Data to be scaled.
  * @param scalar Scalar value.
  */
-template <class Data_t>
-void scaleVector(std::vector<std::complex<Data_t>> &data,
+template <class Data_t, class Alloc>
+void scaleVector(std::vector<std::complex<Data_t>, Alloc> &data,
                  std::complex<Data_t> scalar) {
     std::transform(
         data.begin(), data.end(), data.begin(),
@@ -156,8 +178,9 @@ void scaleVector(std::vector<std::complex<Data_t>> &data,
  * @param data Data to be scaled.
  * @param scalar Scalar value.
  */
-template <class Data_t>
-void scaleVector(std::vector<std::complex<Data_t>> &data, Data_t scalar) {
+template <class Data_t, class Alloc>
+void scaleVector(std::vector<std::complex<Data_t>, Alloc> &data,
+                 Data_t scalar) {
     std::transform(
         data.begin(), data.end(), data.begin(),
         [scalar](const std::complex<Data_t> &c) { return c * scalar; });
@@ -168,8 +191,8 @@ void scaleVector(std::vector<std::complex<Data_t>> &data, Data_t scalar) {
  */
 template <typename PrecisionT>
 auto createZeroState(size_t num_qubits)
-    -> std::vector<std::complex<PrecisionT>> {
-    std::vector<std::complex<PrecisionT>> res(1U << num_qubits, {0.0, 0.0});
+    -> TestVector<std::complex<PrecisionT>> {
+    TestVector<std::complex<PrecisionT>> res(1U << num_qubits, {0.0, 0.0});
     res[0] = std::complex<PrecisionT>{1.0, 0.0};
     return res;
 }
@@ -179,8 +202,8 @@ auto createZeroState(size_t num_qubits)
  */
 template <typename PrecisionT>
 auto createPlusState(size_t num_qubits)
-    -> std::vector<std::complex<PrecisionT>> {
-    std::vector<std::complex<PrecisionT>> res(1U << num_qubits, {1.0, 0.0});
+    -> TestVector<std::complex<PrecisionT>> {
+    TestVector<std::complex<PrecisionT>> res(1U << num_qubits, {1.0, 0.0});
     for (auto &elt : res) {
         elt /= std::sqrt(1U << num_qubits);
     }
@@ -204,8 +227,8 @@ auto squaredNorm(const std::complex<PrecisionT> *data, size_t data_size)
  */
 template <typename PrecisionT, class RandomEngine>
 auto createRandomState(RandomEngine &re, size_t num_qubits)
-    -> std::vector<std::complex<PrecisionT>> {
-    std::vector<std::complex<PrecisionT>> res(1U << num_qubits, {0.0, 0.0});
+    -> TestVector<std::complex<PrecisionT>> {
+    TestVector<std::complex<PrecisionT>> res(1U << num_qubits, {0.0, 0.0});
     std::uniform_real_distribution<PrecisionT> dist;
     for (size_t idx = 0; idx < (1U << num_qubits); idx++) {
         res[idx] = {dist(re), dist(re)};
@@ -221,9 +244,11 @@ auto createRandomState(RandomEngine &re, size_t num_qubits)
  *
  * Example: createProductState("+01") will produce |+01> state.
  */
-template <typename PrecisionT> auto createProductState(std::string_view str) {
+template <typename PrecisionT>
+auto createProductState(std::string_view str)
+    -> TestVector<std::complex<PrecisionT>> {
     using Pennylane::Util::INVSQRT2;
-    std::vector<std::complex<PrecisionT>> st;
+    TestVector<std::complex<PrecisionT>> st;
     st.resize(1U << str.length());
 
     std::vector<PrecisionT> zero{1.0, 0.0};
@@ -261,11 +286,13 @@ template <typename PrecisionT> auto createProductState(std::string_view str) {
     return st;
 }
 
-inline auto createWires(Gates::GateOperation op) -> std::vector<size_t> {
+inline auto createWires(Gates::GateOperation op, size_t num_qubits)
+    -> std::vector<size_t> {
     if (Pennylane::Util::array_has_elt(Gates::Constant::multi_qubit_gates,
                                        op)) {
-        // if multi-qubit gates
-        return {0, 1, 2};
+        std::vector<size_t> wires(num_qubits);
+        std::iota(wires.begin(), wires.end(), 0);
+        return wires;
     }
     switch (Pennylane::Util::lookup(Gates::Constant::gate_wires, op)) {
     case 1:
@@ -301,10 +328,10 @@ auto createParams(Gates::GateOperation op) -> std::vector<PrecisionT> {
  */
 template <typename PrecisionT, class RandomEngine>
 auto randomUnitary(RandomEngine &re, size_t num_qubits)
-    -> std::vector<std::complex<PrecisionT>> {
+    -> TestVector<std::complex<PrecisionT>> {
     using ComplexPrecisionT = std::complex<PrecisionT>;
     const size_t dim = (1U << num_qubits);
-    std::vector<ComplexPrecisionT> res(dim * dim, ComplexPrecisionT{});
+    TestVector<ComplexPrecisionT> res(dim * dim, ComplexPrecisionT{});
 
     std::normal_distribution<PrecisionT> dist;
 
@@ -339,7 +366,7 @@ auto randomUnitary(RandomEngine &re, size_t num_qubits)
         ComplexPrecisionT *row_p = res.data() + row * dim;
         PrecisionT norm2 = std::sqrt(squaredNorm(row_p, dim));
 
-        // noramlize row2
+        // normalize row2
         std::transform(row_p, row_p + dim, row_p, [norm2](const auto c) {
             return (static_cast<PrecisionT>(1.0) / norm2) * c;
         });
diff --git a/pennylane_lightning/src/tests/TestKernels.hpp b/pennylane_lightning/src/tests/TestKernels.hpp
index e9b9cfa785..9e83198073 100644
--- a/pennylane_lightning/src/tests/TestKernels.hpp
+++ b/pennylane_lightning/src/tests/TestKernels.hpp
@@ -1,13 +1,15 @@
 #pragma once
 /**
- * @brief We define test kernels. Note that kernels not registered to
+ * @file
+ * We define test kernels. Note that kernels not registered to
  * AvailableKernels can be also tested by adding it to here.
  */
-#include "GateImplementationsLM.hpp"
-#include "GateImplementationsPI.hpp"
-
+#include "Macros.hpp"
 #include "TypeList.hpp"
 
+#include "cpu_kernels/GateImplementationsLM.hpp"
+#include "cpu_kernels/GateImplementationsPI.hpp"
+
 using TestKernels =
     Pennylane::Util::TypeList<Pennylane::Gates::GateImplementationsLM,
-                              Pennylane::Gates::GateImplementationsPI>;
+                              Pennylane::Gates::GateImplementationsPI, void>;
diff --git a/pennylane_lightning/src/tests/Test_AdjDiff.cpp b/pennylane_lightning/src/tests/Test_AdjDiff.cpp
index dfbac67214..6a05a36018 100644
--- a/pennylane_lightning/src/tests/Test_AdjDiff.cpp
+++ b/pennylane_lightning/src/tests/Test_AdjDiff.cpp
@@ -13,7 +13,7 @@
 #include <catch2/catch.hpp>
 
 #include "AdjointDiff.hpp"
-#include "StateVectorRaw.hpp"
+#include "StateVectorCPU.hpp"
 #include "Util.hpp"
 
 #include "TestHelpers.hpp"
@@ -50,10 +50,12 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RX, Obs=Z",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RX"}, {{p}}, {{0}}, {false});
 
+            /*
             std::vector<std::complex<double>> cdata(0b1 << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
+            */
 
-            StateVectorRaw<double> psi(cdata.data(), cdata.size());
+            StateVectorCPU<double> psi(num_qubits);
 
             std::vector<size_t> tp{0};
             std::vector<ObsDatum<double>> obs_ls{obs};
@@ -82,10 +84,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RY, Obs=X",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RY"}, {{p}}, {{0}}, {false});
 
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
-            cdata[0] = std::complex<double>{1, 0};
-
-            StateVectorRaw<double> psi(cdata.data(), cdata.size());
+            StateVectorCPU<double> psi(num_qubits);
 
             std::vector<size_t> tp{0};
             std::vector<ObsDatum<double>> obs_ls{obs};
@@ -109,9 +108,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RX, Obs=[Z,Z]",
         const size_t num_obs = 2;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
-        cdata[0] = std::complex<double>{1, 0};
+        StateVectorCPU<double> psi(num_qubits);
 
         auto obs1 = ObsDatum<double>({"PauliZ"}, {{}}, {{0}});
         auto obs2 = ObsDatum<double>({"PauliZ"}, {{}}, {{1}});
@@ -140,9 +137,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[Z,Z,Z]",
         const size_t num_obs = 3;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
-        cdata[0] = std::complex<double>{1, 0};
+        StateVectorCPU<double> psi(num_qubits);
 
         auto obs1 = ObsDatum<double>({"PauliZ"}, {{}}, {{0}});
         auto obs2 = ObsDatum<double>({"PauliZ"}, {{}}, {{1}});
@@ -179,9 +174,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[Z,Z,Z], "
         std::vector<double> jacobian(num_obs * num_params, 0);
         std::vector<size_t> t_params{0, 2};
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
-        cdata[0] = std::complex<double>{1, 0};
+        StateVectorCPU<double> psi(num_qubits);
 
         auto obs1 = ObsDatum<double>({"PauliZ"}, {{}}, {{0}});
         auto obs2 = ObsDatum<double>({"PauliZ"}, {{}}, {{1}});
@@ -214,9 +207,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[ZZZ]",
         const size_t num_obs = 1;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
-        cdata[0] = std::complex<double>{1, 0};
+        StateVectorCPU<double> psi(num_qubits);
 
         auto obs = ObsDatum<double>({"PauliZ", "PauliZ", "PauliZ"},
                                     {{}, {}, {}}, {{0}, {1}, {2}});
@@ -249,9 +240,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=Mixed, Obs=[XXX]",
         const size_t num_obs = 1;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
-        cdata[0] = std::complex<double>{1, 0};
+        StateVectorCPU<double> psi(num_qubits);
 
         auto obs = ObsDatum<double>({"PauliX", "PauliX", "PauliX"},
                                     {{}, {}, {}}, {{0}, {1}, {2}});
@@ -312,7 +301,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Decomposed Rot gate, non "
 
             std::vector<std::complex<double>> cdata{INVSQRT2<double>(),
                                                     -INVSQRT2<double>()};
-            StateVectorRaw<double> psi(cdata.data(), cdata.size());
+            StateVectorCPU<double> psi(cdata.data(), cdata.size());
 
             auto obs = ObsDatum<double>({"PauliZ"}, {{}}, {{0}});
             auto ops = OpsData<double>(
@@ -353,7 +342,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Mixed Ops, Obs and TParams",
 
         std::vector<std::complex<double>> cdata{ONE<double>(), ZERO<double>(),
                                                 ZERO<double>(), ZERO<double>()};
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
+        StateVectorCPU<double> psi(cdata);
 
         auto obs = ObsDatum<double>({"PauliX", "PauliZ"}, {{}, {}}, {{0}, {1}});
         auto ops = OpsData<double>(
@@ -388,4 +377,4 @@ TEST_CASE("AdjointJacobian::adjointJacobian Mixed Ops, Obs and TParams",
         CHECK(expected[1] == Approx(jacobian[1]));
         CHECK(expected[2] == Approx(jacobian[2]));
     }
-}
\ No newline at end of file
+}
diff --git a/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp b/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp
new file mode 100644
index 0000000000..aadc5426d0
--- /dev/null
+++ b/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp
@@ -0,0 +1,32 @@
+#include "Constant.hpp"
+#include "ConstantUtil.hpp"
+#include "DefaultKernelsForStateVector.hpp"
+#include "Util.hpp"
+
+#include <catch2/catch.hpp>
+
+using namespace Pennylane;
+
+TEST_CASE("Test default kernels for gates are well defined",
+          "[Test_DefaultKernelsForStateVector]") {
+    auto &kernels = DefaultKernelsForStateVector::getInstance();
+    // Look up the gate kernel map for every dispatch key with 1..26 qubits.
+    const auto check = [&kernels](Threading th, CPUMemoryModel mem) {
+        for (size_t n = 1; n < 27; n++) {
+            REQUIRE_NOTHROW(kernels.getGateKernelMap(n, th, mem));
+        }
+    };
+    Util::for_each_enum<Threading, CPUMemoryModel>(check);
+}
+
+TEST_CASE("Test default kernels for generators are well defined",
+          "[Test_DefaultKernelsForStateVector]") {
+    auto &kernels = DefaultKernelsForStateVector::getInstance();
+    // Look up the generator kernel map for every key with 1..26 qubits.
+    const auto check = [&kernels](Threading th, CPUMemoryModel mem) {
+        for (size_t n = 1; n < 27; n++) {
+            REQUIRE_NOTHROW(kernels.getGeneratorKernelMap(n, th, mem));
+        }
+    };
+    Util::for_each_enum<Threading, CPUMemoryModel>(check);
+}
diff --git a/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp b/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
index f4dcf3b4c2..0146f99c35 100644
--- a/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
+++ b/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
@@ -36,12 +36,10 @@ struct testDispatchForKernel {
             Util::array_has_elt(GateImplementation::implemented_gates, gate_op),
             bool> = true>
     static void test(RandomEngine &re, size_t num_qubits) {
-        using CFP_t = std::complex<PrecisionT>;
-        const std::vector<CFP_t> ini_st =
-            createRandomState<PrecisionT>(re, num_qubits);
-        std::vector<CFP_t> expected = ini_st;
+        const auto ini_st = createRandomState<PrecisionT>(re, num_qubits);
+        auto expected = ini_st;
 
-        const auto wires = createWires(gate_op);
+        const auto wires = createWires(gate_op, num_qubits);
         const auto params = createParams<PrecisionT>(gate_op);
 
         // We first calculate expected directly calling a static member function
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
new file mode 100644
index 0000000000..30d6894b08
--- /dev/null
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
@@ -0,0 +1,185 @@
+#include "CreateAllWires.hpp"
+#include "TestHelpers.hpp"
+
+#include "OpToMemberFuncPtr.hpp"
+#include "TestKernels.hpp"
+#include "Util.hpp"
+
+#include <catch2/catch.hpp>
+
+#include <algorithm>
+#include <complex>
+#include <iostream>
+#include <limits>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+/**
+ * @file Test_GateImplementations_CompareKernels.cpp
+ *
+ * This file tests all gate operations (besides matrix) by comparing results
+ * between different kernels (gate implementations).
+ */
+using namespace Pennylane;
+using namespace Pennylane::Gates;
+using namespace Pennylane::Util;
+
+namespace {
+using std::vector;
+}
+
+template <typename TypeList> std::string kernelsToString() {
+    // Joins kernel names with ", "; a void (empty) list yields "".
+    if constexpr (!std::is_same_v<TypeList, void>) {
+        const std::string tail = kernelsToString<typename TypeList::Next>();
+        const std::string head(TypeList::Type::name);
+        return tail.empty() ? head : head + ", " + tail;
+    }
+    return {}; // never flow off the end of a non-void function (UB)
+}
+
+/* Type transformation: filter a kernel type list by implemented gate. */
+template <Gates::GateOperation gate_op, typename TypeList>
+struct KernelsImplementingGateHelper {
+    using Head = typename TypeList::Type;
+    using Rest =
+        typename KernelsImplementingGateHelper<gate_op,
+                                               typename TypeList::Next>::Type;
+    using Type = std::conditional_t<
+        array_has_elt(Head::implemented_gates, gate_op),
+        typename PrependToTypeList<Head, Rest>::Type, Rest>;
+};
+// Recursion stops at void, which marks the end of a type list.
+template <Gates::GateOperation gate_op>
+struct KernelsImplementingGateHelper<gate_op, void> {
+    using Type = void;
+};
+// Kernels from TestKernels that implement gate_op (void if none).
+template <Gates::GateOperation gate_op> struct KernelsImplementingGate {
+    using Type =
+        typename KernelsImplementingGateHelper<gate_op, TestKernels>::Type;
+};
+
+/**
+ * @brief Apply a gate with one kernel and return the resulting statevector.
+ * @tparam gate_op Gate operation to apply
+ * @tparam PrecisionT Floating point data type for statevector
+ * @tparam ParamT Floating point data type for parameter
+ * @tparam GateImplementation Gate implementation class
+ * @param ini Initial statevector (taken by value, so the caller's is kept)
+ * @param num_qubits Number of qubits
+ * @param wires Wires the gate applies to
+ * @param inverse Whether to use inverse of gate
+ * @param params Parameters for gate
+ */
+template <Gates::GateOperation gate_op, typename PrecisionT, typename ParamT,
+          typename GateImplementation>
+auto applyGate(TestVector<std::complex<PrecisionT>> ini, size_t num_qubits,
+               const std::vector<size_t> &wires, bool inverse,
+               const std::vector<ParamT> &params)
+    -> TestVector<std::complex<PrecisionT>> {
+    const auto gate_func =
+        GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplementation,
+                              gate_op>::value;
+    callGateOps(gate_func, ini.data(), num_qubits, wires, inverse, params);
+    return ini;
+}
+
+/**
+ * @brief Apply the given gate with every kernel in Kernels (indexed by I...)
+ * to a copy of ini and return the resulting statevectors as a tuple.
+ */
+template <Gates::GateOperation gate_op, typename PrecisionT, typename ParamT,
+          typename Kernels, size_t... I>
+auto applyGateForImplemetingKernels(
+    const TestVector<std::complex<PrecisionT>> &ini, size_t num_qubits,
+    const std::vector<size_t> &wires, bool inverse,
+    const std::vector<ParamT> &params,
+    [[maybe_unused]] std::index_sequence<I...> dummy) {
+    return std::make_tuple(
+        applyGate<gate_op, PrecisionT, ParamT, getNthType<Kernels, I>>(
+            ini, num_qubits, wires, inverse, params)...);
+}
+
+/**
+ * @brief For every wire configuration and both values of inverse, apply
+ * gate_op to one random state with each implementing kernel and require
+ * neighbouring results to agree within a margin of 1e-7 (Matrix is skipped).
+ */
+template <Gates::GateOperation gate_op, typename PrecisionT, typename ParamT,
+          class RandomEngine>
+void testApplyGate(RandomEngine &re, size_t num_qubits) {
+    const auto ini = createRandomState<PrecisionT>(re, num_qubits);
+
+    using Kernels = typename KernelsImplementingGate<gate_op>::Type;
+
+    INFO("Kernels implementing " << lookup(Constant::gate_names, gate_op)
+                                 << " are " << kernelsToString<Kernels>());
+
+    INFO("PrecisionT, ParamT = " << PrecisionToName<PrecisionT>::value << ", "
+                                 << PrecisionToName<ParamT>::value);
+
+    if constexpr (gate_op != GateOperation::Matrix) {
+        const auto gate_name = lookup(Constant::gate_names, gate_op);
+        const auto params = createParams<ParamT>(gate_op);
+        const auto all_wires = crateAllWires(num_qubits, gate_op, true);
+        for (const auto &wires : all_wires) {
+            std::string wires_str;
+            for (const auto wire : wires) {
+                wires_str += std::to_string(wire) + " ";
+            }
+            for (const bool inverse : {false, true}) {
+                // Catch2 identifies a section by name and source location, so
+                // the name must be unique per iteration or only the first
+                // combination of num_qubits and wires would ever be run.
+                DYNAMIC_SECTION("Test gate "
+                                << gate_name << " with inverse = "
+                                << (inverse ? "true" : "false")
+                                << ", num_qubits = " << num_qubits
+                                << ", wires = " << wires_str) {
+                    const auto results = Util::tuple_to_array(
+                        applyGateForImplemetingKernels<gate_op, PrecisionT,
+                                                       ParamT, Kernels>(
+                            ini, num_qubits, wires, inverse, params,
+                            std::make_index_sequence<length<Kernels>()>()));
+
+                    // i + 1 < size() cannot wrap around for an empty result
+                    for (size_t i = 0; i + 1 < results.size(); i++) {
+                        REQUIRE(results[i] ==
+                                PLApprox(results[i + 1]).margin(1e-7));
+                    }
+                }
+            }
+        }
+    }
+}
+
+template <size_t gate_idx, typename PrecisionT, typename ParamT,
+          class RandomEngine>
+void testAllGatesIter(RandomEngine &re, size_t max_num_qubits) {
+    // Compile-time recursion over gate indices; stops at GateOperation::END.
+    if constexpr (gate_idx < static_cast<size_t>(GateOperation::END)) {
+        constexpr static auto gate_op = static_cast<GateOperation>(gate_idx);
+
+        // Multi-qubit gates are tested from one qubit upwards.
+        size_t n_qubits = array_has_elt(Constant::multi_qubit_gates, gate_op)
+                              ? 1
+                              : lookup(Constant::gate_wires, gate_op);
+        while (n_qubits < max_num_qubits) { // upper bound is exclusive
+            testApplyGate<gate_op, PrecisionT, ParamT>(re, n_qubits++);
+        }
+        testAllGatesIter<gate_idx + 1, PrecisionT, ParamT>(re, max_num_qubits);
+    }
+}
+
+template <typename PrecisionT, typename ParamT, class RandomEngine>
+void testAllGates(RandomEngine &re, size_t max_num_qubits) {
+    testAllGatesIter<0, PrecisionT, ParamT>(re, max_num_qubits); // from gate 0
+}
+
+TEMPLATE_TEST_CASE("Test all kernels give the same results",
+                   "[Test_GateImplementations_CompareKernels]", float, double) {
+    std::mt19937 re{1337}; // fixed seed
+    testAllGates<TestType, TestType>(re, 6); // 6 is an exclusive upper bound
+}
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
index 377c45bd5f..2e9cd9cdcb 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
@@ -83,7 +83,7 @@ void testGeneratorForGate(RandomEngine &re, size_t num_qubits) {
 
     DYNAMIC_SECTION("Test generator of " << gate_name << " for kernel "
                                          << GateImplementation::name) {
-        const auto wires = createWires(gate_op);
+        const auto wires = createWires(gate_op, num_qubits);
         const auto ini_st = createRandomState<PrecisionT>(re, num_qubits);
 
         auto gntr_func =
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
index fb172dafdb..19ffb8535b 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
@@ -38,7 +38,7 @@ void testInverseKernelGate(RandomEngine &re, size_t num_qubits) {
                 GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplementation,
                                       gate_op>::value;
 
-            const auto wires = createWires(gate_op);
+            const auto wires = createWires(gate_op, num_qubits);
             const auto params = createParams<ParamT>(gate_op);
 
             callGateOps(func_ptr, st.data(), num_qubits, wires, false, params);
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
index 86894a47f7..85772294ff 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
@@ -71,13 +71,18 @@ using std::vector;
 template <typename PrecisionT, class GateImplementation>
 void testApplyPauliX() {
     const size_t num_qubits = 3;
-    for (size_t index = 0; index < num_qubits; index++) {
-        auto st = createZeroState<PrecisionT>(num_qubits);
-        CHECK(st[0] == Util::ONE<PrecisionT>());
-
-        GateImplementation::applyPauliX(st.data(), num_qubits, {index}, false);
-        CHECK(st[0] == Util::ZERO<PrecisionT>());
-        CHECK(st[0b1 << (num_qubits - index - 1)] == Util::ONE<PrecisionT>());
+    DYNAMIC_SECTION(GateImplementation::name
+                    << ", PauliX - " << PrecisionToName<PrecisionT>::value) {
+        for (size_t index = 0; index < num_qubits; index++) {
+            auto st = createZeroState<PrecisionT>(num_qubits);
+            CHECK(st[0] == Util::ONE<PrecisionT>());
+
+            GateImplementation::applyPauliX(st.data(), num_qubits, {index},
+                                            false);
+            CHECK(st[0] == Util::ZERO<PrecisionT>());
+            CHECK(st[0b1 << (num_qubits - index - 1)] ==
+                  Util::ONE<PrecisionT>());
+        }
     }
 }
 PENNYLANE_RUN_TEST(PauliX);
@@ -133,23 +138,20 @@ PENNYLANE_RUN_TEST(PauliZ);
 
 template <typename PrecisionT, class GateImplementation>
 void testApplyHadamard() {
-    using ComplexPrecisionT = std::complex<PrecisionT>;
     const size_t num_qubits = 3;
     for (size_t index = 0; index < num_qubits; index++) {
         auto st = createZeroState<PrecisionT>(num_qubits);
 
-        CHECK(st[0] == ComplexPrecisionT{1, 0});
         GateImplementation::applyHadamard(st.data(), num_qubits, {index},
                                           false);
 
-        ComplexPrecisionT expected(1 / std::sqrt(2), 0);
-        CHECK(expected.real() == Approx(st[0].real()));
-        CHECK(expected.imag() == Approx(st[0].imag()));
-
-        CHECK(expected.real() ==
-              Approx(st[0b1 << (num_qubits - index - 1)].real()));
-        CHECK(expected.imag() ==
-              Approx(st[0b1 << (num_qubits - index - 1)].imag()));
+        std::vector<char> expected_string;
+        expected_string.resize(num_qubits);
+        std::fill(expected_string.begin(), expected_string.end(), '0');
+        expected_string[index] = '+';
+        const auto expected = createProductState<PrecisionT>(
+            std::string_view{expected_string.data(), num_qubits});
+        CHECK(expected == PLApprox(st));
     }
 }
 PENNYLANE_RUN_TEST(Hadamard);
@@ -205,17 +207,40 @@ PENNYLANE_RUN_TEST(T);
 
 template <typename PrecisionT, class GateImplementation> void testApplyCNOT() {
     const size_t num_qubits = 3;
-    auto st = createZeroState<PrecisionT>(num_qubits);
 
-    // Test using |+00> state to generate 3-qubit GHZ state
-    GateImplementation::applyHadamard(st.data(), num_qubits, {0}, false);
+    SECTION("CNOT0,1 |000> = |000>") {
+        const auto ini_st = createProductState<PrecisionT>("000");
+        auto st = ini_st;
+        GateImplementation::applyCNOT(st.data(), num_qubits, {0, 1}, false);
+        CHECK(st == ini_st);
+    }
+
+    SECTION("CNOT0,1 |100> = |110>") {
+        const auto ini_st = createProductState<PrecisionT>("100");
+        auto st = ini_st;
+        GateImplementation::applyCNOT(st.data(), num_qubits, {0, 1}, false);
+        CHECK(st ==
+              PLApprox(createProductState<PrecisionT>("110")).margin(1e-7));
+    }
+    SECTION("CNOT1,2 |110> = |111>") {
+        const auto ini_st = createProductState<PrecisionT>("110");
+        auto st = ini_st;
+        GateImplementation::applyCNOT(st.data(), num_qubits, {1, 2}, false);
+        CHECK(st ==
+              PLApprox(createProductState<PrecisionT>("111")).margin(1e-7));
+    }
+
+    SECTION("Generate GHZ state") {
+        auto st = createProductState<PrecisionT>("+00");
 
-    for (size_t index = 1; index < num_qubits; index++) {
-        GateImplementation::applyCNOT(st.data(), num_qubits, {index - 1, index},
-                                      false);
+        // Test using |+00> state to generate 3-qubit GHZ state
+        for (size_t index = 1; index < num_qubits; index++) {
+            GateImplementation::applyCNOT(st.data(), num_qubits,
+                                          {index - 1, index}, false);
+        }
+        CHECK(st.front() == Util::INVSQRT2<PrecisionT>());
+        CHECK(st.back() == Util::INVSQRT2<PrecisionT>());
     }
-    CHECK(st.front() == Util::INVSQRT2<PrecisionT>());
-    CHECK(st.back() == Util::INVSQRT2<PrecisionT>());
 }
 PENNYLANE_RUN_TEST(CNOT);
 
@@ -223,11 +248,8 @@ PENNYLANE_RUN_TEST(CNOT);
 template <typename PrecisionT, class GateImplementation> void testApplyCY() {
     using ComplexPrecisionT = std::complex<PrecisionT>;
     const size_t num_qubits = 3;
-    auto ini_st = createZeroState<PrecisionT>(num_qubits);
-
-    // Test using |+10> state
-    GateImplementation::applyHadamard(ini_st.data(), num_qubits, {0}, false);
-    GateImplementation::applyPauliX(ini_st.data(), num_qubits, {1}, false);
+    auto ini_st =
+        createProductState<PrecisionT>("+10"); // Test using |+10> state
 
     CHECK(ini_st == std::vector<ComplexPrecisionT>{
                         Util::ZERO<PrecisionT>(), Util::ZERO<PrecisionT>(),
@@ -299,20 +321,7 @@ template <typename PrecisionT, class GateImplementation> void testApplyCZ() {
     using ComplexPrecisionT = std::complex<PrecisionT>;
     const size_t num_qubits = 3;
 
-    auto ini_st = createZeroState<PrecisionT>(num_qubits);
-
-    // Test using |+10> state
-    GateImplementation::applyHadamard(ini_st.data(), num_qubits, {0}, false);
-    GateImplementation::applyPauliX(ini_st.data(), num_qubits, {1}, false);
-
-    auto st = ini_st;
-    CHECK(st == std::vector<ComplexPrecisionT>{
-                    Util::ZERO<PrecisionT>(), Util::ZERO<PrecisionT>(),
-                    std::complex<PrecisionT>(1.0 / sqrt(2), 0),
-                    Util::ZERO<PrecisionT>(), Util::ZERO<PrecisionT>(),
-                    Util::ZERO<PrecisionT>(),
-                    std::complex<PrecisionT>(1.0 / sqrt(2), 0),
-                    Util::ZERO<PrecisionT>()});
+    auto ini_st = createProductState<PrecisionT>("+10");
 
     DYNAMIC_SECTION(GateImplementation::name
                     << ", CZ0,1 |+10> -> |-10> - "
@@ -340,7 +349,7 @@ template <typename PrecisionT, class GateImplementation> void testApplyCZ() {
     DYNAMIC_SECTION(GateImplementation::name
                     << ", CZ0,2 |+10> -> |+10> - "
                     << PrecisionToName<PrecisionT>::value) {
-        const std::vector<ComplexPrecisionT> &expected{ini_st};
+        const auto &expected = ini_st;
 
         auto sv02 = ini_st;
         auto sv20 = ini_st;
@@ -354,7 +363,7 @@ template <typename PrecisionT, class GateImplementation> void testApplyCZ() {
     DYNAMIC_SECTION(GateImplementation::name
                     << ", CZ1,2 |+10> -> |+10> - "
                     << PrecisionToName<PrecisionT>::value) {
-        const std::vector<ComplexPrecisionT> &expected{ini_st};
+        const auto &expected = ini_st;
 
         auto sv12 = ini_st;
         auto sv21 = ini_st;
@@ -372,11 +381,9 @@ PENNYLANE_RUN_TEST(CZ);
 template <typename PrecisionT, class GateImplementation> void testApplySWAP() {
     using ComplexPrecisionT = std::complex<PrecisionT>;
     const size_t num_qubits = 3;
-    auto ini_st = createZeroState<PrecisionT>(num_qubits);
+    auto ini_st = createProductState<PrecisionT>("+10");
 
     // Test using |+10> state
-    GateImplementation::applyHadamard(ini_st.data(), num_qubits, {0}, false);
-    GateImplementation::applyPauliX(ini_st.data(), num_qubits, {1}, false);
 
     CHECK(ini_st == std::vector<ComplexPrecisionT>{
                         Util::ZERO<PrecisionT>(), Util::ZERO<PrecisionT>(),
@@ -461,12 +468,9 @@ template <typename PrecisionT, class GateImplementation>
 void testApplyToffoli() {
     using ComplexPrecisionT = std::complex<PrecisionT>;
     const size_t num_qubits = 3;
-    auto ini_st = createZeroState<PrecisionT>(num_qubits);
+    auto ini_st = createProductState<PrecisionT>("+10");
 
     // Test using |+10> state
-    GateImplementation::applyHadamard(ini_st.data(), num_qubits, {0}, false);
-    GateImplementation::applyPauliX(ini_st.data(), num_qubits, {1}, false);
-
     DYNAMIC_SECTION(GateImplementation::name
                     << ", Toffoli 0,1,2 |+10> -> |010> + |111> - "
                     << PrecisionToName<PrecisionT>::value) {
@@ -539,11 +543,8 @@ template <typename PrecisionT, class GateImplementation> void testApplyCSWAP() {
     using ComplexPrecisionT = std::complex<PrecisionT>;
     const size_t num_qubits = 3;
 
-    auto ini_st = createZeroState<PrecisionT>(num_qubits);
-
-    // Test using |+10> state
-    GateImplementation::applyHadamard(ini_st.data(), num_qubits, {0}, false);
-    GateImplementation::applyPauliX(ini_st.data(), num_qubits, {1}, false);
+    auto ini_st =
+        createProductState<PrecisionT>("+10"); // Test using |+10> state
 
     DYNAMIC_SECTION(GateImplementation::name
                     << ", CSWAP 0,1,2 |+10> -> |010> + |101> - "
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
index cc6f687e11..8e594fe3d6 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
@@ -164,8 +164,8 @@ void testApplyRY() {
             {0.10575112905629831, -0.47593196040758534},
             {-0.8711876098966215, -0.0577721051072477}}};
 
-    const std::vector<ComplexPrecisionT> init_state{
-        {0.8775825618903728, 0.0}, {0.0, -0.47942553860420306}};
+    const TestVector<ComplexPrecisionT> init_state{{0.8775825618903728, 0.0},
+                                                   {0.0, -0.47942553860420306}};
     DYNAMIC_SECTION(GateImplementation::name
                     << ", RY - " << PrecisionToName<PrecisionT>::value) {
         for (size_t index = 0; index < angles.size(); index++) {
@@ -222,6 +222,14 @@ void testApplyRZ() {
 
         CHECK(st == PLApprox(expected_results[index]));
     }
+
+    for (size_t index = 0; index < num_qubits; index++) {
+        auto st = createPlusState<PrecisionT>(num_qubits);
+
+        GateImplementation::applyRZ(st.data(), num_qubits, {index}, true,
+                                    {-angles[index]});
+        CHECK(st == PLApprox(expected_results[index]));
+    }
 }
 PENNYLANE_RUN_TEST(RZ);
 
@@ -364,7 +372,7 @@ void testApplyIsingXX() {
                     << ", IsingXX0,2 - "
                     << PrecisionToName<PrecisionT>::value) {
         const size_t num_qubits = 3;
-        std::vector<ComplexPrecisionT> ini_st{
+        const auto ini_st = TestVector<ComplexPrecisionT>{
             ComplexPrecisionT{0.125681356503, 0.252712197380},
             ComplexPrecisionT{0.262591068130, 0.370189000494},
             ComplexPrecisionT{0.129300299863, 0.371057794075},
@@ -498,7 +506,7 @@ void testApplyIsingYY() {
                     << PrecisionToName<PrecisionT>::value) {
         const size_t num_qubits = 4;
 
-        std::vector<ComplexPrecisionT> ini_st{
+        const auto ini_st = TestVector<ComplexPrecisionT>{
             ComplexPrecisionT{0.276522701942, 0.192601873155},
             ComplexPrecisionT{0.035951282872, 0.224882549474},
             ComplexPrecisionT{0.142578003191, 0.016769549184},
@@ -652,7 +660,7 @@ void testApplyIsingZZ() {
                     << PrecisionToName<PrecisionT>::value) {
         const size_t num_qubits = 4;
 
-        std::vector<ComplexPrecisionT> ini_st{
+        TestVector<ComplexPrecisionT> ini_st{
             ComplexPrecisionT{0.267462841882, 0.010768564798},
             ComplexPrecisionT{0.228575129706, 0.010564590956},
             ComplexPrecisionT{0.099492749900, 0.260849823392},
diff --git a/pennylane_lightning/src/tests/Test_Internal.cpp b/pennylane_lightning/src/tests/Test_Internal.cpp
index 00ead21271..819f472586 100644
--- a/pennylane_lightning/src/tests/Test_Internal.cpp
+++ b/pennylane_lightning/src/tests/Test_Internal.cpp
@@ -1,6 +1,8 @@
-#include "GateImplementationsPI.hpp"
+#include "CreateAllWires.hpp"
 #include "TestHelpers.hpp"
+#include "cpu_kernels/GateImplementationsPI.hpp"
 
+#include <algorithm>
 #include <catch2/catch.hpp>
 
 #include <random>
@@ -95,8 +97,7 @@ TEMPLATE_TEST_CASE("randomUnitary", "[Test_Internal]", float, double) {
         const size_t dim = (1U << num_qubits);
         const auto unitary = randomUnitary<PrecisionT>(re, num_qubits);
 
-        std::vector<std::complex<PrecisionT>> unitary_dagger =
-            Util::Transpose(unitary, dim, dim);
+        auto unitary_dagger = Util::Transpose(unitary, dim, dim);
         std::transform(
             unitary_dagger.begin(), unitary_dagger.end(),
             unitary_dagger.begin(),
@@ -115,3 +116,79 @@ TEMPLATE_TEST_CASE("randomUnitary", "[Test_Internal]", float, double) {
         REQUIRE(mat == PLApprox(identity).margin(1e-5));
     }
 }
+
+// Binomial coefficient C(n, r), evaluated as a running product that divides
+// at every step: after iteration k `res` equals C(n, k + 1), so each division
+// is exact and intermediate values stay far below n! / (n - r)!.
+// Yields 0 for r > n because the factor (n - k) reaches zero at k == n.
+size_t binomialCeff(size_t n, size_t r) {
+    size_t res = 1;
+    for (size_t k = 0; k < r; k++) {
+        res = res * (n - k) / (k + 1);
+    }
+    return res;
+}
+
+size_t permSize(size_t n, size_t r) {
+    size_t res = 1;
+    for (size_t k = 0; k < r; k++) {
+        res *= (n - k);
+    }
+    return res;
+}
+
+/**
+ * @brief Test create all wires
+ */
+TEST_CASE("createAllWires", "[Test_Internal]") {
+
+    SECTION("order = false") {
+        const std::vector<std::pair<size_t, size_t>> test_pairs{
+            {4, 2},  {8, 3},  {12, 1}, {12, 2}, {12, 3},  {12, 4},  {12, 5},
+            {12, 6}, {12, 7}, {12, 8}, {12, 9}, {12, 10}, {12, 11}, {12, 12}};
+
+        for (const auto [n, r] : test_pairs) {
+            std::vector<std::set<size_t>> vec;
+            auto v = CombinationGenerator(n, r).all_perms();
+
+            REQUIRE(v.size() == binomialCeff(n, r));
+            for (const auto &perm : v) {
+                REQUIRE(perm.size() == r);
+                vec.emplace_back(perm.begin(), perm.end());
+            }
+
+            std::sort(v.begin(), v.end(),
+                      [](const std::vector<size_t> &v1,
+                         const std::vector<size_t> &v2) {
+                          return std::lexicographical_compare(
+                              v1.begin(), v1.end(), v2.begin(), v2.end());
+                      }); // sort lexicographically
+            for (size_t i = 0; i < v.size() - 1; i++) {
+                REQUIRE(v[i] != v[i + 1]); // all combinations must be different
+            }
+        }
+    }
+    SECTION("order = true") {
+        const std::vector<std::pair<size_t, size_t>> test_pairs{
+            {4, 2}, {8, 3}, {12, 1}, {12, 2}, {12, 3}, {12, 4}, {12, 5}};
+
+        for (const auto [n, r] : test_pairs) {
+            auto v = PermutationGenerator(n, r).all_perms();
+
+            REQUIRE(v.size() == permSize(n, r));
+            for (const auto &perm : v) {
+                REQUIRE(perm.size() == r);
+            }
+
+            std::sort(v.begin(), v.end(),
+                      [](const std::vector<size_t> &v1,
+                         const std::vector<size_t> &v2) {
+                          return std::lexicographical_compare(
+                              v1.begin(), v1.end(), v2.begin(), v2.end());
+                      }); // sort lexicographically
+            for (size_t i = 0; i < v.size() - 1; i++) {
+                REQUIRE(v[i] != v[i + 1]); // all permutations must be different
+            }
+        }
+    }
+}
diff --git a/pennylane_lightning/src/tests/Test_Measures.cpp b/pennylane_lightning/src/tests/Test_Measures.cpp
index c72a9b68cf..b7ec1e8fd5 100644
--- a/pennylane_lightning/src/tests/Test_Measures.cpp
+++ b/pennylane_lightning/src/tests/Test_Measures.cpp
@@ -3,8 +3,7 @@
 #include <vector>
 
 #include "Measures.hpp"
-#include "StateVectorManaged.hpp"
-#include "StateVectorRaw.hpp"
+#include "StateVectorCPU.hpp"
 #include "Util.hpp"
 
 #include <catch2/catch.hpp>
@@ -18,14 +17,14 @@ using std::string;
 using std::vector;
 }; // namespace
 
-StateVectorManaged<double> Initializing_StateVector() {
+StateVectorCPU<double> Initializing_StateVector() {
     // Defining a StateVector in a non-trivial configuration:
     size_t num_qubits = 3;
     size_t data_size = std::pow(2, num_qubits);
 
     std::vector<std::complex<double>> arr(data_size, 0);
     arr[0] = 1;
-    StateVectorManaged<double> Measured_StateVector(arr.data(), data_size);
+    StateVectorCPU<double> Measured_StateVector(arr.data(), data_size);
 
     std::vector<size_t> wires;
 
@@ -65,12 +64,11 @@ TEST_CASE("Probabilities", "[Measures]") {
         {1, 2},    {2, 1},    {0},       {1},       {2}};
 
     // Defining the State Vector that will be measured.
-    StateVectorManaged<double> Measured_StateVector =
-        Initializing_StateVector();
+    StateVectorCPU<double> Measured_StateVector = Initializing_StateVector();
 
     // Initializing the measures class.
     // It will attach to the StateVector, allowing measures to keep been taken.
-    Measures<double, StateVectorManaged<double>> Measurer(Measured_StateVector);
+    Measures<double, StateVectorCPU<double>> Measurer(Measured_StateVector);
 
     vector<double> probabilities;
 
@@ -92,12 +90,11 @@ TEST_CASE("Probabilities", "[Measures]") {
 
 TEST_CASE("Expected Values", "[Measures]") {
     // Defining the State Vector that will be measured.
-    StateVectorManaged<double> Measured_StateVector =
-        Initializing_StateVector();
+    StateVectorCPU<double> Measured_StateVector = Initializing_StateVector();
 
     // Initializing the measures class.
     // It will attach to the StateVector, allowing measures to keep been taken.
-    Measures<double, StateVectorManaged<double>> Measurer(Measured_StateVector);
+    Measures<double, StateVectorCPU<double>> Measurer(Measured_StateVector);
 
     SECTION("Testing single operation defined by a matrix:") {
         vector<std::complex<double>> PauliX = {0, 1, 1, 0};
@@ -165,12 +162,11 @@ TEST_CASE("Expected Values", "[Measures]") {
 
 TEST_CASE("Variances", "[Measures]") {
     // Defining the State Vector that will be measured.
-    StateVectorManaged<double> Measured_StateVector =
-        Initializing_StateVector();
+    StateVectorCPU<double> Measured_StateVector = Initializing_StateVector();
 
     // Initializing the measures class.
     // It will attach to the StateVector, allowing measures to keep been taken.
-    Measures<double, StateVectorManaged<double>> Measurer(Measured_StateVector);
+    Measures<double, StateVectorCPU<double>> Measurer(Measured_StateVector);
 
     SECTION("Testing single operation defined by a matrix:") {
         vector<std::complex<double>> PauliX = {0, 1, 1, 0};
diff --git a/pennylane_lightning/src/tests/Test_StateVectorCPU.cpp b/pennylane_lightning/src/tests/Test_StateVectorCPU.cpp
new file mode 100644
index 0000000000..17fd667c19
--- /dev/null
+++ b/pennylane_lightning/src/tests/Test_StateVectorCPU.cpp
@@ -0,0 +1,48 @@
+#include <algorithm>
+#include <complex>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <catch2/catch.hpp>
+
+#include "StateVectorCPU.hpp"
+#include "StateVectorRaw.hpp"
+#include "Util.hpp"
+
+#include "TestHelpers.hpp"
+
+using namespace Pennylane;
+
+TEMPLATE_TEST_CASE("StateVectorCPU::StateVectorCPU", "[StateVectorRaw]", float,
+                   double) {
+    using fp_t = TestType;
+
+    SECTION("StateVectorCPU") {
+        REQUIRE(!std::is_constructible_v<StateVectorCPU<>>);
+    }
+    SECTION("StateVectorCPU<TestType>") {
+        REQUIRE(!std::is_constructible_v<StateVectorCPU<TestType>>);
+    }
+    SECTION("StateVectorCPU<TestType> {size_t}") {
+        REQUIRE(std::is_constructible_v<StateVectorCPU<TestType>, size_t>);
+        const size_t num_qubits = 4;
+        StateVectorCPU<fp_t> sv(num_qubits);
+
+        REQUIRE(sv.getNumQubits() == 4);
+        REQUIRE(sv.getLength() == 16);
+    }
+    SECTION("StateVectorCPU<TestType> {const StateVectorRaw<TestType>&}") {
+        REQUIRE(std::is_constructible_v<StateVectorCPU<TestType>,
+                                        const StateVectorRaw<TestType> &>);
+    }
+    SECTION("StateVectorCPU<TestType> {const StateVectorCPU<TestType>&}") {
+        REQUIRE(std::is_copy_constructible_v<StateVectorCPU<TestType>>);
+    }
+    SECTION("StateVectorCPU<TestType> {StateVectorCPU<TestType>&&}") {
+        REQUIRE(std::is_move_constructible_v<StateVectorCPU<TestType>>);
+    }
+}
diff --git a/pennylane_lightning/src/tests/Test_StateVectorManaged.cpp b/pennylane_lightning/src/tests/Test_StateVectorManaged.cpp
deleted file mode 100644
index 3a30c68638..0000000000
--- a/pennylane_lightning/src/tests/Test_StateVectorManaged.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#include <algorithm>
-#include <complex>
-#include <iostream>
-#include <limits>
-#include <random>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include <catch2/catch.hpp>
-
-#include "StateVectorManaged.hpp"
-#include "StateVectorRaw.hpp"
-#include "Util.hpp"
-
-#include "TestHelpers.hpp"
-
-using namespace Pennylane;
-
-TEMPLATE_TEST_CASE("StateVectorManaged::StateVectorManaged", "[StateVectorRaw]",
-                   float, double) {
-    using fp_t = TestType;
-
-    SECTION("StateVectorManaged") {
-        REQUIRE(std::is_constructible_v<StateVectorManaged<>>);
-    }
-    SECTION("StateVectorManaged<TestType>") {
-        REQUIRE(std::is_constructible_v<StateVectorManaged<TestType>>);
-    }
-    SECTION("StateVectorManaged<TestType> {size_t}") {
-        REQUIRE(std::is_constructible_v<StateVectorManaged<TestType>>);
-        const size_t num_qubits = 4;
-        StateVectorManaged<fp_t> sv(num_qubits);
-
-        REQUIRE(sv.getNumQubits() == 4);
-        REQUIRE(sv.getLength() == 16);
-        REQUIRE(sv.getDataVector().size() == 16);
-    }
-    SECTION("StateVectorManaged<TestType> {const StateVectorRaw<TestType>&}") {
-        REQUIRE(std::is_constructible_v<StateVectorManaged<TestType>,
-                                        const StateVectorRaw<TestType> &>);
-    }
-    SECTION(
-        "StateVectorManaged<TestType> {const StateVectorManaged<TestType>&}") {
-        REQUIRE(std::is_copy_constructible_v<StateVectorManaged<TestType>>);
-    }
-    SECTION("StateVectorManaged<TestType> {StateVectorManaged<TestType>&&}") {
-        REQUIRE(std::is_move_constructible_v<StateVectorManaged<TestType>>);
-    }
-}
diff --git a/pennylane_lightning/src/tests/Test_Util.cpp b/pennylane_lightning/src/tests/Test_Util.cpp
index 4360a793f5..2e804d05e2 100644
--- a/pennylane_lightning/src/tests/Test_Util.cpp
+++ b/pennylane_lightning/src/tests/Test_Util.cpp
@@ -567,6 +567,21 @@ TEST_CASE("Utility bit operations", "[Util][BitUtil]") {
             }
         }
     }
+
+    SECTION("Bitswap") {
+        CHECK(Util::bitswap(0B001101, 0, 1) == 0B001110);
+        CHECK(Util::bitswap(0B001101, 0, 2) == 0B001101);
+        CHECK(Util::bitswap(0B001101, 0, 3) == 0B001101);
+        CHECK(Util::bitswap(0B001101, 0, 4) == 0B011100);
+    }
+
+    SECTION("fillOnes") {
+        CHECK(Util::fillOnes<uint8_t>(4) == 0B1111);
+        CHECK(Util::fillOnes<uint8_t>(6) == 0B111111);
+        CHECK(Util::fillOnes<uint32_t>(17) == 0B1'1111'1111'1111'1111);
+        CHECK(Util::fillOnes<uint64_t>(54) ==
+              0x3F'FFFF'FFFF'FFFF); // 54 == 4*13 + 2
+    }
 }
 
 TEST_CASE("Utility array and tuples", "[Util]") {
diff --git a/pennylane_lightning/src/util/BitUtil.hpp b/pennylane_lightning/src/util/BitUtil.hpp
index 8b7251ddc3..d6996a77d8 100644
--- a/pennylane_lightning/src/util/BitUtil.hpp
+++ b/pennylane_lightning/src/util/BitUtil.hpp
@@ -19,6 +19,7 @@
 #include <climits>
 #include <cstdint>
 #include <cstdlib>
+#include <type_traits>
 
 #if defined(_MSC_VER)
 #include <intrin.h> // for __lzcnt64 and __popcount
@@ -171,6 +172,18 @@ inline auto log2PerfectPower(unsigned long val) -> size_t {
 #endif
 ///@}
 
+constexpr auto constLog2PerfectPower(size_t value) -> size_t {
+    if (value == 0) {
+        return 0; // not well defined
+    }
+    size_t n = 0;
+    while ((value & 1U) == 0U) {
+        value >>= 1U;
+        ++n;
+    }
+    return n;
+}
+
 /**
  * @brief Check if there is a positive integer n such that value == 2^n.
  *
@@ -196,8 +209,18 @@ inline auto constexpr fillLeadingOnes(size_t pos) -> size_t {
 /**
  * @brief Swap bits in i-th and j-th position in place
  */
-inline void constexpr bitswap(size_t bits, const size_t i, const size_t j) {
+inline auto constexpr bitswap(size_t bits, const size_t i, const size_t j)
+    -> size_t {
     size_t x = ((bits >> i) ^ (bits >> j)) & 1U;
-    bits ^= ((x << i) | (x << j));
+    return bits ^ ((x << i) | (x << j));
+}
+
+/// Value whose lowest `nbits` bits are set (0 <= nbits <= bit width).
+template <class IntegerType>
+inline auto constexpr fillOnes(size_t nbits) -> IntegerType {
+    static_assert(std::is_unsigned_v<IntegerType>, "need an unsigned integer");
+    // A shift by the full width is UB, so nbits == 0 is answered directly.
+    const size_t shift = CHAR_BIT * sizeof(IntegerType) - nbits;
+    return nbits == 0 ? 0 : static_cast<IntegerType>(~IntegerType(0)) >> shift;
+}
 } // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/LinearAlgebra.hpp b/pennylane_lightning/src/util/LinearAlgebra.hpp
index 40ea4292a8..bdf0e35f99 100644
--- a/pennylane_lightning/src/util/LinearAlgebra.hpp
+++ b/pennylane_lightning/src/util/LinearAlgebra.hpp
@@ -210,9 +210,9 @@ inline auto innerProdC(const std::complex<T> *v1, const std::complex<T> *v2,
  * @see innerProd(const std::complex<T> *v1, const std::complex<T> *v2,
  * const size_t data_size)
  */
-template <class T>
-inline auto innerProd(const std::vector<std::complex<T>> &v1,
-                      const std::vector<std::complex<T>> &v2)
+template <class T, class AllocA, class AllocB>
+inline auto innerProd(const std::vector<std::complex<T>, AllocA> &v1,
+                      const std::vector<std::complex<T>, AllocB> &v2)
     -> std::complex<T> {
     return innerProd(v1.data(), v2.data(), v1.size());
 }
@@ -224,9 +224,9 @@ inline auto innerProd(const std::vector<std::complex<T>> &v1,
  * @see innerProdC(const std::complex<T> *v1, const std::complex<T> *v2,
  * const size_t data_size)
  */
-template <class T>
-inline auto innerProdC(const std::vector<std::complex<T>> &v1,
-                       const std::vector<std::complex<T>> &v2)
+template <class T, class AllocA, class AllocB>
+inline auto innerProdC(const std::vector<std::complex<T>, AllocA> &v1,
+                       const std::vector<std::complex<T>, AllocB> &v2)
     -> std::complex<T> {
     return innerProdC(v1.data(), v2.data(), v1.size());
 }
@@ -484,15 +484,15 @@ inline auto Transpose(const std::vector<std::complex<T>> &mat, size_t m,
  * @param n Number of columns of `mat`.
  * @return mat transpose of shape n * m.
  */
-template <class T>
-inline auto Transpose(const std::vector<T> &mat, size_t m, size_t n)
-    -> std::vector<T> {
+template <class T, class Alloc>
+inline auto Transpose(const std::vector<T, Alloc> &mat, size_t m, size_t n)
+    -> std::vector<T, Alloc> {
     if (mat.size() != m * n) {
         throw std::invalid_argument(
             "Invalid number of rows and columns for the input matrix");
     }
 
-    std::vector<T> mat_t(n * m);
+    std::vector<T, Alloc> mat_t(n * m);
     CFTranspose(mat.data(), mat_t.data(), m, n, 0, m, 0, n);
     return mat_t;
 }
@@ -548,9 +548,10 @@ inline void vecMatrixProd(const T *v_in, const T *mat, T *v_out, size_t m,
  * @see inline void vecMatrixProd(const T *v_in,
  * const T *mat, T *v_out, size_t m, size_t n)
  */
-template <class T>
-inline auto vecMatrixProd(const std::vector<T> &v_in, const std::vector<T> &mat,
-                          size_t m, size_t n) -> std::vector<T> {
+template <class T, class Alloc>
+inline auto vecMatrixProd(const std::vector<T, Alloc> &v_in,
+                          const std::vector<T, Alloc> &mat, size_t m, size_t n)
+    -> std::vector<T, Alloc> {
     if (v_in.size() != m) {
         throw std::invalid_argument("Invalid size for the input vector");
     }
@@ -559,7 +560,7 @@ inline auto vecMatrixProd(const std::vector<T> &v_in, const std::vector<T> &mat,
             "Invalid number of rows and columns for the input matrix");
     }
 
-    std::vector<T> v_out(n);
+    std::vector<T, Alloc> v_out(n);
     vecMatrixProd(v_in.data(), mat.data(), v_out.data(), m, n);
 
     return v_out;
diff --git a/pennylane_lightning/src/util/Macros.hpp b/pennylane_lightning/src/util/Macros.hpp
index 1b60d1e076..eeba364ba8 100644
--- a/pennylane_lightning/src/util/Macros.hpp
+++ b/pennylane_lightning/src/util/Macros.hpp
@@ -19,6 +19,74 @@
 
 #if defined(__GNUC__) || defined(__clang__)
 #define PL_UNREACHABLE __builtin_unreachable()
-#else
+#elif defined(_MSC_VER)
 #define PL_UNREACHABLE __assume(false)
+#else // Unsupported compiler
+#define PL_UNREACHABLE
+#endif
+
+#if defined(__AVX2__)
+#define PL_USE_AVX2 1
+[[maybe_unused]] static constexpr bool use_avx2 = true;
+#else
+[[maybe_unused]] static constexpr bool use_avx2 = false;
+#endif
+
+#if defined(__AVX512F__)
+#define PL_USE_AVX512F 1
+[[maybe_unused]] static constexpr bool use_avx512f = true;
+#else
+[[maybe_unused]] static constexpr bool use_avx512f = false;
+#endif
+
+#if defined(__AVX512DQ__)
+#define PL_USE_AVX512DQ 1
+[[maybe_unused]] static constexpr bool use_avx512dq = true;
+#else
+[[maybe_unused]] static constexpr bool use_avx512dq = false;
+#endif
+
+#if defined(__AVX512VL__)
+#define PL_USE_AVX512VL 1
+[[maybe_unused]] static constexpr bool use_avx512vl = true;
+#else
+[[maybe_unused]] static constexpr bool use_avx512vl = false;
+#endif
+
+#if defined(_OPENMP)
+#define PL_USE_OMP 1
+[[maybe_unused]] static constexpr bool use_openmp = true;
+#else
+[[maybe_unused]] static constexpr bool use_openmp = false;
+#endif
+
+#if defined(_OPENMP) && (_OPENMP >= 202011) // OpenMP 5.1 has `omp unroll`
+#define PL_UNROLL_LOOP _Pragma("omp unroll partial(8)")
+#elif defined(__clang__) // tested before __GNUC__, which clang also defines
+#define PL_UNROLL_LOOP _Pragma("unroll(8)")
+#elif defined(__GNUC__)
+#define PL_UNROLL_LOOP _Pragma("GCC unroll 8")
+#else // no known unroll pragma: expand to nothing
+#define PL_UNROLL_LOOP
+#endif
+
+// Define force inline
+//
+// PL_FORCE_INLINE asks the compiler to always inline the function it
+// decorates, but only when NDEBUG is defined; otherwise it expands to
+// nothing. The check is `defined(NDEBUG)` instead of `#if NDEBUG` because
+// NDEBUG is commonly defined without a value (`#define NDEBUG`), and an
+// empty macro makes `#if NDEBUG` ill-formed.
+// Note that the decorated function is not even marked `inline` when the
+// macro expands to nothing.
+#if defined(NDEBUG)
+#if defined(__GNUC__) || defined(__clang__)
+#define PL_FORCE_INLINE __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#define PL_FORCE_INLINE __forceinline
+#else // Unknown compiler: fall back to a plain `inline`
+#define PL_FORCE_INLINE inline
+#endif
+#else // NDEBUG not defined
+#define PL_FORCE_INLINE
+#endif
diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
new file mode 100644
index 0000000000..dcd8dd359d
--- /dev/null
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -0,0 +1,106 @@
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <new>
+
+#include "TypeList.hpp"
+
+namespace Pennylane {
+
+/// True iff `value` equals 2^k for some k >= 0. Zero has no set bit and is
+/// therefore reported as not being a power of two.
+constexpr auto constIsPerfectPowerOf2(size_t value) -> bool {
+    // Clearing the lowest set bit leaves zero only when a single bit was set.
+    return value != 0 && (value & (value - 1U)) == 0;
+}
+
+/// Allocator whose storage is aligned to `alignment` bytes (a power of two).
+template <class T, uint32_t alignment> struct AlignedAllocator {
+    static_assert(constIsPerfectPowerOf2(alignment),
+                  "Template parameter alignment must be power of 2.");
+    using value_type = T;
+
+    AlignedAllocator() = default;
+
+    template <class U> struct rebind {
+        using other = AlignedAllocator<U, alignment>;
+    };
+
+    template <typename U>
+    explicit constexpr AlignedAllocator(
+        [[maybe_unused]] const AlignedAllocator<U, alignment> &rhs) noexcept {}
+
+    /// Storage for `size` objects of T; nullptr when `size` is zero.
+    [[nodiscard]] T *allocate(std::size_t size) {
+        if (size == 0) {
+            return nullptr;
+        }
+        // aligned_alloc wants a byte count that is a multiple of `alignment`.
+        constexpr std::size_t mask = std::size_t{alignment} - 1U;
+        const std::size_t bytes = (sizeof(T) * size + mask) & ~mask;
+        // NOLINTNEXTLINE(hicpp-no-malloc, cppcoreguidelines-owning-memory)
+        void *p = std::aligned_alloc(alignment, bytes);
+        if (p == nullptr) {
+            throw std::bad_alloc();
+        }
+        return static_cast<T *>(p);
+    }
+
+    void deallocate(T *p, [[maybe_unused]] std::size_t size) noexcept {
+        std::free(
+            p); // NOLINT(hicpp-no-malloc, cppcoreguidelines-owning-memory)
+    }
+
+    template <class U> void construct(U *ptr) { ::new ((void *)ptr) U(); }
+    template <class U> void destroy(U *ptr) { ptr->~U(); }
+};
+
+template <class T, class U, uint32_t alignment>
+bool operator==([[maybe_unused]] const AlignedAllocator<T, alignment> &lhs,
+                [[maybe_unused]] const AlignedAllocator<U, alignment> &rhs) {
+    return true;
+}
+
+template <class T, class U, uint32_t alignment>
+bool operator!=([[maybe_unused]] const AlignedAllocator<T, alignment> &lhs,
+                [[maybe_unused]] const AlignedAllocator<U, alignment> &rhs) {
+    return false;
+}
+
+/**
+ * @brief This helper computes a common multiple of the alignments of all
+ * kernels.
+ *
+ * As every alignment must be a power of 2, we can simply choose the maximum
+ * alignment.
+ */
+template <typename TypeList> struct commonAlignmentHelper {
+    constexpr static uint32_t value =
+        std::max(TypeList::Type::packed_bytes,
+                 commonAlignmentHelper<typename TypeList::Next>::value);
+};
+template <> struct commonAlignmentHelper<void> {
+    constexpr static uint32_t value = 4U;
+};
+
+template <typename TypeList>
+[[maybe_unused]] constexpr static size_t common_alignment =
+    commonAlignmentHelper<TypeList>::value;
+
+template <class T, uint32_t alignment>
+using PLAllocator = std::conditional_t<alignment == 4, std::allocator<T>,
+                                       AlignedAllocator<T, alignment>>;
+} // namespace Pennylane
diff --git a/pennylane_lightning/src/util/TypeList.hpp b/pennylane_lightning/src/util/TypeList.hpp
index e288bd80a5..97db820da7 100644
--- a/pennylane_lightning/src/util/TypeList.hpp
+++ b/pennylane_lightning/src/util/TypeList.hpp
@@ -18,14 +18,19 @@
 #pragma once
 
 #include <cstdlib>
+#include <tuple>
 #include <type_traits>
+#include <utility>
 
 namespace Pennylane::Util {
 template <typename T, typename... Ts> struct TypeNode {
     using Type = T;
     using Next = TypeNode<Ts...>;
 };
-
+template <typename T> struct TypeNode<T, void> {
+    using Type = T;
+    using Next = void;
+};
 template <typename T> struct TypeNode<T> {
     using Type = T;
     using Next = void;
@@ -36,16 +41,22 @@ template <typename T> struct TypeNode<T> {
  */
 template <typename... Ts> using TypeList = TypeNode<Ts...>;
 
-template <typename TypeList, size_t n> struct getNthType {
-    static_assert(!std::is_same_v<typename TypeList::Next, void>,
-                  "The given n is larger than the length of the typelist.");
-    using Type = getNthType<typename TypeList::Next, n - 1>;
+template <typename TypeList, size_t n> struct getNth {
+    using Type = typename getNth<typename TypeList::Next, n - 1>::Type;
 };
 
-template <typename TypeList> struct getNthType<TypeList, 0> {
+template <typename TypeList> struct getNth<TypeList, 0> {
+    static_assert(!std::is_same_v<typename TypeList::Type, void>,
+                  "The given n is larger than the length of the type list.");
     using Type = typename TypeList::Type;
 };
 
+/**
+ * @brief Alias
+ */
+template <typename TypeList, size_t n>
+using getNthType = typename getNth<TypeList, n>::Type;
+
 template <typename TypeList> constexpr size_t length() {
     if constexpr (std::is_same_v<TypeList, void>) {
         return 0;
@@ -53,4 +64,15 @@ template <typename TypeList> constexpr size_t length() {
         return 1 + length<typename TypeList::Next>();
     }
 }
+
+template <typename T, typename U> struct PrependToTypeList;
+
+template <typename T, typename... Ts>
+struct PrependToTypeList<T, TypeNode<Ts...>> {
+    using Type = TypeNode<T, Ts...>;
+};
+template <typename T> struct PrependToTypeList<T, void> {
+    using Type = TypeNode<T, void>;
+};
+
 } // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/Util.hpp b/pennylane_lightning/src/util/Util.hpp
index 3b184b82f9..ca029c0609 100644
--- a/pennylane_lightning/src/util/Util.hpp
+++ b/pennylane_lightning/src/util/Util.hpp
@@ -404,4 +404,39 @@ auto chunkData(const Container<T> &data, std::size_t num_chunks)
 // type alias
 template <class T> using remove_cvref_t = typename remove_cvref<T>::type;
 
+/**
+ * @brief Iterate over all enum values (if BEGIN and END are defined).
+ *
+ * @tparam T enum type
+ * @tparam Func function to execute
+ */
+template <class T, class Func> void for_each_enum(Func &&func) {
+    for (auto e = T::BEGIN; e != T::END;
+         e = static_cast<T>(std::underlying_type_t<T>(e) + 1)) {
+        func(e);
+    }
+}
+template <class T, class U, class Func> void for_each_enum(Func &&func) {
+    for (auto e1 = T::BEGIN; e1 != T::END;
+         e1 = static_cast<T>(std::underlying_type_t<T>(e1) + 1)) {
+        for (auto e2 = U::BEGIN; e2 != U::END;
+             e2 = static_cast<U>(std::underlying_type_t<U>(e2) + 1)) {
+            func(e1, e2);
+        }
+    }
+}
+
+template <class PrecisionT, class TypeList> struct common_alignment {
+    constexpr static size_t value =
+        std::max(TypeList::Type::template required_alignment<PrecisionT>,
+                 common_alignment<PrecisionT, typename TypeList::Next>::value);
+};
+template <class PrecisionT> struct common_alignment<PrecisionT, void> {
+    constexpr static size_t value = std::alignment_of_v<PrecisionT>;
+};
+
+template <class PrecisionT, class TypeList>
+[[maybe_unused]] constexpr static size_t common_alignment_v =
+    common_alignment<PrecisionT, TypeList>::value;
+
 } // namespace Pennylane::Util

From 9398a0ab728bfd849c5f19b46b5cc4713b55a851 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sun, 27 Feb 2022 00:34:13 -0500
Subject: [PATCH 02/94] StateVector architecture refatored; pass tests

---
 pennylane_lightning/lightning_qubit.py        |   5 +
 .../src/algorithms/AdjointDiff.hpp            |  49 ++++---
 pennylane_lightning/src/bindings/Bindings.hpp |   7 +
 .../DefaultKernelsForStateVector.hpp          |  15 +-
 .../src/simulator/Measures.hpp                |  16 +--
 .../src/simulator/StateVectorCPU.hpp          | 110 ++------------
 .../src/simulator/StateVectorManagedCPU.cpp   |  19 +++
 .../src/simulator/StateVectorManagedCPU.hpp   | 134 ++++++++++++++++++
 ...ateVectorRaw.cpp => StateVectorRawCPU.cpp} |   6 +-
 ...ateVectorRaw.hpp => StateVectorRawCPU.hpp} |  25 ++--
 pennylane_lightning/src/tests/.clang-tidy     |   2 +-
 pennylane_lightning/src/tests/CMakeLists.txt  |   2 -
 .../src/tests/CreateAllWires.cpp              |   2 +-
 .../src/tests/Test_AdjDiff.cpp                |  35 +++--
 ...est_GateImplementations_CompareKernels.cpp |   2 -
 .../Test_GateImplementations_Nonparam.cpp     |  11 +-
 .../tests/Test_GateImplementations_Param.cpp  |  12 +-
 .../src/tests/Test_Measures.cpp               |  24 ++--
 .../src/tests/Test_StateVectorBase.cpp        |   0
 .../src/tests/Test_StateVectorCPU.cpp         |  76 +++++++---
 .../src/tests/Test_StateVectorRaw.cpp         |  47 ------
 pennylane_lightning/src/tests/Test_Util.cpp   |   6 +-
 .../src/tests/Test_VectorJacobianProduct.cpp  |  50 +++----
 pennylane_lightning/src/util/Memory.hpp       |   8 +-
 24 files changed, 370 insertions(+), 293 deletions(-)
 create mode 100644 pennylane_lightning/src/simulator/StateVectorManagedCPU.cpp
 create mode 100644 pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
 rename pennylane_lightning/src/simulator/{StateVectorRaw.cpp => StateVectorRawCPU.cpp} (82%)
 rename pennylane_lightning/src/simulator/{StateVectorRaw.hpp => StateVectorRawCPU.hpp} (82%)
 delete mode 100644 pennylane_lightning/src/tests/Test_StateVectorBase.cpp
 delete mode 100644 pennylane_lightning/src/tests/Test_StateVectorRaw.cpp

diff --git a/pennylane_lightning/lightning_qubit.py b/pennylane_lightning/lightning_qubit.py
index 4fe1851a69..f6dd12a3fe 100644
--- a/pennylane_lightning/lightning_qubit.py
+++ b/pennylane_lightning/lightning_qubit.py
@@ -116,6 +116,11 @@ def __init__(self, wires, *, kernel_for_ops=None, shots=None, batch_obs=False):
         super().__init__(wires, shots=shots)
         self._batch_obs = batch_obs
 
+        # Lightning keeps the simulator memory, which is managed by C++.
+        # Note that as C++ manages the data, we need to copy from this array when
+        # the result is used outside of the module
+        self.sim_ = None
+
     @classmethod
     def capabilities(cls):
         capabilities = super().capabilities().copy()
diff --git a/pennylane_lightning/src/algorithms/AdjointDiff.hpp b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
index 9b69139260..c717681b40 100644
--- a/pennylane_lightning/src/algorithms/AdjointDiff.hpp
+++ b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
@@ -27,7 +27,7 @@
 #include "Error.hpp"
 #include "JacobianTape.hpp"
 #include "LinearAlgebra.hpp"
-#include "StateVectorCPU.hpp"
+#include "StateVectorManagedCPU.hpp"
 
 #include <iostream>
 
@@ -49,7 +49,7 @@ namespace Pennylane::Algorithms {
  */
 template <class T = double> class AdjointJacobian {
   private:
-    using GeneratorFunc = void (*)(StateVectorCPU<T> &,
+    using GeneratorFunc = void (*)(StateVectorManagedCPU<T> &,
                                    const std::vector<size_t> &,
                                    const bool); // function pointer type
 
@@ -64,8 +64,8 @@ template <class T = double> class AdjointJacobian {
      * @param obs_index Observable index position of Jacobian to update.
      * @param param_index Parameter index position of Jacobian to update.
      */
-    inline void updateJacobian(const StateVectorCPU<T> &sv1,
-                               const StateVectorCPU<T> &sv2,
+    inline void updateJacobian(const StateVectorManagedCPU<T> &sv1,
+                               const StateVectorManagedCPU<T> &sv2,
                                std::vector<std::vector<T>> &jac,
                                T scaling_coeff, size_t obs_index,
                                size_t param_index) {
@@ -77,13 +77,13 @@ template <class T = double> class AdjointJacobian {
 
     /**
      * @brief Utility method to apply all operations from given `%OpsData<T>`
-     * object to `%StateVectorCPU<T>`
+     * object to `%StateVectorManagedCPU<T>`
      *
      * @param state Statevector to be updated.
      * @param operations Operations to apply.
      * @param adj Take the adjoint of the given operations.
      */
-    inline void applyOperations(StateVectorCPU<T> &state,
+    inline void applyOperations(StateVectorManagedCPU<T> &state,
                                 const OpsData<T> &operations,
                                 bool adj = false) {
         for (size_t op_idx = 0; op_idx < operations.getOpsName().size();
@@ -96,13 +96,13 @@ template <class T = double> class AdjointJacobian {
     }
     /**
      * @brief Utility method to apply the adjoint indexed operation from
-     * `%OpsData<T>` object to `%StateVectorCPU<T>`.
+     * `%OpsData<T>` object to `%StateVectorManagedCPU<T>`.
      *
      * @param state Statevector to be updated.
      * @param operations Operations to apply.
      * @param op_idx Adjointed operation index to apply.
      */
-    inline void applyOperationAdj(StateVectorCPU<T> &state,
+    inline void applyOperationAdj(StateVectorManagedCPU<T> &state,
                                   const OpsData<T> &operations, size_t op_idx) {
         state.applyOperation(operations.getOpsName()[op_idx],
                              operations.getOpsWires()[op_idx],
@@ -112,12 +112,12 @@ template <class T = double> class AdjointJacobian {
 
     /**
      * @brief Utility method to apply a given operations from given
-     * `%ObsDatum<T>` object to `%StateVectorCPU<T>`
+     * `%ObsDatum<T>` object to `%StateVectorManagedCPU<T>`
      *
      * @param state Statevector to be updated.
      * @param observable Observable to apply.
      */
-    inline void applyObservable(StateVectorCPU<T> &state,
+    inline void applyObservable(StateVectorManagedCPU<T> &state,
                                 const ObsDatum<T> &observable) {
         using namespace Pennylane::Util;
         for (size_t j = 0; j < observable.getSize(); j++) {
@@ -159,9 +159,10 @@ template <class T = double> class AdjointJacobian {
      * @param reference_state Reference statevector
      * @param observables Vector of observables to apply to each statevector.
      */
-    inline void applyObservables(std::vector<StateVectorCPU<T>> &states,
-                                 const StateVectorCPU<T> &reference_state,
-                                 const std::vector<ObsDatum<T>> &observables) {
+    inline void
+    applyObservables(std::vector<StateVectorManagedCPU<T>> &states,
+                     const StateVectorManagedCPU<T> &reference_state,
+                     const std::vector<ObsDatum<T>> &observables) {
         // clang-format off
         // Globally scoped exception value to be captured within OpenMP block.
         // See the following for OpenMP design decisions:
@@ -209,9 +210,9 @@ template <class T = double> class AdjointJacobian {
      * @param op_idx Index of given operation within operations list to take
      * adjoint of.
      */
-    inline void applyOperationsAdj(std::vector<StateVectorCPU<T>> &states,
-                                   const OpsData<T> &operations,
-                                   size_t op_idx) {
+    inline void
+    applyOperationsAdj(std::vector<StateVectorManagedCPU<T>> &states,
+                       const OpsData<T> &operations, size_t op_idx) {
         // clang-format off
         // Globally scoped exception value to be captured within OpenMP block.
         // See the following for OpenMP design decisions:
@@ -300,7 +301,7 @@ template <class T = double> class AdjointJacobian {
      * of parametric gates.
      *
      * For the statevector data associated with `psi` of length `num_elements`,
-     * we make internal copies to a `%StateVectorCPU<T>` object, with one
+     * we make internal copies to a `%StateVectorManagedCPU<T>` object, with one
      * per required observable. The `operations` will be applied to the internal
      * statevector copies, with the operation indices participating in the
      * gradient calculations given in `trainableParams`, and the overall number
@@ -335,7 +336,8 @@ template <class T = double> class AdjointJacobian {
             num_param_ops - 1; // total number of parametric ops
 
         // Create $U_{1:p}\vert \lambda \rangle$
-        StateVectorCPU<T> lambda(jd.getPtrStateVec(), jd.getSizeStateVec());
+        StateVectorManagedCPU<T> lambda(jd.getPtrStateVec(),
+                                        jd.getSizeStateVec());
 
         // Apply given operations to statevector if requested
         if (apply_operations) {
@@ -345,14 +347,15 @@ template <class T = double> class AdjointJacobian {
         const auto tp_begin = tp.begin();
         auto tp_it = tp.end();
 
-        StateVectorCPU<T> sv{lambda.getNumQubits(), Threading::SingleThread};
+        StateVectorManagedCPU<T> sv{lambda.getNumQubits(),
+                                    Threading::SingleThread};
         // Create observable-applied state-vectors
-        std::vector<StateVectorCPU<T>> H_lambda(
-            num_observables,
-            StateVectorCPU<T>{lambda.getNumQubits(), Threading::SingleThread});
+        std::vector<StateVectorManagedCPU<T>> H_lambda(
+            num_observables, StateVectorManagedCPU<T>{lambda.getNumQubits(),
+                                                      Threading::SingleThread});
         applyObservables(H_lambda, lambda, obs);
 
-        StateVectorCPU<T> mu(lambda.getNumQubits());
+        StateVectorManagedCPU<T> mu(lambda.getNumQubits());
 
         for (int op_idx = static_cast<int>(ops_name.size() - 1); op_idx >= 0;
              op_idx--) {
diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index 5d79774ffd..a1845ba67d 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -63,6 +63,13 @@ static auto create(pybind11::array_t<std::complex<PrecisionT>> &numpyArray)
         {data_ptr, static_cast<size_t>(numpyArrayInfo.shape[0])});
 }
 
+template <class PrecisionT = double> // copies sv into a 1-D NumPy array
+static auto toNumpyArray(const StateVectorManagedCPU<PrecisionT> &sv)
+    -> py::array_t<std::complex<PrecisionT>> {
+    return py::array_t<std::complex<PrecisionT>>(
+        {sv.getLength()}, {sizeof(std::complex<PrecisionT>)}, sv.getData());
+}
+
 /**
  * @brief Apply given list of operations to Numpy data array using C++
  * `%StateVector` class.
diff --git a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
index 72613bc386..3259ad5861 100644
--- a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
+++ b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
@@ -123,15 +123,18 @@ class DefaultKernelsForStateVector {
                                      all_memory_model, all_qubit_numbers,
                                      Gates::KernelType::LM);
 
-        instance.assignKernelForGate(GateOperation::IsingXX, all_threading,
-                                     all_memory_model, less_than(12),
-                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(
+            GateOperation::IsingXX, all_threading,
+            // NOLINTNEXTLINE(readability-magic-numbers)
+            all_memory_model, less_than(12), Gates::KernelType::LM);
         instance.assignKernelForGate(
             GateOperation::IsingXX, all_threading, all_memory_model,
+            // NOLINTNEXTLINE(readability-magic-numbers)
             in_between_closed(12, 20), Gates::KernelType::PI);
-        instance.assignKernelForGate(GateOperation::IsingXX, all_threading,
-                                     all_memory_model, larger_than(20),
-                                     Gates::KernelType::LM);
+        instance.assignKernelForGate(
+            GateOperation::IsingXX, all_threading,
+            // NOLINTNEXTLINE(readability-magic-numbers)
+            all_memory_model, larger_than(20), Gates::KernelType::LM);
 
         instance.assignKernelForGate(GateOperation::IsingYY, all_threading,
                                      all_memory_model, all_qubit_numbers,
diff --git a/pennylane_lightning/src/simulator/Measures.hpp b/pennylane_lightning/src/simulator/Measures.hpp
index f2f1cc010a..d03031ee36 100644
--- a/pennylane_lightning/src/simulator/Measures.hpp
+++ b/pennylane_lightning/src/simulator/Measures.hpp
@@ -26,8 +26,8 @@
 #include <vector>
 
 #include "LinearAlgebra.hpp"
-#include "StateVectorCPU.hpp"
-#include "StateVectorRaw.hpp"
+#include "StateVectorManagedCPU.hpp"
+#include "StateVectorRawCPU.hpp"
 
 namespace Pennylane {
 /**
@@ -39,14 +39,14 @@ namespace Pennylane {
  *
  * @tparam fp_t Floating point precision of underlying measurements.
  */
-template <class fp_t = double, class SVType = StateVectorRaw<fp_t>>
+template <class fp_t = double, class SVType = StateVectorRawCPU<fp_t>>
 class Measures {
   private:
     const SVType &original_statevector;
     using CFP_t = std::complex<fp_t>;
 
   public:
-    Measures(const SVType &provided_statevector)
+    explicit Measures(const SVType &provided_statevector)
         : original_statevector{provided_statevector} {};
 
     /**
@@ -123,7 +123,7 @@ class Measures {
                 const std::vector<size_t> &wires) {
         // Copying the original state vector, for the application of the
         // observable operator.
-        StateVectorCPU<fp_t> operator_statevector(original_statevector);
+        StateVectorManagedCPU<fp_t> operator_statevector(original_statevector);
 
         operator_statevector.applyMatrix(matrix, wires);
 
@@ -143,7 +143,7 @@ class Measures {
                 const std::vector<size_t> &wires) {
         // Copying the original state vector, for the application of the
         // observable operator.
-        StateVectorCPU<fp_t> operator_statevector(original_statevector);
+        StateVectorManagedCPU<fp_t> operator_statevector(original_statevector);
 
         operator_statevector.applyOperation(operation, wires);
 
@@ -190,7 +190,7 @@ class Measures {
     fp_t var(const std::string &operation, const std::vector<size_t> &wires) {
         // Copying the original state vector, for the application of the
         // observable operator.
-        StateVectorCPU<fp_t> operator_statevector(original_statevector);
+        StateVectorManagedCPU<fp_t> operator_statevector(original_statevector);
 
         operator_statevector.applyOperation(operation, wires);
 
@@ -216,7 +216,7 @@ class Measures {
              const std::vector<size_t> &wires) {
         // Copying the original state vector, for the application of the
         // observable operator.
-        StateVectorCPU<fp_t> operator_statevector(original_statevector);
+        StateVectorManagedCPU<fp_t> operator_statevector(original_statevector);
 
         operator_statevector.applyMatrix(matrix, wires);
 
diff --git a/pennylane_lightning/src/simulator/StateVectorCPU.hpp b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
index a003adf1ba..89ff8d40a4 100644
--- a/pennylane_lightning/src/simulator/StateVectorCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
@@ -22,22 +22,18 @@
 namespace Pennylane {
 
 /**
- * @brief StateVector class where data resides in CPU memory. Memory ownership
- * resides within class.
+ * @brief StateVector class where data resides in CPU memory.
  *
- * We currently use std::unique_ptr to C-style array as we want to choose
- * allocator in runtime. This is impossible with std::vector.
- *
- * @tparam PrecisionT
+ * @tparam PrecisionT Data floating point type
+ * @tparam Derived Derived class for CRTP.
  */
-template <class PrecisionT = double>
-class StateVectorCPU
-    : public StateVectorBase<PrecisionT, StateVectorCPU<PrecisionT>> {
+template <class PrecisionT, class Derived>
+class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
   public:
     using ComplexPrecisionT = std::complex<PrecisionT>;
 
   private:
-    using BaseType = StateVectorBase<PrecisionT, StateVectorCPU>;
+    using BaseType = StateVectorBase<PrecisionT, Derived>;
 
     Threading threading_;
     CPUMemoryModel memory_model_;
@@ -46,8 +42,6 @@ class StateVectorCPU
         kernel_for_gates_;
     std::unordered_map<Gates::GeneratorOperation, Gates::KernelType>
         kernel_for_generators_;
-    std::unique_ptr<ComplexPrecisionT[]>
-        data_; // NOLINT(modernize-avoid-c-arrays)
 
     void setKernels(size_t num_qubits, Threading threading,
                     CPUMemoryModel memory_model) {
@@ -58,90 +52,15 @@ class StateVectorCPU
             num_qubits, threading, memory_model);
     }
 
-  public:
-    explicit StateVectorCPU(size_t num_qubits,
-                            Threading threading = bestThreading(),
-                            CPUMemoryModel memory_model = bestCPUMemoryModel())
+  protected:
+    explicit StateVectorCPU(size_t num_qubits, Threading threading,
+                            CPUMemoryModel memory_model)
         : BaseType(num_qubits), threading_{threading}, memory_model_{
                                                            memory_model} {
-
         setKernels(num_qubits, threading, memory_model);
-
-        size_t length = BaseType::getLength();
-        data_ = std::unique_ptr<ComplexPrecisionT[]>{new (std::align_val_t{
-            64}) ComplexPrecisionT[length]}; // NOLINT(modernize-avoid-c-arrays)
-        std::fill(data_.get(), data_.get() + length,
-                  ComplexPrecisionT{0.0, 0.0});
-        data_[0] = {1, 0};
-    }
-
-    template <class OtherDerived>
-    explicit StateVectorCPU(
-        const StateVectorBase<PrecisionT, OtherDerived> &other,
-        Threading threading = bestThreading(),
-        CPUMemoryModel memory_model = bestCPUMemoryModel())
-        : BaseType(other.getNumQubits()), threading_{threading},
-          memory_model_{memory_model} {
-
-        size_t length = BaseType::getLength();
-        data_ = std::unique_ptr<ComplexPrecisionT[]>{new (std::align_val_t{
-            64}) ComplexPrecisionT[length]}; // NOLINT(modernize-avoid-c-arrays)
-
-        std::copy(other.getData(), other.getData() + length, data_.get());
-
-        setKernels(BaseType::getNumQubits(), threading, memory_model);
-    }
-
-    StateVectorCPU(const ComplexPrecisionT *other_data, size_t other_size,
-                   Threading threading = bestThreading(),
-                   CPUMemoryModel memory_model = bestCPUMemoryModel())
-        : BaseType(Util::log2PerfectPower(other_size)), threading_{threading},
-          memory_model_{memory_model} {
-        PL_ABORT_IF_NOT(Util::isPerfectPowerOf2(other_size),
-                        "The size of provided data must be a power of 2.");
-        data_ = std::unique_ptr<ComplexPrecisionT[]>{
-            new (std::align_val_t{64}) ComplexPrecisionT
-                [other_size]}; // NOLINT(modernize-avoid-c-arrays)
-        setKernels(BaseType::getNumQubits(), threading, memory_model);
-
-        updateData(other_data);
-    }
-
-    template <class Alloc>
-    explicit StateVectorCPU(
-        const std::vector<std::complex<PrecisionT>, Alloc> &rhs,
-        Threading threading = bestThreading(),
-        CPUMemoryModel memory_model = bestCPUMemoryModel())
-        : StateVectorCPU(rhs.data(), rhs.size(), threading,
-                         memory_model) // NOLINT(hicpp-member-init)
-                                       // this is false positive for delegating
-                                       // constructor from clang-tidy
-    {}
-
-    StateVectorCPU(const StateVectorCPU &rhs)
-        : BaseType(rhs.getNumQubits()), threading_{rhs.threading_},
-          memory_model_{rhs.memory_model_} {
-        setKernels(BaseType::getNumQubits(), threading_, memory_model_);
-
-        size_t length = BaseType::getLength();
-        data_ = std::unique_ptr<ComplexPrecisionT[]>{new (std::align_val_t{
-            64}) ComplexPrecisionT[length]}; // NOLINT(modernize-avoid-c-arrays)
-        std::copy(rhs.getData(), rhs.getData() + length, data_.get());
-    }
-
-    StateVectorCPU(StateVectorCPU &&) noexcept = default;
-
-    StateVectorCPU &operator=(const StateVectorCPU &) = delete;
-    StateVectorCPU &operator=(StateVectorCPU &&) noexcept = default;
-
-    ~StateVectorCPU() = default;
-
-    [[nodiscard]] auto getData() -> ComplexPrecisionT * { return data_.get(); }
-
-    [[nodiscard]] auto getData() const -> const ComplexPrecisionT * {
-        return data_.get();
     }
 
+  public:
     [[nodiscard]] inline auto
     getKernelForGate(Gates::GateOperation gate_op) const -> Gates::KernelType {
         return kernel_for_gates_.at(gate_op);
@@ -152,15 +71,6 @@ class StateVectorCPU
         -> Gates::KernelType {
         return kernel_for_generators_.at(gntr_op);
     }
-
-    /**
-     * @brief Update data of the class to new_data
-     *
-     * @param new_data std::vector contains data.
-     */
-    void updateData(const ComplexPrecisionT *data) {
-        std::copy(data, data + BaseType::getLength(), data_.get());
-    }
 };
 
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/StateVectorManagedCPU.cpp b/pennylane_lightning/src/simulator/StateVectorManagedCPU.cpp
new file mode 100644
index 0000000000..90a13bf549
--- /dev/null
+++ b/pennylane_lightning/src/simulator/StateVectorManagedCPU.cpp
@@ -0,0 +1,19 @@
+// Copyright 2021 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "StateVectorManagedCPU.hpp"
+
+// explicit instantiation
+template class Pennylane::StateVectorManagedCPU<float>;
+template class Pennylane::StateVectorManagedCPU<double>;
diff --git a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
new file mode 100644
index 0000000000..b36aac7f6d
--- /dev/null
+++ b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
@@ -0,0 +1,134 @@
+// Copyright 2021 Xanadu Quantum Technologies Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "BitUtil.hpp"
+#include "DispatchKeys.hpp"
+#include "Gates.hpp"
+#include "KernelType.hpp"
+#include "Memory.hpp"
+#include "StateVectorBase.hpp"
+#include "StateVectorCPU.hpp"
+#include "Util.hpp"
+
+namespace Pennylane {
+
+/**
+ * @brief StateVector class where data resides in CPU memory. Memory ownership
+ * resides within class.
+ *
+ * Data lives in a 64-byte aligned C-style array owned by std::unique_ptr, as
+ * we want to choose the allocator at runtime (impossible with std::vector).
+ *
+ * @tparam PrecisionT Floating point precision of the statevector data.
+ */
+template <class PrecisionT = double>
+class StateVectorManagedCPU
+    : public StateVectorCPU<PrecisionT, StateVectorManagedCPU<PrecisionT>> {
+  public:
+    using ComplexPrecisionT = std::complex<PrecisionT>;
+
+  private:
+    using BaseType = StateVectorCPU<PrecisionT, StateVectorManagedCPU>;
+
+    /// Deleter matching the aligned `operator new[]` used by allocate().
+    struct AlignedDeleter {
+        static constexpr std::align_val_t alignment{64};
+        void operator()(ComplexPrecisionT *ptr) const noexcept {
+            ::operator delete[](ptr, alignment);
+        }
+    };
+    // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
+    using DataPtr = std::unique_ptr<ComplexPrecisionT[], AlignedDeleter>;
+    DataPtr data_;
+
+    /// Allocate an aligned buffer of `length` amplitudes, all set to zero.
+    static auto allocate(size_t length) -> DataPtr {
+        auto *raw = static_cast<ComplexPrecisionT *>(::operator new[](
+            length * sizeof(ComplexPrecisionT), AlignedDeleter::alignment));
+        std::uninitialized_fill_n(raw, length, ComplexPrecisionT{0.0, 0.0});
+        return DataPtr{raw};
+    }
+
+  public:
+    explicit StateVectorManagedCPU(
+        size_t num_qubits, Threading threading = bestThreading(),
+        CPUMemoryModel memory_model = bestCPUMemoryModel())
+        : BaseType{num_qubits, threading, memory_model},
+          data_{allocate(BaseType::getLength())} {
+        data_[0] = {1, 0}; // allocate() zeroed the buffer; set |0...0>
+    }
+
+    template <class OtherDerived>
+    explicit StateVectorManagedCPU(
+        const StateVectorBase<PrecisionT, OtherDerived> &other,
+        Threading threading = bestThreading(),
+        CPUMemoryModel memory_model = bestCPUMemoryModel())
+        : BaseType(other.getNumQubits(), threading, memory_model),
+          data_{allocate(BaseType::getLength())} {
+        // Kernels were already selected by the BaseType constructor.
+        std::copy(other.getData(), other.getData() + BaseType::getLength(),
+                  data_.get());
+    }
+
+    StateVectorManagedCPU(const ComplexPrecisionT *other_data,
+                          size_t other_size,
+                          Threading threading = bestThreading(),
+                          CPUMemoryModel memory_model = bestCPUMemoryModel())
+        : BaseType(Util::log2PerfectPower(other_size), threading,
+                   memory_model) {
+        PL_ABORT_IF_NOT(Util::isPerfectPowerOf2(other_size),
+                        "The size of provided data must be a power of 2.");
+
+        data_ = allocate(other_size);
+        updateData(other_data);
+    }
+
+    // Clang-tidy gives false positive for delegating constructor
+    template <class Alloc>
+    // NOLINTNEXTLINE(hicpp-member-init)
+    explicit StateVectorManagedCPU(
+        const std::vector<std::complex<PrecisionT>, Alloc> &rhs,
+        Threading threading = bestThreading(),
+        CPUMemoryModel memory_model = bestCPUMemoryModel())
+        : StateVectorManagedCPU(rhs.data(), rhs.size(), threading,
+                                memory_model) {}
+
+    StateVectorManagedCPU(const StateVectorManagedCPU &rhs)
+        : BaseType(rhs), data_{allocate(BaseType::getLength())} {
+        updateData(rhs.getData());
+    }
+
+    StateVectorManagedCPU(StateVectorManagedCPU &&) noexcept = default;
+
+    StateVectorManagedCPU &operator=(const StateVectorManagedCPU &) = delete;
+    StateVectorManagedCPU &
+    operator=(StateVectorManagedCPU &&) noexcept = default;
+
+    ~StateVectorManagedCPU() = default;
+
+    [[nodiscard]] auto getData() -> ComplexPrecisionT * { return data_.get(); }
+
+    [[nodiscard]] auto getData() const -> const ComplexPrecisionT * {
+        return data_.get();
+    }
+
+    /**
+     * @brief Overwrite the statevector with the amplitudes pointed to by data.
+     *
+     * @param data Source array; BaseType::getLength() elements are copied.
+     */
+    void updateData(const ComplexPrecisionT *data) {
+        std::copy(data, data + BaseType::getLength(), data_.get());
+    }
+};
+
+} // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/StateVectorRaw.cpp b/pennylane_lightning/src/simulator/StateVectorRawCPU.cpp
similarity index 82%
rename from pennylane_lightning/src/simulator/StateVectorRaw.cpp
rename to pennylane_lightning/src/simulator/StateVectorRawCPU.cpp
index 65e6664e09..7454f66a65 100644
--- a/pennylane_lightning/src/simulator/StateVectorRaw.cpp
+++ b/pennylane_lightning/src/simulator/StateVectorRawCPU.cpp
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "StateVectorRaw.hpp"
+#include "StateVectorRawCPU.hpp"
 
 // explicit instantiation
-template class Pennylane::StateVectorRaw<float>;
-template class Pennylane::StateVectorRaw<double>;
+template class Pennylane::StateVectorRawCPU<float>;
+template class Pennylane::StateVectorRawCPU<double>;
diff --git a/pennylane_lightning/src/simulator/StateVectorRaw.hpp b/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp
similarity index 82%
rename from pennylane_lightning/src/simulator/StateVectorRaw.hpp
rename to pennylane_lightning/src/simulator/StateVectorRawCPU.hpp
index f25b2c2151..57c0775774 100644
--- a/pennylane_lightning/src/simulator/StateVectorRaw.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp
@@ -24,7 +24,7 @@
 
 #include "BitUtil.hpp"
 #include "Error.hpp"
-#include "StateVectorBase.hpp"
+#include "StateVectorCPU.hpp"
 
 #include <iostream>
 
@@ -44,10 +44,10 @@ namespace Pennylane {
  * @tparam PrecisionT Floating point precision of underlying statevector data.
  */
 template <class PrecisionT = double>
-class StateVectorRaw
-    : public StateVectorBase<PrecisionT, StateVectorRaw<PrecisionT>> {
+class StateVectorRawCPU
+    : public StateVectorCPU<PrecisionT, StateVectorRawCPU<PrecisionT>> {
   public:
-    using Base = StateVectorBase<PrecisionT, StateVectorRaw<PrecisionT>>;
+    using BaseType = StateVectorCPU<PrecisionT, StateVectorRawCPU<PrecisionT>>;
     using ComplexPrecisionT = std::complex<PrecisionT>;
 
   private:
@@ -61,9 +61,10 @@ class StateVectorRaw
      * @param data Raw data pointer.
      * @param length The size of the data, i.e. 2^(number of qubits).
      */
-    StateVectorRaw(ComplexPrecisionT *data, size_t length)
-        : StateVectorBase<PrecisionT, StateVectorRaw<PrecisionT>>(
-              Util::log2PerfectPower(length)),
+    StateVectorRawCPU(ComplexPrecisionT *data, size_t length,
+                      Threading threading = bestThreading())
+        : BaseType{Util::log2PerfectPower(length), threading,
+                   getMemoryModel(static_cast<void *>(data))},
           data_{data}, length_(length) {
         // check length is perfect power of 2
         if (!Util::isPerfectPowerOf2(length)) {
@@ -74,14 +75,6 @@ class StateVectorRaw
         }
     }
 
-    StateVectorRaw(const StateVectorRaw &) = default;
-    StateVectorRaw(StateVectorRaw &&) noexcept = default;
-
-    auto operator=(const StateVectorRaw &) -> StateVectorRaw & = default;
-    auto operator=(StateVectorRaw &&) noexcept -> StateVectorRaw & = default;
-
-    ~StateVectorRaw() = default;
-
     /**
      * @brief Get the underlying data pointer.
      *
@@ -110,7 +103,7 @@ class StateVectorRaw
                      " is given."); // TODO: change to std::format in C++20
         }
         data_ = data;
-        Base::setNumQubits(Util::log2PerfectPower(length));
+        BaseType::setNumQubits(Util::log2PerfectPower(length));
         length_ = length;
     }
 
diff --git a/pennylane_lightning/src/tests/.clang-tidy b/pennylane_lightning/src/tests/.clang-tidy
index 0a70c347b0..3b5744a4b0 100644
--- a/pennylane_lightning/src/tests/.clang-tidy
+++ b/pennylane_lightning/src/tests/.clang-tidy
@@ -1,5 +1,5 @@
 ---
-Checks:          'clang-diagnostic-*,clang-analyzer-*,-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,-readability-magic-numbers,hicpp-*,-hicpp-no-array-decay,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-avoid-non-const-global-variables,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions'
+Checks:          'clang-diagnostic-*,clang-analyzer-*,-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,-modernize-avoid-c-arrays,-readability-magic-numbers,hicpp-*,-hicpp-no-array-decay,-hicpp-avoid-c-arrays,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions'
 WarningsAsErrors: '*'
 HeaderFilterRegex: '.*'
 AnalyzeTemporaryDtors: false
diff --git a/pennylane_lightning/src/tests/CMakeLists.txt b/pennylane_lightning/src/tests/CMakeLists.txt
index fbe9b621ea..64ebd3a39d 100644
--- a/pennylane_lightning/src/tests/CMakeLists.txt
+++ b/pennylane_lightning/src/tests/CMakeLists.txt
@@ -70,7 +70,6 @@ target_link_libraries(compile_time_tests lightning_gates lightning_utils)
 
 set(TEST_SOURCES CreateAllWires.cpp
                  Test_AdjDiff.cpp
-#                 Test_Bindings.cpp
                  Test_DynamicDispatcher.cpp
                  Test_DefaultKernelsForStateVector.cpp
                  Test_GateImplementations_CompareKernels.cpp
@@ -84,7 +83,6 @@ set(TEST_SOURCES CreateAllWires.cpp
                  Test_Measures.cpp
                  Test_OpToMemberFuncPtr.cpp
                  Test_StateVectorCPU.cpp
-                 Test_StateVectorRaw.cpp
                  Test_Util.cpp
                  Test_VectorJacobianProduct.cpp)
 
diff --git a/pennylane_lightning/src/tests/CreateAllWires.cpp b/pennylane_lightning/src/tests/CreateAllWires.cpp
index 43a7e80ce4..4738554b54 100644
--- a/pennylane_lightning/src/tests/CreateAllWires.cpp
+++ b/pennylane_lightning/src/tests/CreateAllWires.cpp
@@ -13,7 +13,7 @@ auto crateAllWires(size_t n_qubits, Gates::GateOperation gate_op, bool order)
             wires.reserve(Util::popcount(k));
 
             for (size_t i = 0; i < n_qubits; i++) {
-                if (((k >> i) & 1) == 1) {
+                if (((k >> i) & 1U) == 1U) {
                     wires.emplace_back(i);
                 }
             }
diff --git a/pennylane_lightning/src/tests/Test_AdjDiff.cpp b/pennylane_lightning/src/tests/Test_AdjDiff.cpp
index 6a05a36018..696d66d41d 100644
--- a/pennylane_lightning/src/tests/Test_AdjDiff.cpp
+++ b/pennylane_lightning/src/tests/Test_AdjDiff.cpp
@@ -13,7 +13,7 @@
 #include <catch2/catch.hpp>
 
 #include "AdjointDiff.hpp"
-#include "StateVectorCPU.hpp"
+#include "StateVectorRawCPU.hpp"
 #include "Util.hpp"
 
 #include "TestHelpers.hpp"
@@ -50,12 +50,10 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RX, Obs=Z",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RX"}, {{p}}, {{0}}, {false});
 
-            /*
             std::vector<std::complex<double>> cdata(0b1 << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
-            */
 
-            StateVectorCPU<double> psi(num_qubits);
+            StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
 
             std::vector<size_t> tp{0};
             std::vector<ObsDatum<double>> obs_ls{obs};
@@ -84,7 +82,10 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RY, Obs=X",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RY"}, {{p}}, {{0}}, {false});
 
-            StateVectorCPU<double> psi(num_qubits);
+            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            cdata[0] = std::complex<double>{1, 0};
+
+            StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
 
             std::vector<size_t> tp{0};
             std::vector<ObsDatum<double>> obs_ls{obs};
@@ -108,7 +109,9 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RX, Obs=[Z,Z]",
         const size_t num_obs = 2;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        StateVectorCPU<double> psi(num_qubits);
+        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
+        cdata[0] = std::complex<double>{1, 0};
 
         auto obs1 = ObsDatum<double>({"PauliZ"}, {{}}, {{0}});
         auto obs2 = ObsDatum<double>({"PauliZ"}, {{}}, {{1}});
@@ -137,7 +140,9 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[Z,Z,Z]",
         const size_t num_obs = 3;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        StateVectorCPU<double> psi(num_qubits);
+        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
+        cdata[0] = std::complex<double>{1, 0};
 
         auto obs1 = ObsDatum<double>({"PauliZ"}, {{}}, {{0}});
         auto obs2 = ObsDatum<double>({"PauliZ"}, {{}}, {{1}});
@@ -174,7 +179,9 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[Z,Z,Z], "
         std::vector<double> jacobian(num_obs * num_params, 0);
         std::vector<size_t> t_params{0, 2};
 
-        StateVectorCPU<double> psi(num_qubits);
+        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
+        cdata[0] = std::complex<double>{1, 0};
 
         auto obs1 = ObsDatum<double>({"PauliZ"}, {{}}, {{0}});
         auto obs2 = ObsDatum<double>({"PauliZ"}, {{}}, {{1}});
@@ -207,7 +214,9 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[ZZZ]",
         const size_t num_obs = 1;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        StateVectorCPU<double> psi(num_qubits);
+        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
+        cdata[0] = std::complex<double>{1, 0};
 
         auto obs = ObsDatum<double>({"PauliZ", "PauliZ", "PauliZ"},
                                     {{}, {}, {}}, {{0}, {1}, {2}});
@@ -240,7 +249,9 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=Mixed, Obs=[XXX]",
         const size_t num_obs = 1;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        StateVectorCPU<double> psi(num_qubits);
+        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
+        cdata[0] = std::complex<double>{1, 0};
 
         auto obs = ObsDatum<double>({"PauliX", "PauliX", "PauliX"},
                                     {{}, {}, {}}, {{0}, {1}, {2}});
@@ -301,7 +312,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Decomposed Rot gate, non "
 
             std::vector<std::complex<double>> cdata{INVSQRT2<double>(),
                                                     -INVSQRT2<double>()};
-            StateVectorCPU<double> psi(cdata.data(), cdata.size());
+            StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
 
             auto obs = ObsDatum<double>({"PauliZ"}, {{}}, {{0}});
             auto ops = OpsData<double>(
@@ -342,7 +353,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Mixed Ops, Obs and TParams",
 
         std::vector<std::complex<double>> cdata{ONE<double>(), ZERO<double>(),
                                                 ZERO<double>(), ZERO<double>()};
-        StateVectorCPU<double> psi(cdata);
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
 
         auto obs = ObsDatum<double>({"PauliX", "PauliZ"}, {{}, {}}, {{0}, {1}});
         auto ops = OpsData<double>(
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
index 30d6894b08..aea72009e9 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
@@ -25,9 +25,7 @@ using namespace Pennylane;
 using namespace Pennylane::Gates;
 using namespace Pennylane::Util;
 
-namespace {
 using std::vector;
-}
 
 template <typename TypeList> std::string kernelsToString() {
     if constexpr (!std::is_same_v<TypeList, void>) {
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
index 85772294ff..2dde03af2b 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
@@ -21,9 +21,7 @@
  */
 using namespace Pennylane;
 
-namespace {
 using std::vector;
-}
 
 /**
  * @brief Run test suit only when the gate is defined
@@ -75,13 +73,14 @@ void testApplyPauliX() {
                     << ", PauliX - " << PrecisionToName<PrecisionT>::value) {
         for (size_t index = 0; index < num_qubits; index++) {
             auto st = createZeroState<PrecisionT>(num_qubits);
-            CHECK(st[0] == Util::ONE<PrecisionT>());
 
             GateImplementation::applyPauliX(st.data(), num_qubits, {index},
                                             false);
-            CHECK(st[0] == Util::ZERO<PrecisionT>());
-            CHECK(st[0b1 << (num_qubits - index - 1)] ==
-                  Util::ONE<PrecisionT>());
+
+            std::string expected_str("000");
+            expected_str[index] = '1';
+            REQUIRE(st ==
+                    PLApprox(createProductState<PrecisionT>(expected_str)));
         }
     }
 }
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
index 8e594fe3d6..74e6f3a767 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
@@ -245,15 +245,15 @@ void testApplyRot() {
         std::vector<PrecisionT>{2.3, 0.1, 0.4}};
 
     std::vector<std::vector<ComplexPrecisionT>> expected_results{
-        std::vector<ComplexPrecisionT>(0b1 << num_qubits),
-        std::vector<ComplexPrecisionT>(0b1 << num_qubits),
-        std::vector<ComplexPrecisionT>(0b1 << num_qubits)};
+        std::vector<ComplexPrecisionT>(1U << num_qubits),
+        std::vector<ComplexPrecisionT>(1U << num_qubits),
+        std::vector<ComplexPrecisionT>(1U << num_qubits)};
 
     for (size_t i = 0; i < angles.size(); i++) {
         const auto rot_mat =
             Gates::getRot<PrecisionT>(angles[i][0], angles[i][1], angles[i][2]);
         expected_results[i][0] = rot_mat[0];
-        expected_results[i][0b1 << (num_qubits - i - 1)] = rot_mat[2];
+        expected_results[i][1U << (num_qubits - i - 1)] = rot_mat[2];
     }
 
     for (size_t index = 0; index < num_qubits; index++) {
@@ -1233,8 +1233,8 @@ void testApplyCRot() {
     std::vector<ComplexPrecisionT> expected_results(8);
     const auto rot_mat =
         Gates::getRot<PrecisionT>(angles[0], angles[1], angles[2]);
-    expected_results[0b1 << (num_qubits - 1)] = rot_mat[0];
-    expected_results[(0b1 << num_qubits) - 2] = rot_mat[2];
+    expected_results[1U << (num_qubits - 1)] = rot_mat[0];
+    expected_results[(1U << num_qubits) - 2] = rot_mat[2];
 
     DYNAMIC_SECTION(GateImplementation::name
                     << ", CRot0,1 |000> -> |000> - "
diff --git a/pennylane_lightning/src/tests/Test_Measures.cpp b/pennylane_lightning/src/tests/Test_Measures.cpp
index b7ec1e8fd5..6f18a458df 100644
--- a/pennylane_lightning/src/tests/Test_Measures.cpp
+++ b/pennylane_lightning/src/tests/Test_Measures.cpp
@@ -3,7 +3,7 @@
 #include <vector>
 
 #include "Measures.hpp"
-#include "StateVectorCPU.hpp"
+#include "StateVectorManagedCPU.hpp"
 #include "Util.hpp"
 
 #include <catch2/catch.hpp>
@@ -17,14 +17,14 @@ using std::string;
 using std::vector;
 }; // namespace
 
-StateVectorCPU<double> Initializing_StateVector() {
+StateVectorManagedCPU<double> Initializing_StateVector() {
     // Defining a StateVector in a non-trivial configuration:
     size_t num_qubits = 3;
     size_t data_size = std::pow(2, num_qubits);
 
     std::vector<std::complex<double>> arr(data_size, 0);
     arr[0] = 1;
-    StateVectorCPU<double> Measured_StateVector(arr.data(), data_size);
+    StateVectorManagedCPU<double> Measured_StateVector(arr.data(), data_size);
 
     std::vector<size_t> wires;
 
@@ -64,11 +64,13 @@ TEST_CASE("Probabilities", "[Measures]") {
         {1, 2},    {2, 1},    {0},       {1},       {2}};
 
     // Defining the State Vector that will be measured.
-    StateVectorCPU<double> Measured_StateVector = Initializing_StateVector();
+    StateVectorManagedCPU<double> Measured_StateVector =
+        Initializing_StateVector();
 
     // Initializing the measures class.
     // It will attach to the StateVector, allowing measures to keep been taken.
-    Measures<double, StateVectorCPU<double>> Measurer(Measured_StateVector);
+    Measures<double, StateVectorManagedCPU<double>> Measurer(
+        Measured_StateVector);
 
     vector<double> probabilities;
 
@@ -90,11 +92,13 @@ TEST_CASE("Probabilities", "[Measures]") {
 
 TEST_CASE("Expected Values", "[Measures]") {
     // Defining the State Vector that will be measured.
-    StateVectorCPU<double> Measured_StateVector = Initializing_StateVector();
+    StateVectorManagedCPU<double> Measured_StateVector =
+        Initializing_StateVector();
 
     // Initializing the measures class.
     // It will attach to the StateVector, allowing measures to keep been taken.
-    Measures<double, StateVectorCPU<double>> Measurer(Measured_StateVector);
+    Measures<double, StateVectorManagedCPU<double>> Measurer(
+        Measured_StateVector);
 
     SECTION("Testing single operation defined by a matrix:") {
         vector<std::complex<double>> PauliX = {0, 1, 1, 0};
@@ -162,11 +166,13 @@ TEST_CASE("Expected Values", "[Measures]") {
 
 TEST_CASE("Variances", "[Measures]") {
     // Defining the State Vector that will be measured.
-    StateVectorCPU<double> Measured_StateVector = Initializing_StateVector();
+    StateVectorManagedCPU<double> Measured_StateVector =
+        Initializing_StateVector();
 
     // Initializing the measures class.
     // It will attach to the StateVector, allowing measures to keep been taken.
-    Measures<double, StateVectorCPU<double>> Measurer(Measured_StateVector);
+    Measures<double, StateVectorManagedCPU<double>> Measurer(
+        Measured_StateVector);
 
     SECTION("Testing single operation defined by a matrix:") {
         vector<std::complex<double>> PauliX = {0, 1, 1, 0};
diff --git a/pennylane_lightning/src/tests/Test_StateVectorBase.cpp b/pennylane_lightning/src/tests/Test_StateVectorBase.cpp
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/pennylane_lightning/src/tests/Test_StateVectorCPU.cpp b/pennylane_lightning/src/tests/Test_StateVectorCPU.cpp
index 17fd667c19..5b1e263de2 100644
--- a/pennylane_lightning/src/tests/Test_StateVectorCPU.cpp
+++ b/pennylane_lightning/src/tests/Test_StateVectorCPU.cpp
@@ -9,40 +9,80 @@
 
 #include <catch2/catch.hpp>
 
-#include "StateVectorCPU.hpp"
-#include "StateVectorRaw.hpp"
+#include "StateVectorManagedCPU.hpp"
+#include "StateVectorRawCPU.hpp"
 #include "Util.hpp"
 
 #include "TestHelpers.hpp"
 
 using namespace Pennylane;
 
-TEMPLATE_TEST_CASE("StateVectorCPU::StateVectorCPU", "[StateVectorRaw]", float,
-                   double) {
+TEMPLATE_TEST_CASE("StateVectorManagedCPU::StateVectorManagedCPU",
+                   "[StateVectorRaw]", float, double) {
     using fp_t = TestType;
 
-    SECTION("StateVectorCPU") {
-        REQUIRE(!std::is_constructible_v<StateVectorCPU<>>);
+    SECTION("StateVectorManagedCPU") {
+        REQUIRE(!std::is_constructible_v<StateVectorManagedCPU<>>);
     }
-    SECTION("StateVectorCPU<TestType>") {
-        REQUIRE(!std::is_constructible_v<StateVectorCPU<TestType>>);
+    SECTION("StateVectorManagedCPU<TestType>") {
+        REQUIRE(!std::is_constructible_v<StateVectorManagedCPU<TestType>>);
     }
-    SECTION("StateVectorCPU<TestType> {size_t}") {
-        REQUIRE(std::is_constructible_v<StateVectorCPU<TestType>, size_t>);
+    SECTION("StateVectorManagedCPU<TestType> {size_t}") {
+        REQUIRE(
+            std::is_constructible_v<StateVectorManagedCPU<TestType>, size_t>);
         const size_t num_qubits = 4;
-        StateVectorCPU<fp_t> sv(num_qubits);
+        StateVectorManagedCPU<fp_t> sv(num_qubits);
 
         REQUIRE(sv.getNumQubits() == 4);
         REQUIRE(sv.getLength() == 16);
     }
-    SECTION("StateVectorCPU<TestType> {const StateVectorRaw<TestType>&}") {
-        REQUIRE(std::is_constructible_v<StateVectorCPU<TestType>,
-                                        const StateVectorRaw<TestType> &>);
+    SECTION("StateVectorManagedCPU<TestType> {const "
+            "StateVectorRawCPU<TestType>&}") {
+        REQUIRE(std::is_constructible_v<StateVectorManagedCPU<TestType>,
+                                        const StateVectorRawCPU<TestType> &>);
     }
-    SECTION("StateVectorCPU<TestType> {const StateVectorCPU<TestType>&}") {
-        REQUIRE(std::is_copy_constructible_v<StateVectorCPU<TestType>>);
+    SECTION("StateVectorManagedCPU<TestType> {const "
+            "StateVectorManagedCPU<TestType>&}") {
+        REQUIRE(std::is_copy_constructible_v<StateVectorManagedCPU<TestType>>);
     }
-    SECTION("StateVectorCPU<TestType> {StateVectorCPU<TestType>&&}") {
-        REQUIRE(std::is_move_constructible_v<StateVectorCPU<TestType>>);
+    SECTION(
+        "StateVectorManagedCPU<TestType> {StateVectorManagedCPU<TestType>&&}") {
+        REQUIRE(std::is_move_constructible_v<StateVectorManagedCPU<TestType>>);
     }
 }
+
+std::mt19937_64 re{1337};
+
+TEMPLATE_TEST_CASE("StateVectorRawCPU::StateVectorRawCPU",
+                   "[StateVectorRawCPU]", float, double) {
+    using fp_t = TestType;
+
+    SECTION("StateVectorRawCPU<TestType> {std::complex<TestType>*, size_t}") {
+        const size_t num_qubits = 4;
+        auto st_data = createRandomState<fp_t>(re, num_qubits);
+        StateVectorRawCPU<fp_t> sv(st_data.data(), st_data.size());
+
+        REQUIRE(sv.getNumQubits() == 4);
+        REQUIRE(sv.getData() == st_data.data());
+        REQUIRE(sv.getLength() == 16);
+    }
+    SECTION("StateVectorRawCPU<TestType> {std::complex<TestType>*, size_t}") {
+        std::vector<std::complex<TestType>> st_data(14, 0.0);
+        REQUIRE_THROWS(StateVectorRawCPU<fp_t>(st_data.data(), st_data.size()));
+    }
+}
+
+TEMPLATE_TEST_CASE("StateVectorRawCPU::setData", "[StateVectorRawCPU]", float,
+                   double) {
+    using fp_t = TestType;
+
+    auto st_data = createRandomState<fp_t>(re, 4);
+    StateVectorRawCPU<fp_t> sv(st_data.data(), st_data.size());
+
+    auto st_data2 = createRandomState<fp_t>(re, 8);
+    sv.setData(st_data2.data(), st_data2.size());
+
+    REQUIRE(sv.getNumQubits() == 8);
+    REQUIRE(sv.getData() == st_data2.data());
+    REQUIRE(sv.getLength() == (1U << 8U));
+}
diff --git a/pennylane_lightning/src/tests/Test_StateVectorRaw.cpp b/pennylane_lightning/src/tests/Test_StateVectorRaw.cpp
deleted file mode 100644
index 4700c74881..0000000000
--- a/pennylane_lightning/src/tests/Test_StateVectorRaw.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <complex>
-#include <numeric>
-#include <vector>
-
-#include "StateVectorRaw.hpp"
-#include "TestHelpers.hpp"
-#include "Util.hpp"
-
-#include <catch2/catch.hpp>
-
-using namespace Pennylane;
-
-std::mt19937_64 re{1337};
-
-TEMPLATE_TEST_CASE("StateVectorRaw::StateVectorRaw", "[StateVectorRaw]", float,
-                   double) {
-    using fp_t = TestType;
-
-    SECTION("StateVectorRaw<TestType> {std::complex<TestType>*, size_t}") {
-        const size_t num_qubits = 4;
-        auto st_data = createRandomState<fp_t>(re, num_qubits);
-        StateVectorRaw<fp_t> sv(st_data.data(), st_data.size());
-
-        REQUIRE(sv.getNumQubits() == 4);
-        REQUIRE(sv.getData() == st_data.data());
-        REQUIRE(sv.getLength() == 16);
-    }
-    SECTION("StateVectorRaw<TestType> {std::complex<TestType>*, size_t}") {
-        std::vector<std::complex<TestType>> st_data(14, 0.0);
-        REQUIRE_THROWS(StateVectorRaw<fp_t>(st_data.data(), st_data.size()));
-    }
-}
-
-TEMPLATE_TEST_CASE("StateVectorRaw::setData", "[StateVectorRaw]", float,
-                   double) {
-    using fp_t = TestType;
-
-    auto st_data = createRandomState<fp_t>(re, 4);
-    StateVectorRaw<fp_t> sv(st_data.data(), st_data.size());
-
-    auto st_data2 = createRandomState<fp_t>(re, 8);
-    sv.setData(st_data2.data(), st_data2.size());
-
-    REQUIRE(sv.getNumQubits() == 8);
-    REQUIRE(sv.getData() == st_data2.data());
-    REQUIRE(sv.getLength() == (1U << 8));
-}
diff --git a/pennylane_lightning/src/tests/Test_Util.cpp b/pennylane_lightning/src/tests/Test_Util.cpp
index 2e804d05e2..8ac67d087a 100644
--- a/pennylane_lightning/src/tests/Test_Util.cpp
+++ b/pennylane_lightning/src/tests/Test_Util.cpp
@@ -468,7 +468,7 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
  */
 size_t popcount_slow(uint64_t x) {
     size_t c = 0;
-    for (; x != 0; x >>= 1) {
+    for (; x != 0; x >>= 1U) {
         if ((x & 1U) != 0U) {
             c++;
         }
@@ -483,8 +483,8 @@ size_t popcount_slow(uint64_t x) {
  */
 size_t ctz_slow(uint64_t x) {
     size_t c = 0;
-    while ((x & 1) == 0) {
-        x >>= 1;
+    while ((x & 1U) == 0) {
+        x >>= 1U;
         c++;
     }
     return c;
diff --git a/pennylane_lightning/src/tests/Test_VectorJacobianProduct.cpp b/pennylane_lightning/src/tests/Test_VectorJacobianProduct.cpp
index babee6b726..e2a876661e 100644
--- a/pennylane_lightning/src/tests/Test_VectorJacobianProduct.cpp
+++ b/pennylane_lightning/src/tests/Test_VectorJacobianProduct.cpp
@@ -14,7 +14,7 @@
 
 #include "AdjointDiff.hpp"
 #include "JacobianProd.hpp"
-#include "StateVectorRaw.hpp"
+#include "StateVectorRawCPU.hpp"
 #include "Util.hpp"
 
 #include "TestHelpers.hpp"
@@ -53,10 +53,10 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=RX, Obs=Z dy={0}",
 
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RX"}, {{p}}, {{0}}, {false});
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            std::vector<std::complex<double>> cdata(1U << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
 
-            StateVectorRaw<double> psi(cdata.data(), cdata.size());
+            StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
 
             std::vector<size_t> tp{0};
             std::vector<ObsDatum<double>> obs_ls{obs};
@@ -91,10 +91,10 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=RX, Obs=Z dy={1}",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RX"}, {{p}}, {{0}}, {false});
 
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            std::vector<std::complex<double>> cdata(1U << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
 
-            StateVectorRaw<double> psi(cdata.data(), cdata.size());
+            StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
 
             std::vector<size_t> tp{0};
             std::vector<ObsDatum<double>> obs_ls{obs};
@@ -129,10 +129,10 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=RX, Obs=Z dy={0.4}",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RX"}, {{p}}, {{0}}, {false});
 
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            std::vector<std::complex<double>> cdata(1U << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
 
-            StateVectorRaw<double> psi(cdata.data(), cdata.size());
+            StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
 
             std::vector<size_t> tp{0};
             std::vector<ObsDatum<double>> obs_ls{obs};
@@ -168,10 +168,10 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=RY, Obs=X dy={0.4}",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RY"}, {{p}}, {{0}}, {false});
 
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            std::vector<std::complex<double>> cdata(1U << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
 
-            StateVectorRaw<double> psi(cdata.data(), cdata.size());
+            StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
 
             std::vector<size_t> tp{0};
             std::vector<ObsDatum<double>> obs_ls{obs};
@@ -203,8 +203,8 @@ TEST_CASE(
         std::vector<double> vjp_res(num_params);
         std::vector<double> dy(num_obs, 1);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
         auto obs1 = ObsDatum<double>({"PauliZ"}, {{}}, {{0}});
@@ -239,8 +239,8 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=[RX,RX,RX], "
         std::vector<double> vjp_res(num_params);
         std::vector<double> dy(num_obs, 0.4);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
         auto obs1 = ObsDatum<double>({"PauliZ"}, {{}}, {{0}});
@@ -282,8 +282,8 @@ TEST_CASE(
         std::vector<double> vjp_res(num_params);
         std::vector<double> dy(num_obs, 1);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
         auto obs1 = ObsDatum<double>({"PauliZ"}, {{}}, {{0}});
@@ -322,8 +322,8 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=[RX,RX,RX], "
         std::vector<double> vjp_res(num_params);
         std::vector<double> dy(num_obs, 0.4);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
         auto obs = ObsDatum<double>({"PauliZ", "PauliZ", "PauliZ"},
@@ -361,8 +361,8 @@ TEST_CASE(
         std::vector<double> vjp_res(num_params);
         std::vector<double> dy(num_obs, 1);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
         auto obs = ObsDatum<double>({"PauliX", "PauliX", "PauliX"},
@@ -412,8 +412,8 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=Mixed, Obs=[XXX], "
         std::vector<double> vjp_res(num_params);
         std::vector<double> dy(num_obs, -0.2);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
         auto obs = ObsDatum<double>({"PauliX", "PauliX", "PauliX"},
@@ -480,7 +480,7 @@ TEST_CASE(
 
             std::vector<std::complex<double>> cdata{INVSQRT2<double>(),
                                                     -INVSQRT2<double>()};
-            StateVectorRaw<double> psi(cdata.data(), cdata.size());
+            StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
 
             auto obs = ObsDatum<double>({"PauliZ"}, {{}}, {{0}});
             auto ops = OpsData<double>(
@@ -525,7 +525,7 @@ TEST_CASE(
 
         std::vector<std::complex<double>> cdata{ONE<double>(), ZERO<double>(),
                                                 ZERO<double>(), ZERO<double>()};
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
 
         auto obs = ObsDatum<double>({"PauliX", "PauliZ"}, {{}, {}}, {{0}, {1}});
         auto ops = OpsData<double>(
@@ -584,7 +584,7 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Mixed Ops, Obs and "
 
         std::vector<std::complex<double>> cdata{ONE<double>(), ZERO<double>(),
                                                 ZERO<double>(), ZERO<double>()};
-        StateVectorRaw<double> psi(cdata.data(), cdata.size());
+        StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
 
         auto obs = ObsDatum<double>({"PauliX", "PauliZ"}, {{}, {}}, {{0}, {1}});
         auto ops = OpsData<double>(
@@ -622,4 +622,4 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Mixed Ops, Obs and "
         CHECK(-0.5 * expected[1] == Approx(vjp_res[1]).margin(1e-7));
         CHECK(-0.5 * expected[2] == Approx(vjp_res[2]).margin(1e-7));
     }
-}
\ No newline at end of file
+}
diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
index dcd8dd359d..6a6df1f1ce 100644
--- a/pennylane_lightning/src/util/Memory.hpp
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -46,9 +46,7 @@ template <class T, uint32_t alignment> struct AlignedAllocator {
         if (size == 0) {
             return nullptr;
         }
-        void *p = std::aligned_alloc(
-            alignment,
-            sizeof(T) * size); // NOLINT(cppcoreguidelines-owning-memory)
+        void *p = std::aligned_alloc(alignment, sizeof(T) * size);
         if (p == nullptr) {
             throw std::bad_alloc();
         }
@@ -56,8 +54,8 @@ template <class T, uint32_t alignment> struct AlignedAllocator {
     }
 
     void deallocate(T *p, [[maybe_unused]] std::size_t size) noexcept {
-        std::free(
-            p); // NOLINT(hicpp-no-malloc, cppcoreguidelines-owning-memory)
+        // NOLINTNEXTLINE(hicpp-no-malloc)
+        std::free(p);
     }
 
     template <class U> void construct(U *ptr) { ::new ((void *)ptr) U(); }

From ff394e57522d4a83d5cb828c407418caea3b7381 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Mon, 28 Feb 2022 23:25:05 -0500
Subject: [PATCH 03/94] Export to python

---
 pennylane_lightning/_serialize.py             |  14 +-
 pennylane_lightning/lightning_qubit.py        |  46 +-
 pennylane_lightning/src/bindings/Bindings.cpp |  56 +--
 pennylane_lightning/src/bindings/Bindings.hpp | 186 +++++--
 pennylane_lightning/src/gates/Constant.hpp    |  27 +-
 .../src/gates/GateOperation.hpp               |  16 +-
 .../src/gates/OpToMemberFuncPtr.hpp           |  65 ++-
 .../cpu_kernels/GateImplementationsLM.hpp     |  78 ++-
 .../cpu_kernels/GateImplementationsPI.hpp     | 179 ++++++-
 .../src/simulator/CPUMemoryModel.hpp          |  89 ++++
 .../DefaultKernelsForStateVector.hpp          | 458 +++++++++++++-----
 .../src/simulator/DispatchKeys.hpp            |  30 +-
 .../src/simulator/DynamicDispatcher.cpp       |  75 ++-
 .../src/simulator/DynamicDispatcher.hpp       | 137 +++++-
 .../src/simulator/StateVectorBase.hpp         |  93 ++--
 .../src/simulator/StateVectorCPU.hpp          |  21 +-
 .../src/simulator/StateVectorManagedCPU.hpp   |  27 +-
 .../Test_DefaultKernelsForStateVector.cpp     |  71 +++
 .../src/tests/Test_DynamicDispatcher.cpp      |  10 +-
 ...est_GateImplementations_CompareKernels.cpp |  61 ++-
 .../Test_GateImplementations_Inverse.cpp      |  31 +-
 .../tests/Test_GateImplementations_Matrix.cpp | 256 +++++++---
 .../src/tests/Test_OpToMemberFuncPtr.cpp      |  11 +-
 pennylane_lightning/src/util/ConstantUtil.hpp |   7 +
 .../src/util/IntegerInterval.hpp              |  94 ++++
 pennylane_lightning/src/util/Memory.hpp       |  12 +-
 pennylane_lightning/src/util/Util.hpp         |  10 +
 tests/test_adjoint_jacobian.py                |  14 +-
 tests/test_apply.py                           |  38 --
 tests/test_array.py                           |  34 ++
 tests/test_serialize.py                       |  40 --
 tests/test_vjp.py                             |  60 +--
 32 files changed, 1619 insertions(+), 727 deletions(-)
 create mode 100644 pennylane_lightning/src/simulator/CPUMemoryModel.hpp
 create mode 100644 pennylane_lightning/src/util/IntegerInterval.hpp
 create mode 100644 tests/test_array.py

diff --git a/pennylane_lightning/_serialize.py b/pennylane_lightning/_serialize.py
index e82e121d81..db7d78cb8e 100644
--- a/pennylane_lightning/_serialize.py
+++ b/pennylane_lightning/_serialize.py
@@ -40,18 +40,6 @@
     pass
 
 
-def _is_lightning_gate(gate_name):
-    """Returns True if the gate (besides Matrix) is implemented
-    and exported from lightning.
-
-    Args:
-        gate_name (str): the name of gate
-    """
-    if gate_name == "Matrix":
-        return False
-    return gate_name in DEFAULT_KERNEL_FOR_OPS
-
-
 def _obs_has_kernel(obs: Observable) -> bool:
     """Returns True if the input observable has a supported kernel in the C++ backend.
 
@@ -167,7 +155,7 @@ def _serialize_ops(
             name = single_op.name if not is_inverse else single_op.name[:-4]
             names.append(name)
 
-            if not _is_lightning_gate(name):
+            if getattr(StateVectorC128, name, None) is None:
                 params.append([])
                 mats.append(single_op.matrix)
 
diff --git a/pennylane_lightning/lightning_qubit.py b/pennylane_lightning/lightning_qubit.py
index f6dd12a3fe..014e4acb51 100644
--- a/pennylane_lightning/lightning_qubit.py
+++ b/pennylane_lightning/lightning_qubit.py
@@ -46,11 +46,13 @@
         StateVectorC128,
         AdjointJacobianC128,
         VectorJacobianProductC128,
-        DEFAULT_KERNEL_FOR_OPS,
-        EXPORTED_KERNEL_OPS,
+        allocate_aligned_array,
+        get_alignment,
+        best_alignment,
+        CPUMemoryModel,
     )
 
-    from ._serialize import _serialize_obs, _serialize_ops, _is_lightning_gate
+    from ._serialize import _serialize_obs, _serialize_ops
 
     CPP_BINARY_AVAILABLE = True
 except ModuleNotFoundError:
@@ -101,25 +103,23 @@ class LightningQubit(DefaultQubit):
     _CPP_BINARY_AVAILABLE = True
 
     def __init__(self, wires, *, kernel_for_ops=None, shots=None, batch_obs=False):
-        self._kernel_for_ops = DEFAULT_KERNEL_FOR_OPS
-        if kernel_for_ops is not None:
-            if not isinstance(kernel_for_ops, dict):
-                raise ValueError("Argument kernel_for_ops must be a dictionary.")
-
-            for gate_op, kernel in kernel_for_ops.items():
-                if (kernel, gate_op) not in EXPORTED_KERNEL_OPS:
-                    raise ValueError(
-                        f"The given kernel {kernel} does not implement {gate_op} gate."
-                    )
-                self._kernel_for_ops[gate_op] = kernel
-
         super().__init__(wires, shots=shots)
         self._batch_obs = batch_obs
 
-        # Lightning keeps a simulator memory of which is managed by C++
-        # Note that as C++ manages the data, we need to copy from this array when
-        # the result is used outside of the module
-        self.sim_ = None
+    @staticmethod
+    def _asarray(arr, dtype=None):
+        arr = np.asarray(arr)
+        if not dtype:
+            dtype = arr.dtype
+
+        # We allocate a new aligned memory and copy data to there if alignment or dtype mismatches
+        # Note that get_alignment does not necessarily return CPUMemoryModel(Unaligned) even for
+        # numpy allocated memory, as the memory location may happen to be aligned.
+        if int(get_alignment(arr)) < int(best_alignment()) or arr.dtype != dtype:
+            new_arr = allocate_aligned_array(arr.size, np.dtype(dtype)).reshape(arr.shape)
+            np.copyto(new_arr, arr)
+            arr = new_arr
+        return arr
 
     @classmethod
     def capabilities(cls):
@@ -195,17 +195,13 @@ def apply_lightning(self, state, operations, dtype=np.complex128):
 
         for o in operations:
             name = o.name.split(".")[0]  # The split is because inverse gates have .inv appended
-            if _is_lightning_gate(name):
-                kernel = self._kernel_for_ops[name]
-                method = getattr(sim, f"{name}_{kernel}".format(), None)
-            else:
-                method = None
+            method = getattr(sim, name, None)
 
             wires = self.wires.indices(o.wires)
 
             if method is None:
                 # Inverse can be set to False since o.matrix is already in inverted form
-                method = getattr(sim, "applyMatrix_{}".format(self._kernel_for_ops["Matrix"]))
+                method = getattr(sim, "applyMatrix")
                 method(o.matrix, wires, False)
             else:
                 inv = o.inverse
diff --git a/pennylane_lightning/src/bindings/Bindings.cpp b/pennylane_lightning/src/bindings/Bindings.cpp
index c7747c6016..b935734087 100644
--- a/pennylane_lightning/src/bindings/Bindings.cpp
+++ b/pennylane_lightning/src/bindings/Bindings.cpp
@@ -19,6 +19,7 @@
 
 #include "GateUtil.hpp"
 #include "SelectKernel.hpp"
+#include "StateVectorManagedCPU.hpp"
 
 #include "pybind11/pybind11.h"
 
@@ -27,7 +28,7 @@ namespace {
 using namespace Pennylane::Algorithms;
 using namespace Pennylane::Gates;
 
-using Pennylane::StateVectorRaw;
+using Pennylane::StateVectorRawCPU;
 
 using std::complex;
 using std::string;
@@ -45,7 +46,7 @@ namespace py = pybind11;
  * @param m Pybind11 module.
  */
 template <class PrecisionT, class ParamT>
-void lightning_class_bindings(py::module &m) {
+void lightning_class_bindings(py::module_ &m) {
     // Enable module name to be based on size of complex datatype
     const std::string bitsize =
         std::to_string(sizeof(std::complex<PrecisionT>) * 8);
@@ -53,13 +54,14 @@ void lightning_class_bindings(py::module &m) {
     //***********************************************************************//
     //                              StateVector
     //***********************************************************************//
-
+    //
     std::string class_name = "StateVectorC" + bitsize;
     auto pyclass =
-        py::class_<StateVectorRaw<PrecisionT>>(m, class_name.c_str());
-    pyclass.def(py::init(&create<PrecisionT>));
+        py::class_<StateVectorRawCPU<PrecisionT>>(m, class_name.c_str());
+    pyclass.def(py::init(&createRaw<PrecisionT>));
 
-    registerKernelsToPyexport<PrecisionT, ParamT>(pyclass);
+    registerGatesForStateVector<PrecisionT, ParamT,
+                                StateVectorRawCPU<PrecisionT>>(pyclass);
 
     //***********************************************************************//
     //                              Observable
@@ -221,7 +223,7 @@ void lightning_class_bindings(py::module &m) {
         .def("adjoint_jacobian", &AdjointJacobian<PrecisionT>::adjointJacobian)
         .def("adjoint_jacobian",
              [](AdjointJacobian<PrecisionT> &adj,
-                const StateVectorRaw<PrecisionT> &sv,
+                const StateVectorRawCPU<PrecisionT> &sv,
                 const std::vector<ObsDatum<PrecisionT>> &observables,
                 const OpsData<PrecisionT> &operations,
                 const std::vector<size_t> &trainableParams, size_t num_params) {
@@ -292,7 +294,7 @@ void lightning_class_bindings(py::module &m) {
                  auto fn = v.vectorJacobianProduct(dy, num_params);
                  return py::cpp_function(
                      [fn, num_params](
-                         const StateVectorRaw<PrecisionT> &sv,
+                         const StateVectorRawCPU<PrecisionT> &sv,
                          const std::vector<ObsDatum<PrecisionT>> &observables,
                          const OpsData<PrecisionT> &operations,
                          const std::vector<size_t> &trainableParams) {
@@ -309,7 +311,7 @@ void lightning_class_bindings(py::module &m) {
 
     class_name = "MeasuresC" + bitsize;
     py::class_<Measures<PrecisionT>>(m, class_name.c_str())
-        .def(py::init<const StateVectorRaw<PrecisionT> &>())
+        .def(py::init<const StateVectorRawCPU<PrecisionT> &>())
         .def("probs",
              [](Measures<PrecisionT> &M, const std::vector<size_t> &wires) {
                  if (wires.empty()) {
@@ -362,29 +364,19 @@ PYBIND11_MODULE(lightning_qubit_ops, // NOLINT: No control over Pybind internals
               &Gates::getIndicesAfterExclusion),
           "Get statevector indices for gate application");
 
-    /* Add EXPORTED_KERNELS */
-    std::vector<std::pair<std::string, std::string>> exported_kernel_ops;
-
-    for (const auto kernel : kernels_to_pyexport) {
-        const auto kernel_name = lookup(kernel_id_name_pairs, kernel);
-        const auto implemented_gates = implementedGatesForKernel(kernel);
-        for (const auto gate_op : implemented_gates) {
-            const auto gate_name =
-                std::string(lookup(Constant::gate_names, gate_op));
-            exported_kernel_ops.emplace_back(kernel_name, gate_name);
-        }
-    }
-
-    m.attr("EXPORTED_KERNEL_OPS") = py::cast(exported_kernel_ops);
-
-    /* Add DEFAULT_KERNEL_FOR_OPS */
-    std::map<std::string, std::string> default_kernel_ops_map;
-    for (const auto &[gate_op, name] : Constant::gate_names) {
-        const auto kernel = lookup(Constant::default_kernel_for_gates, gate_op);
-        const auto kernel_name = Util::lookup(kernel_id_name_pairs, kernel);
-        default_kernel_ops_map.emplace(std::string(name), kernel_name);
-    }
-    m.attr("DEFAULT_KERNEL_FOR_OPS") = py::cast(default_kernel_ops_map);
+    /* Add CPUMemoryModel enum class */
+    py::enum_<CPUMemoryModel>(m, "CPUMemoryModel")
+        .value("Unaligned", CPUMemoryModel::Unaligned)
+        .value("Aligned256", CPUMemoryModel::Aligned256)
+        .value("Aligned512", CPUMemoryModel::Aligned512);
+
+    /* Add array */
+    m.def("allocate_aligned_array", &allocateAlignedArray,
+          "Get numpy array whose underlying data is aligned.");
+    m.def("get_alignment", &getNumpyArrayAlignment,
+          "Get alignment of an underlying data for a numpy array.");
+    m.def("best_alignment", &bestCPUMemoryModel,
+          "Best memory alignment. for the simulator.");
 
     lightning_class_bindings<float, float>(m);
     lightning_class_bindings<double, double>(m);
diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index a1845ba67d..84ef5f806c 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -18,10 +18,12 @@
  */
 #pragma once
 #include "AdjointDiff.hpp"
+#include "CPUMemoryModel.hpp"
 #include "JacobianProd.hpp"
 #include "Measures.hpp"
+#include "Memory.hpp"
 #include "OpToMemberFuncPtr.hpp"
-#include "StateVectorRaw.hpp"
+#include "StateVectorManagedCPU.hpp"
 
 #include "pybind11/complex.h"
 #include "pybind11/functional.h"
@@ -45,8 +47,8 @@ namespace Pennylane {
  * @return StateVector<PrecisionT> `%StateVector` object.
  */
 template <class PrecisionT = double>
-static auto create(pybind11::array_t<std::complex<PrecisionT>> &numpyArray)
-    -> StateVectorRaw<PrecisionT> {
+auto createRaw(const pybind11::array_t<std::complex<PrecisionT>> &numpyArray)
+    -> StateVectorRawCPU<PrecisionT> {
     pybind11::buffer_info numpyArrayInfo = numpyArray.request();
 
     if (numpyArrayInfo.ndim != 1) {
@@ -59,15 +61,86 @@ static auto create(pybind11::array_t<std::complex<PrecisionT>> &numpyArray)
     }
     auto *data_ptr =
         static_cast<std::complex<PrecisionT> *>(numpyArrayInfo.ptr);
-    return StateVectorRaw<PrecisionT>(
+    return StateVectorRawCPU<PrecisionT>(
         {data_ptr, static_cast<size_t>(numpyArrayInfo.shape[0])});
 }
 
 template <class PrecisionT = double>
-static auto toNumpyArray(const StateVectorCPU<PrecisionT> &sv)
-    -> py::array_t<std::complex<PrecisionT>> {
-    return py::array_t<std::complex<PrecisionT>>(
-        {sv.getLength()}, {sizeof(PrecisionT)} sv.getData(), );
+auto createManaged(
+    const pybind11::array_t<std::complex<PrecisionT>> &numpyArray)
+    -> StateVectorManagedCPU<PrecisionT> {
+    pybind11::buffer_info numpyArrayInfo = numpyArray.request();
+
+    if (numpyArrayInfo.ndim != 1) {
+        throw std::invalid_argument(
+            "NumPy array must be a 1-dimensional array");
+    }
+    if (numpyArrayInfo.itemsize != sizeof(std::complex<PrecisionT>)) {
+        throw std::invalid_argument(
+            "NumPy array must be of type np.complex64 or np.complex128");
+    }
+    auto *data_ptr =
+        static_cast<std::complex<PrecisionT> *>(numpyArrayInfo.ptr);
+    return StateVectorManagedCPU<PrecisionT>(
+        {data_ptr, static_cast<size_t>(numpyArrayInfo.shape[0])});
+}
+
+template <class PrecisionT = double>
+auto toNumpyArray(const StateVectorManagedCPU<PrecisionT> &sv)
+    -> pybind11::array_t<std::complex<PrecisionT>> {
+    return pybind11::array_t<std::complex<PrecisionT>>(
+        {sv.getLength()}, {2 * sizeof(PrecisionT)}, sv.getData());
+}
+
+/// @brief Memory model that getMemoryModel reports for a numpy array's data.
+inline auto getNumpyArrayAlignment(const pybind11::array &numpyArray)
+    -> CPUMemoryModel {
+    return getMemoryModel(numpyArray.request().ptr);
+}
+/// @brief Capsule destructor: frees a buffer obtained from std::aligned_alloc.
+inline void deallocateArray(void *ptr) { std::free(ptr); }
+/**
+ * @brief Return a numpy array whose underlying data is allocated by lightning
+ * with the alignment of the best available CPU memory model.
+ *
+ * The capsule passed as the array base frees the buffer (deallocateArray). See
+ * https://github.com/pybind/pybind11/issues/1042#issuecomment-325941022
+ * for capsule usage.
+ * @param size Number of elements.
+ * @param dt Element type: float, double, or std::complex of either.
+ * @throws pybind11::type_error for any other dtype.
+ * @throws std::bad_alloc if the allocation fails.
+ */
+inline auto allocateAlignedArray(size_t size, pybind11::dtype dt)
+    -> pybind11::array {
+    const auto memory_model = bestCPUMemoryModel();
+    const auto allocate = [&](auto type_tag) -> pybind11::array {
+        using T = decltype(type_tag);
+        const size_t alignment = getAlignment<T>(memory_model);
+        // std::aligned_alloc requires a non-zero multiple of the alignment.
+        const size_t n_blocks = (sizeof(T) * size + alignment - 1) / alignment;
+        const size_t bytes = (n_blocks == 0 ? 1 : n_blocks) * alignment;
+        void *ptr = std::aligned_alloc(alignment, bytes);
+        if (ptr == nullptr) {
+            throw std::bad_alloc();
+        }
+        auto capsule = pybind11::capsule(ptr, &deallocateArray);
+        return pybind11::array{dt, {size}, {sizeof(T)}, ptr, capsule};
+    };
+
+    if (dt.is(pybind11::dtype::of<float>())) {
+        return allocate(float{});
+    }
+    if (dt.is(pybind11::dtype::of<double>())) {
+        return allocate(double{});
+    }
+    if (dt.is(pybind11::dtype::of<std::complex<float>>())) {
+        return allocate(std::complex<float>{});
+    }
+    if (dt.is(pybind11::dtype::of<std::complex<double>>())) {
+        return allocate(std::complex<double>{});
+    }
+    throw pybind11::type_error("Unsupported datatype.");
+}
 
 /**
@@ -87,7 +160,7 @@ void apply(pybind11::array_t<std::complex<PrecisionT>> &stateNumpyArray,
            const std::vector<std::vector<size_t>> &wires,
            const std::vector<bool> &inverse,
            const std::vector<std::vector<PrecisionT>> &params) {
-    auto state = create<PrecisionT>(stateNumpyArray);
+    auto state = createRaw<PrecisionT>(stateNumpyArray);
     state.applyOperations(ops, wires, inverse, params);
 }
 
@@ -103,6 +176,7 @@ void apply(pybind11::array_t<std::complex<PrecisionT>> &stateNumpyArray,
  * @tparam kernel Kernel to register
  * @tparam gate_op Gate operation
  */
+/*
 template <class PrecisionT, class ParamT, Gates::KernelType kernel,
           Gates::GateOperation gate_op>
 constexpr auto getLambdaForKernelGateOp() {
@@ -115,16 +189,14 @@ constexpr auto getLambdaForKernelGateOp() {
 
     if constexpr (gate_op != GateOperation::Matrix) {
         return
-            [](StateVectorRaw<PrecisionT> &st, const std::vector<size_t> &wires,
-               bool inverse, const std::vector<ParamT> &params) {
-                constexpr auto func_ptr =
-                    GateOpToMemberFuncPtr<PrecisionT, ParamT,
-                                          GateImplementation, gate_op>::value;
-                callGateOps(func_ptr, st.getData(), st.getNumQubits(), wires,
+            [](StateVectorRawCPU<PrecisionT> &st, const std::vector<size_t>
+&wires, bool inverse, const std::vector<ParamT> &params) { constexpr auto
+func_ptr = GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplementation,
+gate_op>::value; callGateOps(func_ptr, st.getData(), st.getNumQubits(), wires,
                             inverse, params);
             };
     } else {
-        return [](StateVectorRaw<PrecisionT> &st,
+        return [](StateVectorRawCPU<PrecisionT> &st,
                   const py::array_t<std::complex<PrecisionT>,
                                     py::array::c_style | py::array::forcecast>
                       &matrix,
@@ -135,7 +207,8 @@ constexpr auto getLambdaForKernelGateOp() {
         };
     }
 };
-
+*/
+/*
 /// @cond DEV
 template <class PrecisionT, class ParamT, Gates::KernelType kernel,
           size_t gate_idx>
@@ -154,7 +227,7 @@ constexpr auto getGateOpLambdaPairsIter() {
     }
 }
 /// @endcond
-
+*/
 /**
  * @brief Create a tuple of lambda functions to bind
  *
@@ -162,10 +235,12 @@ constexpr auto getGateOpLambdaPairsIter() {
  * @tparam ParamT Floating point type of gate parameters
  * @tparam kernel Kernel to register
  */
+/*
 template <class PrecisionT, class ParamT, Gates::KernelType kernel>
 constexpr auto getGateOpLambdaPairs() {
     return getGateOpLambdaPairsIter<PrecisionT, ParamT, kernel, 0>();
 }
+*/
 
 /**
  * @brief For given kernel, register all implemented gate operations and apply
@@ -176,17 +251,13 @@ constexpr auto getGateOpLambdaPairs() {
  * @tparam Kernel Kernel to register
  * @tparam PyClass Pybind11 class type
  */
-template <class PrecisionT, class ParamT, Gates::KernelType kernel,
-          class PyClass>
+/*
+template <class PrecisionT, class ParamT, class PyClass>
 void registerImplementedGatesForKernel(PyClass &pyclass) {
     using namespace Pennylane::Gates;
-    const auto kernel_name = std::string(SelectKernel<kernel>::name);
-
-    constexpr auto gate_op_lambda_pairs =
-        getGateOpLambdaPairs<PrecisionT, ParamT, kernel>();
 
     auto registerToPyclass =
-        [&pyclass, &kernel_name](auto &&gate_op_lambda_pair) -> GateOperation {
+        [&pyclass](auto &&gate_op_lambda_pair) -> GateOperation {
         const auto &[gate_op, func] = gate_op_lambda_pair;
         if (gate_op == GateOperation::Matrix) {
             const std::string name = "applyMatrix_" + kernel_name;
@@ -195,10 +266,14 @@ void registerImplementedGatesForKernel(PyClass &pyclass) {
         } else {
             const auto gate_name =
                 std::string(lookup(Constant::gate_names, gate_op));
-            const std::string name = gate_name + "_" + kernel_name;
-            const std::string doc = "Apply the " + gate_name + " gate using " +
-                                    kernel_name + " kernel.";
-            pyclass.def(name.c_str(), func, doc.c_str());
+            const std::string doc = "Apply the " + gate_name + " gate.";
+            auto func = [&gate_name](StateVectorManagedCPU<PrecisionT>& sv,
+                                     const std::vector<size_t> &wires,
+                                     bool inverse,
+                                     const std::vector<ParamT> &params) {
+                sv.applyOperation(gate_name, wires, inverse, params);
+            }
+            pyclass.def(name.c_str(), , doc.c_str());
         }
         return gate_op;
     };
@@ -209,29 +284,40 @@ void registerImplementedGatesForKernel(PyClass &pyclass) {
         },
         gate_op_lambda_pairs);
 }
-
+*/
 /// @cond DEV
-template <class PrecisionT, class ParamT, size_t kernel_idx, class PyClass>
-void registerKernelsToPyexportIter(PyClass &pyclass) {
-    if constexpr (kernel_idx < kernels_to_pyexport.size()) {
-        constexpr auto kernel = kernels_to_pyexport[kernel_idx];
-        registerImplementedGatesForKernel<PrecisionT, ParamT, kernel>(pyclass);
-        registerKernelsToPyexportIter<PrecisionT, ParamT, kernel_idx + 1>(
-            pyclass);
+template <class PrecisionT, class ParamT, class SVType, class PyClass>
+void registerGatesForStateVector(PyClass &pyclass) {
+    using Gates::GateOperation;
+    namespace Constant = Gates::Constant;
+
+    static_assert(std::is_same_v<typename SVType::PrecisionT, PrecisionT>);
+
+    { // Register matrix
+        const std::string doc = "Apply a given matrix to wires.";
+        auto func =
+            [](SVType &st,
+               const pybind11::array_t<std::complex<PrecisionT>,
+                                       pybind11::array::c_style |
+                                           pybind11::array::forcecast> &matrix,
+               const std::vector<size_t> &wires, bool inverse = false) {
+                st.applyMatrix(static_cast<const std::complex<PrecisionT> *>(
+                                   matrix.request().ptr),
+                               wires, inverse);
+            };
+        pyclass.def("applyMatrix", func, doc.c_str());
     }
-}
-/// @endcond
 
-/**
- * @brief register gates for each kernel in kernels_to_pyexport
- *
- *
- * @tparam PrecisionT Floating point precision of underlying statevector data
- * @tparam ParamT Floating point type of gate parameters
- * @tparam PyClass Pyclass type
- */
-template <class PrecisionT, class ParamT, class PyClass>
-void registerKernelsToPyexport(PyClass &pyclass) {
-    registerKernelsToPyexportIter<PrecisionT, ParamT, 0>(pyclass);
+    Util::for_each_enum<GateOperation>([&pyclass](GateOperation gate_op) {
+        const auto gate_name =
+            std::string(lookup(Constant::gate_names, gate_op));
+        const std::string doc = "Apply the " + gate_name + " gate.";
+        auto func = [gate_name = gate_name](
+                        SVType &sv, const std::vector<size_t> &wires,
+                        bool inverse, const std::vector<ParamT> &params) {
+            sv.applyOperation(gate_name, wires, inverse, params);
+        };
+        pyclass.def(gate_name.c_str(), func, doc.c_str());
+    });
 }
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/gates/Constant.hpp b/pennylane_lightning/src/gates/Constant.hpp
index 38d086875d..62f0859829 100644
--- a/pennylane_lightning/src/gates/Constant.hpp
+++ b/pennylane_lightning/src/gates/Constant.hpp
@@ -25,14 +25,19 @@ namespace Pennylane::Gates::Constant {
 /**
  * @brief List of multi-qubit gates
  */
-[[maybe_unused]] constexpr std::array multi_qubit_gates{GateOperation::MultiRZ,
-                                                        GateOperation::Matrix};
+[[maybe_unused]] constexpr std::array multi_qubit_gates{GateOperation::MultiRZ};
 /**
  * @brief List of multi-qubit generators
  */
 [[maybe_unused]] constexpr std::array multi_qubit_generators{
     GeneratorOperation::MultiRZ,
 };
+/**
+ * @brief List of multi-qubit matrix operation
+ */
+[[maybe_unused]] constexpr std::array multi_qubit_matrix_ops{
+    MatrixOperation::MultiQubitOp,
+};
 
 /**
  * @brief Gate names
@@ -71,9 +76,7 @@ namespace Pennylane::Gates::Constant {
                                                "Toffoli"},
     std::pair<GateOperation, std::string_view>{GateOperation::CSWAP, "CSWAP"},
     std::pair<GateOperation, std::string_view>{GateOperation::MultiRZ,
-                                               "MultiRZ"},
-    std::pair<GateOperation, std::string_view>{GateOperation::Matrix, "Matrix"},
-};
+                                               "MultiRZ"}};
 /**
  * @brief Generator names.
  *
@@ -108,6 +111,19 @@ namespace Pennylane::Gates::Constant {
                                                     "GeneratorMultiRZ"},
 };
 
+/**
+ * @brief Matrix names.
+ *
+ */
+[[maybe_unused]] constexpr std::array matrix_names = {
+    std::pair<MatrixOperation, std::string_view>{MatrixOperation::SingleQubitOp,
+                                                 "SingleQubitOp"},
+    std::pair<MatrixOperation, std::string_view>{MatrixOperation::TwoQubitOp,
+                                                 "TwoQubitOp"},
+    std::pair<MatrixOperation, std::string_view>{MatrixOperation::MultiQubitOp,
+                                                 "MultiQubitOp"},
+};
+
 /**
  * @brief Number of wires for gates besides multi-qubit gates
  */
@@ -236,7 +252,6 @@ namespace Pennylane::Gates::Constant {
     std::pair{GateOperation::Toffoli, KernelType::PI},
     std::pair{GateOperation::CSWAP, KernelType::PI},
     std::pair{GateOperation::MultiRZ, KernelType::LM},
-    std::pair{GateOperation::Matrix, KernelType::PI},
 };
 /**
  * @brief Define which kernel to use for each generator operation.
diff --git a/pennylane_lightning/src/gates/GateOperation.hpp b/pennylane_lightning/src/gates/GateOperation.hpp
index 24d17d4406..709bf3f459 100644
--- a/pennylane_lightning/src/gates/GateOperation.hpp
+++ b/pennylane_lightning/src/gates/GateOperation.hpp
@@ -56,13 +56,11 @@ enum class GateOperation : uint32_t {
     CSWAP,
     /* Mutli-qubit gates */
     MultiRZ,
-    /* General matrix */
-    Matrix,
     /* END (placeholder) */
     END
 };
 /**
- * @brief Enum class of all gate generators
+ * @brief Enum class for all gate generators
  */
 enum class GeneratorOperation : uint32_t {
     BEGIN = 0,
@@ -82,4 +80,16 @@ enum class GeneratorOperation : uint32_t {
     /* END (placeholder) */
     END
 };
+
+/**
+ * @brief Enum class for all matrix operations
+ */
+enum class MatrixOperation : uint32_t {
+    BEGIN = 0,
+    SingleQubitOp = 0,
+    TwoQubitOp,
+    MultiQubitOp,
+    /* END (placeholder) */
+    END
+};
 } // namespace Pennylane::Gates
diff --git a/pennylane_lightning/src/gates/OpToMemberFuncPtr.hpp b/pennylane_lightning/src/gates/OpToMemberFuncPtr.hpp
index 05808dd364..49c1611bb9 100644
--- a/pennylane_lightning/src/gates/OpToMemberFuncPtr.hpp
+++ b/pennylane_lightning/src/gates/OpToMemberFuncPtr.hpp
@@ -33,11 +33,8 @@ namespace Pennylane::Gates {
 template <class PrecisionT, class ParamT, class GateImplementation,
           GateOperation gate_op>
 struct GateOpToMemberFuncPtr {
-    // raises compile error when used
-    static_assert(
-        gate_op != GateOperation::Matrix,
-        "GateOpToMemberFuncPtr is not defined for GateOperation::Matrix.");
-    static_assert(gate_op == GateOperation::Matrix,
+    // raises compile error when this struct is instantiated.
+    static_assert(sizeof(PrecisionT) == -1,
                   "GateOpToMemberFuncPtr is not defined for the given gate. "
                   "When you define a new GateOperation, check that you also "
                   "have added the corresponding entry in "
@@ -210,7 +207,7 @@ struct GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplementation,
 template <class PrecisionT, class GateImplementation,
           GeneratorOperation gntr_op>
 struct GeneratorOpToMemberFuncPtr {
-    // raises compile error when used
+    // raises compile error when this struct is instantiated.
     static_assert(
         sizeof(GateImplementation) == -1,
         "GeneratorOpToMemberFuncPtr is not defined for the given generator. "
@@ -292,6 +289,33 @@ struct GeneratorOpToMemberFuncPtr<PrecisionT, GateImplementation,
         &GateImplementation::template applyGeneratorMultiRZ<PrecisionT>;
 };
 
+/**
+ * @brief Matrix operation to member function pointer
+ */
+template <class PrecisionT, class GateImplementation, MatrixOperation mat_op>
+struct MatrixOpToMemberFuncPtr {
+    static_assert(sizeof(PrecisionT) == -1, "Unrecognized matrix operation");
+};
+
+template <class PrecisionT, class GateImplementation>
+struct MatrixOpToMemberFuncPtr<PrecisionT, GateImplementation,
+                               MatrixOperation::SingleQubitOp> {
+    constexpr static auto value =
+        &GateImplementation::template applySingleQubitOp<PrecisionT>;
+};
+template <class PrecisionT, class GateImplementation>
+struct MatrixOpToMemberFuncPtr<PrecisionT, GateImplementation,
+                               MatrixOperation::TwoQubitOp> {
+    constexpr static auto value =
+        &GateImplementation::template applyTwoQubitOp<PrecisionT>;
+};
+template <class PrecisionT, class GateImplementation>
+struct MatrixOpToMemberFuncPtr<PrecisionT, GateImplementation,
+                               MatrixOperation::MultiQubitOp> {
+    constexpr static auto value =
+        &GateImplementation::template applyMultiQubitOp<PrecisionT>;
+};
+
 /// @cond DEV
 namespace Internal {
 /**
@@ -371,6 +395,15 @@ template <class PrecisionT> struct GeneratorFuncPtr {
     using Type = PrecisionT (*)(std::complex<PrecisionT> *, size_t,
                                 const std::vector<size_t> &, bool);
 };
+
+/**
+ * @brief Pointer type for a matrix operation
+ */
+template <class PrecisionT> struct MatrixFuncPtr {
+    using Type = void (*)(std::complex<PrecisionT> *, size_t,
+                          const std::complex<PrecisionT> *,
+                          const std::vector<size_t> &, bool);
+};
 } // namespace Internal
 /// @endcond
 
@@ -382,11 +415,17 @@ using GateFuncPtrT =
     typename Internal::GateFuncPtr<PrecisionT, ParamT, num_params>::Type;
 
 /**
- * @brief Convenient type alias for GeneratorFuncPtrT.
+ * @brief Convenient type alias for GeneratorFuncPtr.
  */
 template <class PrecisionT>
 using GeneratorFuncPtrT = typename Internal::GeneratorFuncPtr<PrecisionT>::Type;
 
+/**
+ * @brief Convenient type alias for MatrixFuncPtr.
+ */
+template <class PrecisionT>
+using MatrixFuncPtrT = typename Internal::MatrixFuncPtr<PrecisionT>::Type;
+
 /**
  * @defgroup Call gate operation with provided arguments
  *
@@ -449,4 +488,16 @@ inline PrecisionT callGeneratorOps(GeneratorFuncPtrT<PrecisionT> func,
                                    const std::vector<size_t> &wires, bool adj) {
     return func(data, num_qubits, wires, adj);
 }
+
+/**
+ * @brief Call a matrix operation kernel with a pointer to the gate matrix.
+ * @tparam PrecisionT Floating point type for the state-vector.
+ */
+template <class PrecisionT>
+inline void callMatrixOp(MatrixFuncPtrT<PrecisionT> func,
+                         std::complex<PrecisionT> *data, size_t num_qubits,
+                         const std::complex<PrecisionT> *matrix,
+                         const std::vector<size_t> &wires, bool adj) {
+    return func(data, num_qubits, matrix, wires, adj);
+}
 } // namespace Pennylane::Gates
diff --git a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
index 6e0060fc54..5618bffc7b 100644
--- a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
+++ b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
@@ -65,8 +65,7 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
         GateOperation::CRX,     GateOperation::CRY,
         GateOperation::CRZ,     GateOperation::CRot,
         GateOperation::IsingXX, GateOperation::IsingYY,
-        GateOperation::IsingZZ, GateOperation::MultiRZ,
-        GateOperation::Matrix};
+        GateOperation::IsingZZ, GateOperation::MultiRZ};
 
     constexpr static std::array implemented_generators = {
         GeneratorOperation::RX,
@@ -83,6 +82,10 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
         GeneratorOperation::MultiRZ,
     };
 
+    constexpr static std::array implemented_matrices = {
+        MatrixOperation::SingleQubitOp, MatrixOperation::TwoQubitOp,
+        MatrixOperation::MultiQubitOp};
+
     /**
      * @brief Apply a single qubit gate to the statevector.
      *
@@ -95,9 +98,10 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
     template <class PrecisionT>
     static inline void
     applySingleQubitOp(std::complex<PrecisionT> *arr, size_t num_qubits,
-                       const std::complex<PrecisionT> *matrix, size_t wire,
-                       bool inverse = false) {
-        const size_t rev_wire = num_qubits - wire - 1;
+                       const std::complex<PrecisionT> *matrix,
+                       const std::vector<size_t> &wires, bool inverse = false) {
+        assert(wires.size() == 1);
+        const size_t rev_wire = num_qubits - wires[0] - 1;
         const size_t rev_wire_shift = (static_cast<size_t>(1U) << rev_wire);
         const size_t wire_parity = fillTrailingOnes(rev_wire);
         const size_t wire_parity_inv = fillLeadingOnes(rev_wire + 1);
@@ -146,6 +150,7 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
     applyTwoQubitOp(std::complex<PrecisionT> *arr, size_t num_qubits,
                     const std::complex<PrecisionT> *matrix,
                     const std::vector<size_t> &wires, bool inverse = false) {
+        assert(wires.size() == 2);
         const size_t rev_wire0 = num_qubits - wires[1] - 1;
         const size_t rev_wire1 = num_qubits - wires[0] - 1; // Control qubit
 
@@ -242,47 +247,38 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
     }
 
     template <class PrecisionT>
-    static void applyMatrix(std::complex<PrecisionT> *arr, size_t num_qubits,
-                            const std::complex<PrecisionT> *matrix,
-                            const std::vector<size_t> &wires, bool inverse) {
+    static void
+    applyMultiQubitOp(std::complex<PrecisionT> *arr, size_t num_qubits,
+                      const std::complex<PrecisionT> *matrix,
+                      const std::vector<size_t> &wires, bool inverse) {
         assert(num_qubits >= wires.size());
 
-        switch (wires.size()) {
-        case 1:
-            applySingleQubitOp(arr, num_qubits, matrix, wires[0], inverse);
-            break;
-        case 2:
-            applyTwoQubitOp(arr, num_qubits, matrix, wires, inverse);
-            break;
-        default: {
-            size_t dim = 1U << wires.size();
-            std::vector<size_t> indices;
-            indices.resize(dim);
-
-            for (size_t k = 0; k < Util::exp2(num_qubits); k += dim) {
-                std::vector<std::complex<PrecisionT>> coeffs_in(dim);
-                std::vector<std::complex<PrecisionT>> coeffs_out(dim);
-
-                for (size_t inner_idx = 0; inner_idx < dim; inner_idx++) {
-                    size_t idx = k | inner_idx;
-                    size_t n_wires = wires.size();
-                    for (size_t pos = 0; pos < n_wires; pos++) {
-                        idx = bitswap(idx, n_wires - pos - 1,
-                                      num_qubits - wires[pos] - 1);
-                    }
-                    indices[inner_idx] = idx;
-                    coeffs_in[inner_idx] = arr[idx];
-                }
+        size_t dim = 1U << wires.size();
+        std::vector<size_t> indices;
+        indices.resize(dim);
 
-                Util::matrixVecProd(
-                    matrix, coeffs_in.data(), coeffs_out.data(), dim, dim,
-                    inverse ? Trans::Adjoint : Trans::NoTranspose);
+        for (size_t k = 0; k < Util::exp2(num_qubits); k += dim) {
+            std::vector<std::complex<PrecisionT>> coeffs_in(dim);
+            std::vector<std::complex<PrecisionT>> coeffs_out(dim);
 
-                for (size_t inner_idx = 0; inner_idx < dim; inner_idx++) {
-                    arr[indices[inner_idx]] = coeffs_out[inner_idx];
+            for (size_t inner_idx = 0; inner_idx < dim; inner_idx++) {
+                size_t idx = k | inner_idx;
+                size_t n_wires = wires.size();
+                for (size_t pos = 0; pos < n_wires; pos++) {
+                    idx = bitswap(idx, n_wires - pos - 1,
+                                  num_qubits - wires[pos] - 1);
                 }
+                indices[inner_idx] = idx;
+                coeffs_in[inner_idx] = arr[idx];
+            }
+
+            Util::matrixVecProd(matrix, coeffs_in.data(), coeffs_out.data(),
+                                dim, dim,
+                                inverse ? Trans::Adjoint : Trans::NoTranspose);
+
+            for (size_t inner_idx = 0; inner_idx < dim; inner_idx++) {
+                arr[indices[inner_idx]] = coeffs_out[inner_idx];
             }
-        }
         }
     }
 
@@ -520,7 +516,7 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
             (inverse) ? Gates::getRot<PrecisionT>(-omega, -theta, -phi)
                       : Gates::getRot<PrecisionT>(phi, theta, omega);
 
-        applySingleQubitOp(arr, num_qubits, rotMat.data(), wires[0]);
+        applySingleQubitOp(arr, num_qubits, rotMat.data(), wires);
     }
 
     /* Two-qubit gates */
diff --git a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp
index 82a0edf924..b4314411f5 100644
--- a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp
+++ b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp
@@ -68,8 +68,8 @@ class GateImplementationsPI : public PauliGenerator<GateImplementationsPI> {
         GateOperation::IsingZZ, GateOperation::CRX,
         GateOperation::CRY,     GateOperation::CRZ,
         GateOperation::CRot,    GateOperation::Toffoli,
-        GateOperation::CSWAP,   GateOperation::MultiRZ,
-        GateOperation::Matrix};
+        GateOperation::CSWAP,   GateOperation::MultiRZ};
+
     constexpr static std::array implemented_generators = {
         GeneratorOperation::RX,
         GeneratorOperation::RY,
@@ -83,6 +83,153 @@ class GateImplementationsPI : public PauliGenerator<GateImplementationsPI> {
         GeneratorOperation::CRZ,
         GeneratorOperation::ControlledPhaseShift};
 
+    constexpr static std::array implemented_matrices = { // matrix ops with an apply* method in this class
+        MatrixOperation::SingleQubitOp, MatrixOperation::TwoQubitOp,
+        MatrixOperation::MultiQubitOp};
+
+    /**
+     * @brief Apply a 2x2 matrix to a single wire of the statevector in place.
+     *
+     * @param arr Pointer to the statevector; amplitudes are overwritten.
+     * @param num_qubits Number of qubits.
+     * @param matrix 2x2 matrix in row-major order (entries 0..3 are read).
+     * @param wires Wires the gate applies to; exactly one wire is expected.
+     * @param inverse If true, apply the conjugate transpose of matrix.
+     */
+    template <class PrecisionT>
+    static inline void
+    applySingleQubitOp(std::complex<PrecisionT> *arr, size_t num_qubits,
+                       const std::complex<PrecisionT> *matrix,
+                       const std::vector<size_t> &wires, bool inverse = false) {
+        assert(wires.size() == 1);
+
+        const auto [indices, externalIndices] = GateIndices(wires, num_qubits);
+        // Per externalIndex, the pair arr[externalIndex + indices[0..1]] is updated.
+        if (inverse) { // out = conj(matrix)^T * (v0, v1)
+            for (const size_t &externalIndex : externalIndices) {
+                std::complex<PrecisionT> *shiftedState = arr + externalIndex;
+                const std::complex<PrecisionT> v0 = shiftedState[indices[0]];
+                const std::complex<PrecisionT> v1 = shiftedState[indices[1]];
+                shiftedState[indices[0]] =
+                    std::conj(matrix[0B00]) * v0 +
+                    std::conj(matrix[0B10]) *
+                        v1; // NOLINT(readability-magic-numbers)
+                shiftedState[indices[1]] =
+                    std::conj(matrix[0B01]) * v0 +
+                    std::conj(matrix[0B11]) *
+                        v1; // NOLINT(readability-magic-numbers)
+            }
+        } else { // out = matrix * (v0, v1)
+            for (const size_t &externalIndex : externalIndices) {
+                std::complex<PrecisionT> *shiftedState = arr + externalIndex;
+                const std::complex<PrecisionT> v0 = shiftedState[indices[0]];
+                const std::complex<PrecisionT> v1 = shiftedState[indices[1]];
+                shiftedState[indices[0]] =
+                    matrix[0B00] * v0 +
+                    matrix[0B01] * v1; // NOLINT(readability-magic-numbers)
+                shiftedState[indices[1]] =
+                    matrix[0B10] * v0 +
+                    matrix[0B11] * v1; // NOLINT(readability-magic-numbers)
+            }
+        }
+    }
+
+    /**
+     * @brief Apply a 4x4 matrix to two wires of the statevector in place.
+     *
+     * @param arr Pointer to the statevector; amplitudes are overwritten.
+     * @param num_qubits Number of qubits.
+     * @param matrix 4x4 matrix in row-major order (entries 0..15 are read).
+     * @param wires Wires the gate applies to; exactly two wires are expected.
+     * @param inverse If true, apply the conjugate transpose of matrix.
+     */
+    template <class PrecisionT>
+    static inline void
+    applyTwoQubitOp(std::complex<PrecisionT> *arr, size_t num_qubits,
+                    const std::complex<PrecisionT> *matrix,
+                    const std::vector<size_t> &wires, bool inverse = false) {
+        assert(wires.size() == 2);
+        const auto [indices, externalIndices] = GateIndices(wires, num_qubits);
+        // Per externalIndex, the four amplitudes arr[externalIndex + indices[0..3]] are updated.
+        if (inverse) { // out[r] = sum_c conj(matrix[4 * c + r]) * v[c]
+            for (const size_t &externalIndex : externalIndices) {
+                std::complex<PrecisionT> *shiftedState = arr + externalIndex;
+
+                const std::complex<PrecisionT> v00 = shiftedState[indices[0]];
+                const std::complex<PrecisionT> v01 = shiftedState[indices[1]];
+                const std::complex<PrecisionT> v10 = shiftedState[indices[2]];
+                const std::complex<PrecisionT> v11 = shiftedState[indices[3]];
+
+                // NOLINTNEXTLINE(readability-magic-numbers)
+                shiftedState[indices[0]] =
+                    std::conj(matrix[0b0000]) * v00 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    std::conj(matrix[0b0100]) * v01 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    std::conj(matrix[0b1000]) * v10 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    std::conj(matrix[0b1100]) * v11;
+                // NOLINTNEXTLINE(readability-magic-numbers)
+                shiftedState[indices[1]] =
+                    std::conj(matrix[0b0001]) * v00 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    std::conj(matrix[0b0101]) * v01 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    std::conj(matrix[0b1001]) * v10 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    std::conj(matrix[0b1101]) * v11;
+                // NOLINTNEXTLINE(readability-magic-numbers)
+                shiftedState[indices[2]] =
+                    std::conj(matrix[0b0010]) * v00 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    std::conj(matrix[0b0110]) * v01 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    std::conj(matrix[0b1010]) * v10 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    std::conj(matrix[0b1110]) * v11;
+                // NOLINTNEXTLINE(readability-magic-numbers)
+                shiftedState[indices[3]] =
+                    std::conj(matrix[0b0011]) * v00 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    std::conj(matrix[0b0111]) * v01 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    std::conj(matrix[0b1011]) * v10 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    std::conj(matrix[0b1111]) * v11;
+            }
+        } else { // out[r] = sum_c matrix[4 * r + c] * v[c]
+            for (const size_t &externalIndex : externalIndices) {
+                std::complex<PrecisionT> *shiftedState = arr + externalIndex;
+
+                const std::complex<PrecisionT> v00 = shiftedState[indices[0]];
+                const std::complex<PrecisionT> v01 = shiftedState[indices[1]];
+                const std::complex<PrecisionT> v10 = shiftedState[indices[2]];
+                const std::complex<PrecisionT> v11 = shiftedState[indices[3]];
+
+                // NOLINTNEXTLINE(readability-magic-numbers)
+                shiftedState[indices[0]] =
+                    matrix[0b0000] * v00 + matrix[0b0001] * v01 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    matrix[0b0010] * v10 + matrix[0b0011] * v11;
+                // NOLINTNEXTLINE(readability-magic-numbers)
+                shiftedState[indices[1]] =
+                    matrix[0b0100] * v00 + matrix[0b0101] * v01 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    matrix[0b0110] * v10 + matrix[0b0111] * v11;
+                // NOLINTNEXTLINE(readability-magic-numbers)
+                shiftedState[indices[2]] =
+                    matrix[0b1000] * v00 + matrix[0b1001] * v01 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    matrix[0b1010] * v10 + matrix[0b1011] * v11;
+                // NOLINTNEXTLINE(readability-magic-numbers)
+                shiftedState[indices[3]] =
+                    matrix[0b1100] * v00 + matrix[0b1101] * v01 +
+                    // NOLINTNEXTLINE(readability-magic-numbers)
+                    matrix[0b1110] * v10 + matrix[0b1111] * v11;
+            }
+        }
+    }
+
     /**
      * @brief Apply a given matrix directly to the statevector.
      *
@@ -93,9 +240,10 @@ class GateImplementationsPI : public PauliGenerator<GateImplementationsPI> {
      * @param inverse Indicate whether inverse should be taken.
      */
     template <class PrecisionT>
-    static void applyMatrix(std::complex<PrecisionT> *arr, size_t num_qubits,
-                            const std::complex<PrecisionT> *matrix,
-                            const std::vector<size_t> &wires, bool inverse) {
+    static void
+    applyMultiQubitOp(std::complex<PrecisionT> *arr, size_t num_qubits,
+                      const std::complex<PrecisionT> *matrix,
+                      const std::vector<size_t> &wires, bool inverse) {
         const auto [indices, externalIndices] = GateIndices(wires, num_qubits);
 
         std::vector<std::complex<PrecisionT>> v(indices.size());
@@ -134,27 +282,6 @@ class GateImplementationsPI : public PauliGenerator<GateImplementationsPI> {
         }
     }
 
-    /**
-     * @brief Apply a given matrix directly to the statevector.
-     *
-     * @param arr Pointer to the statevector.
-     * @param num_qubits Number of qubits.
-     * @param matrix Perfect square matrix in row-major order.
-     * @param wires Wires the gate applies to.
-     * @param inverse Indicate whether inverse should be taken.
-     */
-    template <class PrecisionT>
-    static void applyMatrix(std::complex<PrecisionT> *arr, size_t num_qubits,
-                            const std::vector<std::complex<PrecisionT>> &matrix,
-                            const std::vector<size_t> &wires, bool inverse) {
-        if (matrix.size() != Util::exp2(2 * wires.size())) {
-            throw std::invalid_argument(
-                "The size of matrix does not match with the given "
-                "number of wires");
-        }
-        applyMatrix(arr, num_qubits, matrix.data(), wires, inverse);
-    }
-
     /* Single qubit operators */
     template <class PrecisionT>
     static void applyPauliX(std::complex<PrecisionT> *arr, size_t num_qubits,
diff --git a/pennylane_lightning/src/simulator/CPUMemoryModel.hpp b/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
new file mode 100644
index 0000000000..97b60cf7f4
--- /dev/null
+++ b/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
@@ -0,0 +1,89 @@
+
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/**
+ * @file
+ * Define memory models for CPU
+ */
+#pragma once
+#include "Macros.hpp"
+
+#include <cstdint>
+#include <memory>
+
+namespace Pennylane {
+enum class CPUMemoryModel : uint8_t { // alignment classes of CPU memory
+    Unaligned,  // no alignment beyond alignof of the element type
+    Aligned256, // 32-byte (256-bit) alignment
+    Aligned512, // 64-byte (512-bit) alignment
+    END,        // one past the last model; not a model itself
+    BEGIN = Unaligned, // first model; with END presumably bounds enum iteration
+};
+/** @brief Strictest CPUMemoryModel satisfied by the address ptr. */
+inline auto getMemoryModel(const void *ptr) -> CPUMemoryModel {
+    if ((reinterpret_cast<uintptr_t>(ptr) % 64) == 0) {
+        return CPUMemoryModel::Aligned512;
+    }
+    // Not 64-byte aligned: try the weaker 32-byte requirement.
+    if ((reinterpret_cast<uintptr_t>(ptr) % 32) == 0) {
+        return CPUMemoryModel::Aligned256;
+    }
+
+    return CPUMemoryModel::Unaligned;
+}
+/** @brief Memory model chosen at compile time: Aligned512 if use_avx512f, else Aligned256 if use_avx2, else Unaligned. */
+constexpr inline auto bestCPUMemoryModel() -> CPUMemoryModel {
+    if constexpr (use_avx512f) {
+        return CPUMemoryModel::Aligned512;
+    } else if constexpr (use_avx2) {
+        return CPUMemoryModel::Aligned256;
+    }
+    return CPUMemoryModel::Unaligned;
+}
+/** @brief Byte alignment for memory_model; Unaligned maps to alignof(PrecisionT). */
+template <class PrecisionT>
+constexpr inline auto getAlignment(CPUMemoryModel memory_model) -> size_t {
+    switch (memory_model) {
+    case CPUMemoryModel::Unaligned:
+        return alignof(PrecisionT);
+    case CPUMemoryModel::Aligned256:
+        return 32U;
+    case CPUMemoryModel::Aligned512:
+        return 64U;
+    default:
+        break;
+    }
+    PL_UNREACHABLE; // reached only for values outside the three models above (e.g. END)
+}
+/** @brief Allocate an array of size objects of type T with the alignment implied by memory_model. */
+template <typename T>
+auto allocateMemory(CPUMemoryModel memory_model, size_t size)
+    // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
+    -> std::unique_ptr<T[]> {
+    switch (memory_model) {
+    case CPUMemoryModel::Unaligned:
+        // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
+        return std::unique_ptr<T[]>{new T[size]};
+    case CPUMemoryModel::Aligned256: // NOTE(review): aligned new[] here and below is freed by unique_ptr's plain delete[]; confirm the deallocation function matches
+        // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
+        return std::unique_ptr<T[]>{new (std::align_val_t(32)) T[size]};
+    case CPUMemoryModel::Aligned512:
+        // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
+        return std::unique_ptr<T[]>{new (std::align_val_t(64)) T[size]};
+    default:
+        break;
+    }
+    PL_UNREACHABLE; // reached only for values outside the three models above (e.g. END)
+}
+} // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
index 3259ad5861..5cb25cbd93 100644
--- a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
+++ b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
@@ -13,34 +13,99 @@
 // limitations under the License.
 /**
  * @file
+ * Set/get Default kernels for statevector
  */
 #include "DispatchKeys.hpp"
 #include "GateOperation.hpp"
+#include "IntegerInterval.hpp"
 #include "KernelType.hpp"
+#include "Util.hpp"
 
 #include <functional>
 #include <unordered_map>
+#include <unordered_set>
+#include <utility>
 
 namespace Pennylane {
 
-inline auto larger_than(size_t size) {
-    return [=](size_t num_qubits) { return num_qubits > size; };
-}
-inline auto larger_than_equal_to(size_t size) {
-    return [=](size_t num_qubits) { return num_qubits >= size; };
-}
-inline auto less_than(size_t size) {
-    return [=](size_t num_qubits) { return num_qubits < size; };
-}
-inline auto less_than_equal_to(size_t size) {
-    return [=](size_t num_qubits) { return num_qubits <= size; };
+///@cond DEV
+struct DispatchElement { // one (priority, qubit-count interval, kernel) dispatch rule
+    uint32_t priority; // larger values are placed earlier in PriorityDispatchSet and so win in getKernel
+    Util::IntegerInterval<size_t> interval; // tested as interval(num_qubits) to decide whether the rule applies
+    Gates::KernelType kernel; // kernel reported when the rule applies
+};
+/** @brief True when lhs has a strictly lower priority than rhs. */
+inline bool lower_priority(const DispatchElement &lhs,
+                           const DispatchElement &rhs) {
+    return lhs.priority < rhs.priority;
 }
-inline auto in_between_closed(size_t l1, size_t l2) {
-    return [=](size_t num_qubits) {
-        return (l1 <= num_qubits) && (num_qubits <= l2);
-    };
+/** @brief True when lhs has a strictly higher priority than rhs; the sort order of PriorityDispatchSet. */
+inline bool higher_priority(const DispatchElement &lhs,
+                            const DispatchElement &rhs) {
+    return lhs.priority > rhs.priority;
 }
 
+/**
+ * @brief Dispatch entries stored in a vector sorted by decreasing priority;
+ * getKernel() returns the kernel of the first entry matching a qubit count.
+ */
+class PriorityDispatchSet {
+  private:
+    std::vector<DispatchElement> ordered_vec_;
+
+  public:
+    [[nodiscard]] bool // true if test_interval is not disjoint from a stored interval of equal priority
+    conflict(uint32_t test_priority,
+             const Util::IntegerInterval<size_t> &test_interval) const {
+        const auto test_elt = DispatchElement{test_priority, test_interval,
+                                              Gates::KernelType::None}; // kernel is a placeholder; only priority is compared
+        const auto [b, e] =
+            std::equal_range(ordered_vec_.begin(), ordered_vec_.end(), test_elt,
+                             higher_priority); // [b, e): entries whose priority equals test_priority
+        for (auto iter = b; iter != e; ++iter) {
+            if (!is_disjoint(iter->interval, test_interval)) {
+                return true;
+            }
+        }
+        return false;
+    }
+    /** @brief Insert elt after all entries of equal or higher priority, keeping the vector sorted. */
+    void insert(const DispatchElement &elt) {
+        const auto iter_to_insert = std::upper_bound(
+            ordered_vec_.begin(), ordered_vec_.end(), elt, &higher_priority);
+        ordered_vec_.insert(iter_to_insert, elt);
+    }
+    /** @brief Build a DispatchElement from args and insert it at the position insert() would use. */
+    template <typename... Ts> void emplace(Ts &&...args) {
+        const auto elt = DispatchElement{std::forward<Ts>(args)...};
+        const auto iter_to_insert = std::upper_bound(
+            ordered_vec_.begin(), ordered_vec_.end(), elt, &higher_priority);
+        ordered_vec_.insert(iter_to_insert, elt);
+    }
+    /** @brief Kernel of the first (highest-priority) entry whose interval accepts num_qubits; throws std::range_error if none. */
+    [[nodiscard]] Gates::KernelType getKernel(size_t num_qubits) const {
+        for (const auto &elt : ordered_vec_) {
+            if (elt.interval(num_qubits)) {
+                return elt.kernel;
+            }
+        }
+        throw std::range_error(
+            "Cannot find a kernel for the given number of qubits.");
+    }
+    /** @brief Erase every entry whose priority equals remove_priority. */
+    void clearPriority(uint32_t remove_priority) {
+        const auto begin = std::lower_bound(
+            ordered_vec_.begin(), ordered_vec_.end(), remove_priority,
+            [](const auto &elt, uint32_t p) { return elt.priority > p; }); // first entry with priority <= p
+        const auto end = std::upper_bound(
+            ordered_vec_.begin(), ordered_vec_.end(), remove_priority,
+            [](uint32_t p, const auto &elt) { return p > elt.priority; }); // first entry with priority < p
+        ordered_vec_.erase(begin, end);
+    }
+};
+
+///@endcond
+
 class DefaultKernelsForStateVector {
   private:
     const static inline std::unordered_map<CPUMemoryModel,
@@ -55,23 +120,31 @@ class DefaultKernelsForStateVector {
         };
 
     std::unordered_map<
-        Gates::GateOperation,
-        std::vector<std::tuple<uint32_t, std::function<bool(size_t)>,
-                               Gates::KernelType>>>
+        std::pair<Gates::GateOperation, uint32_t /* dispatch_key */>,
+        PriorityDispatchSet, Util::PairHash>
         gate_kernel_map_;
 
     std::unordered_map<
-        Gates::GeneratorOperation,
-        std::vector<std::tuple<uint32_t, std::function<bool(size_t)>,
-                               Gates::KernelType>>>
+        std::pair<Gates::GeneratorOperation, uint32_t /* dispatch_key */>,
+        PriorityDispatchSet, Util::PairHash>
         generator_kernel_map_;
 
+    std::unordered_map<
+        std::pair<Gates::MatrixOperation, uint32_t /* dispatch_key */>,
+        PriorityDispatchSet, Util::PairHash>
+        matrix_kernel_map_;
+
     void registerDefaultGates() {
         using Gates::GateOperation;
+        using Util::full_domain;
+        using Util::in_between_closed;
+        using Util::larger_than;
+        using Util::larger_than_equal_to;
+        using Util::less_than;
+        using Util::less_than_equal_to;
+
         auto &instance = *this;
-        auto all_qubit_numbers = []([[maybe_unused]] size_t num_qubits) {
-            return true;
-        };
+        auto all_qubit_numbers = full_domain<size_t>();
         /* Single-qubit gates */
         instance.assignKernelForGate(GateOperation::PauliX, all_threading,
                                      all_memory_model, all_qubit_numbers,
@@ -126,15 +199,15 @@ class DefaultKernelsForStateVector {
         instance.assignKernelForGate(
             GateOperation::IsingXX, all_threading,
             // NOLINTNEXTLINE(readability-magic-numbers)
-            all_memory_model, less_than(12), Gates::KernelType::LM);
+            all_memory_model, less_than<size_t>(12), Gates::KernelType::LM);
         instance.assignKernelForGate(
             GateOperation::IsingXX, all_threading, all_memory_model,
             // NOLINTNEXTLINE(readability-magic-numbers)
-            in_between_closed(12, 20), Gates::KernelType::PI);
+            in_between_closed<size_t>(12, 20), Gates::KernelType::PI);
         instance.assignKernelForGate(
             GateOperation::IsingXX, all_threading,
             // NOLINTNEXTLINE(readability-magic-numbers)
-            all_memory_model, larger_than(20), Gates::KernelType::LM);
+            all_memory_model, larger_than<size_t>(20), Gates::KernelType::LM);
 
         instance.assignKernelForGate(GateOperation::IsingYY, all_threading,
                                      all_memory_model, all_qubit_numbers,
@@ -166,12 +239,18 @@ class DefaultKernelsForStateVector {
     }
 
     void registerDefaultGenerators() {
+        using Gates::GateOperation;
         using Gates::GeneratorOperation;
         using Gates::KernelType;
+        using Util::full_domain;
+        using Util::in_between_closed;
+        using Util::larger_than;
+        using Util::larger_than_equal_to;
+        using Util::less_than;
+        using Util::less_than_equal_to;
+
         auto &instance = *this;
-        auto all_qubit_numbers = []([[maybe_unused]] size_t num_qubits) {
-            return true;
-        };
+        auto all_qubit_numbers = full_domain<size_t>();
 
         instance.assignKernelForGenerator(GeneratorOperation::PhaseShift,
                                           all_threading, all_memory_model,
@@ -211,9 +290,35 @@ class DefaultKernelsForStateVector {
                                           all_qubit_numbers, KernelType::LM);
     }
 
+    void registerDefaultMatrices() { // called from the constructor to seed matrix_kernel_map_
+        using Gates::GateOperation;
+        using Gates::KernelType;
+        using Gates::MatrixOperation;
+        using Util::full_domain;
+        using Util::in_between_closed;
+        using Util::larger_than;
+        using Util::larger_than_equal_to;
+        using Util::less_than;
+        using Util::less_than_equal_to;
+        // Only KernelType, MatrixOperation and full_domain are used below.
+        auto &instance = *this;
+        auto all_qubit_numbers = full_domain<size_t>();
+
+        instance.assignKernelForMatrix(MatrixOperation::SingleQubitOp,
+                                       all_threading, all_memory_model,
+                                       all_qubit_numbers, KernelType::LM);
+        instance.assignKernelForMatrix(MatrixOperation::TwoQubitOp,
+                                       all_threading, all_memory_model,
+                                       all_qubit_numbers, KernelType::LM);
+        instance.assignKernelForMatrix(MatrixOperation::MultiQubitOp,
+                                       all_threading, all_memory_model,
+                                       all_qubit_numbers, KernelType::LM);
+    }
+
     DefaultKernelsForStateVector() {
         registerDefaultGates();
         registerDefaultGenerators();
+        registerDefaultMatrices();
     }
 
   public:
@@ -230,108 +335,197 @@ class DefaultKernelsForStateVector {
         return instance;
     }
 
-    void
-    assignKernelForGate(Gates::GateOperation gate_op, Threading threading,
-                        CPUMemoryModel memory_model,
-                        const std::function<bool(size_t)> &num_qubits_criterion,
-                        Gates::KernelType kernel) {
+    void assignKernelForGate(Gates::GateOperation gate_op, Threading threading,
+                             CPUMemoryModel memory_model, uint32_t priority,
+                             const Util::IntegerInterval<size_t> &interval,
+                             Gates::KernelType kernel) { // register kernel for (gate_op, threading, memory_model) on interval
+        if (std::find(allowed_kernels.at(memory_model).cbegin(),
+                      allowed_kernels.at(memory_model).cend(),
+                      kernel) == allowed_kernels.at(memory_model).cend()) {
+            throw std::invalid_argument("The given kernel is not allowed for "
+                                        "the given memory model.");
+        }
+        const auto dispatch_key = toDispatchKey(threading, memory_model);
+        auto &set = gate_kernel_map_[std::make_pair(gate_op, dispatch_key)];
+        // Refuse an interval that conflicts with an existing entry of the same priority.
+        if (set.conflict(priority, interval)) {
+            throw std::invalid_argument("The given interval conflicts with "
+                                        "existing intervals.");
+        }
+        set.emplace(priority, interval, kernel);
+    }
+    /** @brief Register kernel for gate_op with one memory_model, for each Threading value visited by for_each_enum. */
+    void assignKernelForGate(Gates::GateOperation gate_op,
+                             [[maybe_unused]] AllThreading dummy,
+                             CPUMemoryModel memory_model,
+                             const Util::IntegerInterval<size_t> &interval,
+                             Gates::KernelType kernel) {
+        /* Priority for all threading is 1 */
+        Util::for_each_enum<Threading>([=](Threading threading) {
+            assignKernelForGate(gate_op, threading, memory_model, 1, interval,
+                                kernel);
+        });
+    }
+    /** @brief Register kernel for gate_op with one threading value, for each CPUMemoryModel visited by for_each_enum. */
+    void assignKernelForGate(Gates::GateOperation gate_op, Threading threading,
+                             [[maybe_unused]] AllMemoryModel dummy,
+                             const Util::IntegerInterval<size_t> &interval,
+                             Gates::KernelType kernel) {
+        /* Priority for all memory model is 2 */
+        Util::for_each_enum<CPUMemoryModel>([=](CPUMemoryModel memory_model) {
+            assignKernelForGate(gate_op, threading, memory_model, 2, interval,
+                                kernel);
+        });
+    }
+    /** @brief Register kernel for gate_op for each (Threading, CPUMemoryModel) pair visited by for_each_enum. */
+    void assignKernelForGate(Gates::GateOperation gate_op,
+                             [[maybe_unused]] AllThreading dummy1,
+                             [[maybe_unused]] AllMemoryModel dummy2,
+                             const Util::IntegerInterval<size_t> &interval,
+                             Gates::KernelType kernel) {
+        /* Priority is 0 */
+        Util::for_each_enum<Threading, CPUMemoryModel>(
+            [=](Threading threading, CPUMemoryModel memory_model) {
+                assignKernelForGate(gate_op, threading, memory_model, 0,
+                                    interval, kernel);
+            });
+    }
+
+    void assignKernelForGenerator(Gates::GeneratorOperation gntr_op,
+                                  Threading threading,
+                                  CPUMemoryModel memory_model,
+                                  uint32_t priority,
+                                  const Util::IntegerInterval<size_t> &interval,
+                                  Gates::KernelType kernel) {
         if (std::find(allowed_kernels.at(memory_model).cbegin(),
                       allowed_kernels.at(memory_model).cend(),
                       kernel) == allowed_kernels.at(memory_model).cend()) {
             throw std::invalid_argument("The given kernel is now allowed for "
                                         "the given memory model.");
         }
-        gate_kernel_map_[gate_op].emplace_back(
-            toDispatchKey(threading, memory_model), num_qubits_criterion,
-            kernel);
+        const auto dispatch_key = toDispatchKey(threading, memory_model);
+        auto &set =
+            generator_kernel_map_[std::make_pair(gntr_op, dispatch_key)];
+
+        if (set.conflict(priority, interval)) {
+            throw std::invalid_argument("The given interval conflicts with "
+                                        "existing intervals.");
+        }
+        set.emplace(priority, interval, kernel);
     }
 
-    void
-    assignKernelForGate(Gates::GateOperation gate_op,
-                        [[maybe_unused]] AllThreading dummy,
-                        CPUMemoryModel memory_model,
-                        const std::function<bool(size_t)> &num_qubits_criterion,
-                        Gates::KernelType kernel) {
+    void assignKernelForGenerator(Gates::GeneratorOperation gntr_op,
+                                  [[maybe_unused]] AllThreading dummy,
+                                  CPUMemoryModel memory_model,
+                                  const Util::IntegerInterval<size_t> &interval,
+                                  Gates::KernelType kernel) {
         Util::for_each_enum<Threading>([=](Threading threading) {
-            assignKernelForGate(gate_op, threading, memory_model,
-                                num_qubits_criterion, kernel);
+            assignKernelForGenerator(gntr_op, threading, memory_model, 1,
+                                     interval, kernel);
         });
     }
 
-    void
-    assignKernelForGate(Gates::GateOperation gate_op, Threading threading,
-                        [[maybe_unused]] AllMemoryModel dummy,
-                        const std::function<bool(size_t)> &num_qubits_criterion,
-                        Gates::KernelType kernel) {
+    void assignKernelForGenerator(Gates::GeneratorOperation gntr_op,
+                                  Threading threading,
+                                  [[maybe_unused]] AllMemoryModel dummy,
+                                  const Util::IntegerInterval<size_t> &interval,
+                                  Gates::KernelType kernel) {
         Util::for_each_enum<CPUMemoryModel>([=](CPUMemoryModel memory_model) {
-            assignKernelForGate(gate_op, threading, memory_model,
-                                num_qubits_criterion, kernel);
+            assignKernelForGenerator(gntr_op, threading, memory_model, 2,
+                                     interval, kernel);
         });
     }
 
-    void
-    assignKernelForGate(Gates::GateOperation gate_op,
-                        [[maybe_unused]] AllThreading dummy1,
-                        [[maybe_unused]] AllMemoryModel dummy2,
-                        const std::function<bool(size_t)> &num_qubits_criterion,
-                        Gates::KernelType kernel) {
+    void assignKernelForGenerator(Gates::GeneratorOperation gntr_op,
+                                  [[maybe_unused]] AllThreading dummy1,
+                                  [[maybe_unused]] AllMemoryModel dummy2,
+                                  const Util::IntegerInterval<size_t> &interval,
+                                  Gates::KernelType kernel) {
         Util::for_each_enum<Threading, CPUMemoryModel>(
             [=](Threading threading, CPUMemoryModel memory_model) {
-                assignKernelForGate(gate_op, threading, memory_model,
-                                    num_qubits_criterion, kernel);
+                assignKernelForGenerator(gntr_op, threading, memory_model, 0,
+                                         interval, kernel);
             });
     }
 
-    void assignKernelForGenerator(
-        Gates::GeneratorOperation gntr_op, Threading threading,
-        CPUMemoryModel memory_model,
-        const std::function<bool(size_t)> &num_qubits_criterion,
-        Gates::KernelType kernel) {
+    void assignKernelForMatrix(Gates::MatrixOperation mat_op,
+                               Threading threading, CPUMemoryModel memory_model,
+                               uint32_t priority,
+                               const Util::IntegerInterval<size_t> &interval,
+                               Gates::KernelType kernel) {
         if (std::find(allowed_kernels.at(memory_model).cbegin(),
                       allowed_kernels.at(memory_model).cend(),
                       kernel) == allowed_kernels.at(memory_model).cend()) {
             throw std::invalid_argument("The given kernel is now allowed for "
                                         "the given memory model.");
         }
-        generator_kernel_map_[gntr_op].emplace_back(
-            toDispatchKey(threading, memory_model), num_qubits_criterion,
-            kernel);
+        const auto dispatch_key = toDispatchKey(threading, memory_model);
+        auto &set = matrix_kernel_map_[std::make_pair(mat_op, dispatch_key)];
+
+        if (set.conflict(priority, interval)) {
+            throw std::invalid_argument("The given interval conflicts with "
+                                        "existing intervals.");
+        }
+        set.emplace(priority, interval, kernel);
     }
 
-    void assignKernelForGenerator(
-        Gates::GeneratorOperation gntr_op, [[maybe_unused]] AllThreading dummy,
-        CPUMemoryModel memory_model,
-        const std::function<bool(size_t)> &num_qubits_criterion,
-        Gates::KernelType kernel) {
+    void assignKernelForMatrix(Gates::MatrixOperation mat_op,
+                               [[maybe_unused]] AllThreading dummy,
+                               CPUMemoryModel memory_model,
+                               const Util::IntegerInterval<size_t> &interval,
+                               Gates::KernelType kernel) {
         Util::for_each_enum<Threading>([=](Threading threading) {
-            assignKernelForGenerator(gntr_op, threading, memory_model,
-                                     num_qubits_criterion, kernel);
+            assignKernelForMatrix(mat_op, threading, memory_model, 1, interval,
+                                  kernel);
         });
     }
 
-    void assignKernelForGenerator(
-        Gates::GeneratorOperation gntr_op, Threading threading,
-        [[maybe_unused]] AllMemoryModel dummy,
-        const std::function<bool(size_t)> &num_qubits_criterion,
-        Gates::KernelType kernel) {
+    void assignKernelForMatrix(Gates::MatrixOperation mat_op,
+                               Threading threading,
+                               [[maybe_unused]] AllMemoryModel dummy,
+                               const Util::IntegerInterval<size_t> &interval,
+                               Gates::KernelType kernel) {
         Util::for_each_enum<CPUMemoryModel>([=](CPUMemoryModel memory_model) {
-            assignKernelForGenerator(gntr_op, threading, memory_model,
-                                     num_qubits_criterion, kernel);
+            assignKernelForMatrix(mat_op, threading, memory_model, 2, interval,
+                                  kernel);
         });
     }
 
-    void assignKernelForGenerator(
-        Gates::GeneratorOperation gntr_op, [[maybe_unused]] AllThreading dummy1,
-        [[maybe_unused]] AllMemoryModel dummy2,
-        const std::function<bool(size_t)> &num_qubits_criterion,
-        Gates::KernelType kernel) {
+    void assignKernelForMatrix(Gates::MatrixOperation mat_op,
+                               [[maybe_unused]] AllThreading dummy1,
+                               [[maybe_unused]] AllMemoryModel dummy2,
+                               const Util::IntegerInterval<size_t> &interval,
+                               Gates::KernelType kernel) {
         Util::for_each_enum<Threading, CPUMemoryModel>(
             [=](Threading threading, CPUMemoryModel memory_model) {
-                assignKernelForGenerator(gntr_op, threading, memory_model,
-                                         num_qubits_criterion, kernel);
+                assignKernelForMatrix(mat_op, threading, memory_model, 0,
+                                      interval, kernel);
             });
     }
 
+    /**
+     * @brief Create default kernels for all gates
+     * @param num_qubits Number of qubits
+     * @param threading Threading context
+     * @param memory_model Memory model of the underlying data
+     */
+    auto getGateKernelMap(size_t num_qubits, Threading threading,
+                          CPUMemoryModel memory_model) const
+        -> std::unordered_map<Gates::GateOperation, Gates::KernelType> {
+        uint32_t dispatch_key = toDispatchKey(threading, memory_model);
+
+        std::unordered_map<Gates::GateOperation, Gates::KernelType>
+            kernel_for_gates;
+
+        Util::for_each_enum<Gates::GateOperation>(
+            [&](Gates::GateOperation gate_op) {
+                const auto key = std::make_pair(gate_op, dispatch_key);
+                const auto &set = gate_kernel_map_.at(key);
+                kernel_for_gates.emplace(gate_op, set.getKernel(num_qubits));
+            });
+        return kernel_for_gates;
+    }
+
     /**
      * @brief Create default kernels for all generators
      * @param num_qubits Number of qubits
@@ -346,58 +540,54 @@ class DefaultKernelsForStateVector {
         std::unordered_map<Gates::GeneratorOperation, Gates::KernelType>
             kernel_for_generators;
 
-        for (auto generator = Gates::GeneratorOperation::BEGIN;
-             generator != Gates::GeneratorOperation::END;
-             generator = static_cast<Gates::GeneratorOperation>(
-                 static_cast<uint32_t>(generator) + 1)) {
-
-            const auto iter =
-                std::find_if(generator_kernel_map_.at(generator).cbegin(),
-                             generator_kernel_map_.at(generator).cend(),
-                             [dispatch_key = dispatch_key,
-                              num_qubits = num_qubits](const auto &t) {
-                                 return (std::get<0>(t) == dispatch_key &&
-                                         std::get<1>(t)(num_qubits));
-                             });
-            if (iter == generator_kernel_map_.at(generator).cend()) {
-                throw std::range_error("Cannot find registered kernel for a "
-                                       "dispatch key and number of qubits.");
-            }
-            kernel_for_generators.emplace(generator, std::get<2>(*iter));
-        }
+        Util::for_each_enum<Gates::GeneratorOperation>(
+            [&](Gates::GeneratorOperation gntr_op) {
+                const auto key = std::make_pair(gntr_op, dispatch_key);
+                const auto &set = generator_kernel_map_.at(key);
+                kernel_for_generators.emplace(gntr_op,
+                                              set.getKernel(num_qubits));
+            });
         return kernel_for_generators;
     }
 
-    auto getGateKernelMap(size_t num_qubits, Threading threading,
-                          CPUMemoryModel memory_model) const
-        -> std::unordered_map<Gates::GateOperation, Gates::KernelType> {
+    /**
+     * @brief Create default kernels for all matrix operations
+     * @param num_qubits Number of qubits
+     * @param threading Threading context
+     * @param memory_model Memory model of the underlying data
+     */
+    auto getMatrixKernelMap(size_t num_qubits, Threading threading,
+                            CPUMemoryModel memory_model) const
+        -> std::unordered_map<Gates::MatrixOperation, Gates::KernelType> {
         uint32_t dispatch_key = toDispatchKey(threading, memory_model);
 
-        std::unordered_map<Gates::GateOperation, Gates::KernelType>
-            kernel_for_gates;
+        std::unordered_map<Gates::MatrixOperation, Gates::KernelType>
+            kernel_for_matrices;
 
-        for (auto gate = Gates::GateOperation::BEGIN;
-             gate != Gates::GateOperation::END;
-             gate = static_cast<Gates::GateOperation>(
-                 static_cast<uint32_t>(gate) + 1)) {
+        Util::for_each_enum<Gates::MatrixOperation>(
+            [&](Gates::MatrixOperation mat_op) {
+                const auto key = std::make_pair(mat_op, dispatch_key);
+                const auto &set = matrix_kernel_map_.at(key);
+                kernel_for_matrices.emplace(mat_op, set.getKernel(num_qubits));
+            });
+        return kernel_for_matrices;
+    }
 
-            if (gate == Gates::GateOperation::Matrix) {
-                continue;
-            }
+    void removeKernelForGenerator(Gates::GateOperation gate_op,
+                                  Threading threading,
+                                  CPUMemoryModel memory_model,
+                                  uint32_t priority) {
+        uint32_t dispatch_key = toDispatchKey(threading, memory_model);
+        gate_kernel_map_[std::make_pair(gate_op, dispatch_key)].clearPriority(
+            priority); // NOTE(review): acts on gates, not generators - confirm
+    }
 
-            const auto iter = std::find_if(
-                gate_kernel_map_.at(gate).cbegin(),
-                gate_kernel_map_.at(gate).cend(), [=](const auto &t) {
-                    return (std::get<0>(t) == dispatch_key &&
-                            std::get<1>(t)(num_qubits));
-                });
-            if (iter == gate_kernel_map_.at(gate).cend()) {
-                throw std::range_error("Cannot find registered kernel for a "
-                                       "dispatch key and number of qubits.");
-            }
-            kernel_for_gates.emplace(gate, std::get<2>(*iter));
-        }
-        return kernel_for_gates;
+    void removeKernelForMatrix(Gates::MatrixOperation mat_op,
+                               Threading threading, CPUMemoryModel memory_model,
+                               uint32_t priority) {
+        uint32_t dispatch_key = toDispatchKey(threading, memory_model);
+        matrix_kernel_map_[std::make_pair(mat_op, dispatch_key)].clearPriority(
+            priority);
     }
 };
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/DispatchKeys.hpp b/pennylane_lightning/src/simulator/DispatchKeys.hpp
index 08265c9c59..a6d2f4ba94 100644
--- a/pennylane_lightning/src/simulator/DispatchKeys.hpp
+++ b/pennylane_lightning/src/simulator/DispatchKeys.hpp
@@ -17,6 +17,7 @@
  */
 #pragma once
 
+#include "CPUMemoryModel.hpp"
 #include "Macros.hpp"
 
 #include <cstdint>
@@ -33,14 +34,6 @@ enum class Threading : uint8_t {
     BEGIN = SingleThread,
 };
 
-enum class CPUMemoryModel : uint8_t {
-    Unaligned,
-    Aligned256,
-    Aligned512,
-    END,
-    BEGIN = Unaligned,
-};
-
 constexpr uint32_t toDispatchKey(Threading threading,
                                  CPUMemoryModel memory_model) {
     /* Threading is in higher priority */
@@ -48,18 +41,6 @@ constexpr uint32_t toDispatchKey(Threading threading,
            static_cast<uint32_t>(memory_model);
 }
 
-inline auto getMemoryModel(const void *ptr) -> CPUMemoryModel {
-    if ((reinterpret_cast<uintptr_t>(ptr) % 64) == 0) {
-        return CPUMemoryModel::Aligned512;
-    }
-
-    if ((reinterpret_cast<uintptr_t>(ptr) % 32) == 0) {
-        return CPUMemoryModel::Aligned256;
-    }
-
-    return CPUMemoryModel::Unaligned;
-}
-
 /**
  * @brief Choose the best threading based on the current context.
  */
@@ -75,13 +56,4 @@ inline auto bestThreading() -> Threading {
     return Threading::SingleThread;
 }
 
-constexpr inline auto bestCPUMemoryModel() -> CPUMemoryModel {
-    if constexpr (use_avx512f) {
-        return CPUMemoryModel::Aligned512;
-    } else if (use_avx2) {
-        return CPUMemoryModel::Aligned256;
-    }
-    return CPUMemoryModel::Unaligned;
-}
-
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.cpp b/pennylane_lightning/src/simulator/DynamicDispatcher.cpp
index 22187d4fcf..4d4851d3d6 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.cpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.cpp
@@ -71,19 +71,11 @@ constexpr auto constructGateOpsFunctorTupleIter() {
     } else if (gate_idx < GateImplementation::implemented_gates.size()) {
         constexpr auto gate_op =
             GateImplementation::implemented_gates[gate_idx];
-        if constexpr (gate_op == Gates::GateOperation::Matrix) {
-            /* GateOperation::Matrix is not supported for dynamic dispatch now
-             */
-            return constructGateOpsFunctorTupleIter<
-                PrecisionT, ParamT, GateImplementation, gate_idx + 1>();
-        } else {
-            return prepend_to_tuple(
-                std::pair{gate_op,
-                          gateOpToFunctor<PrecisionT, ParamT,
-                                          GateImplementation, gate_op>()},
-                constructGateOpsFunctorTupleIter<
-                    PrecisionT, ParamT, GateImplementation, gate_idx + 1>());
-        }
+        return prepend_to_tuple(
+            std::pair{gate_op, gateOpToFunctor<PrecisionT, ParamT,
+                                               GateImplementation, gate_op>()},
+            constructGateOpsFunctorTupleIter<
+                PrecisionT, ParamT, GateImplementation, gate_idx + 1>());
     }
 }
 /**
@@ -105,6 +97,25 @@ constexpr auto constructGeneratorOpsFunctorTupleIter() {
                 PrecisionT, GateImplementation, gntr_idx + 1>());
     }
 }
+/**
+ * @brief Internal recursion function for constructMatrixOpsFunctorTuple
+ */
+template <class PrecisionT, class GateImplementation, size_t mat_idx>
+constexpr auto constructMatrixOpsFunctorTupleIter() {
+    if constexpr (mat_idx == GateImplementation::implemented_matrices.size()) {
+        return std::tuple{};
+    } else if (mat_idx < GateImplementation::implemented_matrices.size()) {
+        constexpr auto mat_op =
+            GateImplementation::implemented_matrices[mat_idx];
+        return prepend_to_tuple(
+            std::pair{
+                mat_op,
+                Gates::MatrixOpToMemberFuncPtr<PrecisionT, GateImplementation,
+                                               mat_op>::value},
+            constructMatrixOpsFunctorTupleIter<PrecisionT, GateImplementation,
+                                               mat_idx + 1>());
+    }
+}
 /// @endcond
 
 /**
@@ -122,13 +133,22 @@ constexpr auto gate_op_functor_tuple = constructGateOpsFunctorTupleIter<
  * @brief Tuple of gate operation and function pointer pairs.
  *
  * @tparam PrecisionT Floating point precision of underlying statevector data
- * @tparam ParamT Floating point type of gate parameters
  * @tparam GateImplementation Gate implementation class.
  */
 template <class PrecisionT, class GateImplementation>
 constexpr auto generator_op_functor_tuple =
     constructGeneratorOpsFunctorTupleIter<PrecisionT, GateImplementation, 0>();
 
+/**
+ * @brief Tuple of matrix operation and function pointer pairs
+ *
+ * @tparam PrecisionT Floating point precision of underlying statevector data
+ * @tparam GateImplementation Gate implementation class.
+ */
+template <class PrecisionT, class GateImplementation>
+constexpr auto matrix_op_functor_tuple =
+    constructMatrixOpsFunctorTupleIter<PrecisionT, GateImplementation, 0>();
+
 /**
  * @brief Register all implemented gates for a given kernel
  *
@@ -172,12 +192,36 @@ void registerAllImplementedGeneratorOps() {
             return gntr_op;
         };
 
-    [[maybe_unused]] const auto registerd_gate_ops = std::apply(
+    [[maybe_unused]] const auto registerd_gntr_ops = std::apply(
         [&registerGeneratorToDispatcher](auto... elt) {
             return std::make_tuple(registerGeneratorToDispatcher(elt)...);
         },
         generator_op_functor_tuple<PrecisionT, GateImplementation>);
 }
+/**
+ * @brief Register all implemented matrix operations for a given kernel
+ *
+ * @tparam PrecisionT Floating point precision of underlying statevector data
+ * @tparam GateImplementation Gate implementation class.
+ */
+template <class PrecisionT, class GateImplementation>
+void registerAllImplementedMatrixOps() {
+    auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
+
+    auto registerMatrixToDispatcher = [&dispatcher](
+                                          const auto &mat_op_func_pair) {
+        const auto &[mat_op, func] = mat_op_func_pair;
+        dispatcher.registerMatrixOperation(mat_op,
+                                           GateImplementation::kernel_id, func);
+        return mat_op;
+    };
+
+    [[maybe_unused]] const auto registerd_mat_ops = std::apply(
+        [&registerMatrixToDispatcher](auto... elt) {
+            return std::make_tuple(registerMatrixToDispatcher(elt)...);
+        },
+        matrix_op_functor_tuple<PrecisionT, GateImplementation>);
+}
 
 /// @cond DEV
 /**
@@ -193,6 +237,7 @@ void registerKernelIter() {
                                       typename TypeList::Type>();
         registerAllImplementedGeneratorOps<PrecisionT,
                                            typename TypeList::Type>();
+        registerAllImplementedMatrixOps<PrecisionT, typename TypeList::Type>();
         registerKernelIter<PrecisionT, ParamT, typename TypeList::Next>();
     }
 }
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index 284e221002..46fc68ab81 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -24,6 +24,9 @@
 #include "Error.hpp"
 #include "GateUtil.hpp"
 #include "KernelType.hpp"
+#include "Macros.hpp"
+#include "OpToMemberFuncPtr.hpp"
+#include "Util.hpp"
 
 #include <cassert>
 #include <complex>
@@ -35,12 +38,6 @@
 
 /// @cond DEV
 namespace Pennylane::Internal {
-struct PairHash {
-    template <typename T, typename U>
-    size_t operator()(const std::pair<T, U> &p) const {
-        return std::hash<T>()(p.first) ^ std::hash<U>()(p.second);
-    }
-};
 /**
  * @brief Register all implemented gates for all available kernels.
  *
@@ -82,23 +79,27 @@ template <typename PrecisionT> class DynamicDispatcher {
         const std::vector<size_t> & /*wires*/, bool /*inverse*/,
         const std::vector<PrecisionT> & /*params*/)>;
 
-    using GeneratorFunc = PrecisionT (*)(std::complex<PrecisionT> * /*data*/,
-                                         size_t /*num_qubits*/,
-                                         const std::vector<size_t> & /*wires*/,
-                                         bool /*adjoint*/);
+    using GeneratorFunc = Gates::GeneratorFuncPtrT<PrecisionT>;
+    using MatrixFunc = std::function<void(std::complex<PrecisionT> *, size_t,
+                                          const std::complex<PrecisionT> *,
+                                          const std::vector<size_t> &, bool)>;
 
   private:
     std::unordered_map<std::string, Gates::GateOperation> str_to_gates_;
     std::unordered_map<std::string, Gates::GeneratorOperation> str_to_gntrs_;
 
     std::unordered_map<std::pair<Gates::GateOperation, Gates::KernelType>,
-                       GateFunc, Internal::PairHash>
+                       GateFunc, Util::PairHash>
         gates_;
 
     std::unordered_map<std::pair<Gates::GeneratorOperation, Gates::KernelType>,
-                       GeneratorFunc, Internal::PairHash>
+                       GeneratorFunc, Util::PairHash>
         generators_;
 
+    std::unordered_map<std::pair<Gates::MatrixOperation, Gates::KernelType>,
+                       MatrixFunc, Util::PairHash>
+        matrices_;
+
     constexpr static auto removeGeneratorPrefix(std::string_view op_name)
         -> std::string_view {
         constexpr std::string_view prefix = "Generator";
@@ -161,6 +162,18 @@ template <typename PrecisionT> class DynamicDispatcher {
                             std::forward<FunctionType>(func));
     }
 
+    /**
+     * @brief Register a new matrix operation. Can pass a custom kernel.
+     * @param mat_op Matrix operation to register.
+     * @param kernel Kernel the function is registered for.
+     * @param func Function to store; an existing entry is not replaced.
+     */
+    void registerMatrixOperation(Gates::MatrixOperation mat_op,
+                                 Gates::KernelType kernel, MatrixFunc func) {
+        // TODO: Add mutex when we go to multithreading
+        matrices_.emplace(std::make_pair(mat_op, kernel), std::move(func));
+    }
+
     /**
      * @brief Apply a single gate to the state-vector using the given kernel.
      *
@@ -180,7 +193,8 @@ template <typename PrecisionT> class DynamicDispatcher {
             gates_.find(std::make_pair(strToGateOp(op_name), kernel));
         if (iter == gates_.cend()) {
             throw std::invalid_argument(
-                "Cannot find a gate with a given name \"" + op_name + "\".");
+                "The gate " + op_name +
+                " is not registered for the given kernel");
         }
         (iter->second)(data, num_qubits, wires, inverse, params);
     }
@@ -203,10 +217,10 @@ template <typename PrecisionT> class DynamicDispatcher {
         const auto iter = gates_.find(std::make_pair(gate_op, kernel));
         if (iter == gates_.cend()) {
             throw std::invalid_argument(
-                std::string("Cannot find a gate with a given name \"") +
+                std::string("The gate ") +
                 std::string(
                     Util::lookup(Gates::Constant::gate_names, gate_op)) +
-                "\".");
+                " is not registered for the given kernel");
         }
         (iter->second)(data, num_qubits, wires, inverse, params);
     }
@@ -221,11 +235,12 @@ template <typename PrecisionT> class DynamicDispatcher {
      * @param inverse List of inverses
      * @param params List of parameters
      */
-    void applyOperations(CFP_t *data, size_t num_qubits,
-                         const std::vector<std::string> &ops,
-                         const std::vector<std::vector<size_t>> &wires,
-                         const std::vector<bool> &inverse,
-                         const std::vector<std::vector<PrecisionT>> &params) {
+    void
+    applyOperations(CFP_t *data, size_t num_qubits,
+                    const std::vector<std::string> &ops,
+                    const std::vector<std::vector<size_t>> &wires,
+                    const std::vector<bool> &inverse,
+                    const std::vector<std::vector<PrecisionT>> &params) const {
         const size_t numOperations = ops.size();
         if (numOperations != wires.size() || numOperations != params.size()) {
             throw std::invalid_argument(
@@ -252,7 +267,7 @@ template <typename PrecisionT> class DynamicDispatcher {
     void applyOperations(CFP_t *data, size_t num_qubits,
                          const std::vector<std::string> &ops,
                          const std::vector<std::vector<size_t>> &wires,
-                         const std::vector<bool> &inverse) {
+                         const std::vector<bool> &inverse) const {
         const size_t numOperations = ops.size();
         if (numOperations != wires.size()) {
             throw std::invalid_argument(
@@ -265,6 +280,86 @@ template <typename PrecisionT> class DynamicDispatcher {
         }
     }
 
+    /**
+     * @brief Apply a given matrix directly to the statevector.
+     * @param kernel,mat_op Pair used to look up the registered function.
+     * @param data,num_qubits Pointer to the statevector and number of qubits.
+     * @param matrix Perfect square matrix in row-major order.
+     * @param wires Wires the gate applies to.
+     * @param inverse Indicate whether inverse should be taken.
+     * @throws std::invalid_argument if no function is registered for the pair.
+     */
+    void applyMatrix(Gates::KernelType kernel, CFP_t *data,
+                     Gates::MatrixOperation mat_op, size_t num_qubits,
+                     const std::complex<PrecisionT> *matrix,
+                     const std::vector<size_t> &wires, bool inverse) const {
+        assert(num_qubits >= wires.size()); // checked in debug builds only
+
+        switch (mat_op) {
+        case Gates::MatrixOperation::SingleQubitOp:
+            assert(wires.size() == 1);
+            break;
+        case Gates::MatrixOperation::TwoQubitOp:
+            assert(wires.size() == 2);
+            break;
+        default: // no wire-count check for other operations
+            break;
+        }
+        const auto iter = matrices_.find(std::make_pair(mat_op, kernel));
+        if (iter == matrices_.end()) {
+            throw std::invalid_argument(
+                std::string(
+                    Util::lookup(Gates::Constant::matrix_names, mat_op)) +
+                " is not registered for the given kernel");
+        }
+        (iter->second)(data, num_qubits, matrix, wires, inverse);
+    }
+
+    /**
+     * @brief Check the matrix size, then apply it to the statevector.
+     *
+     * @param kernel,data,mat_op,num_qubits,wires,inverse Forwarded unchanged
+     * to the pointer-based applyMatrix overload.
+     * @param matrix Row-major matrix of Util::exp2(2 * wires.size()) elements.
+     * @throws std::invalid_argument if the matrix size does not match.
+     */
+    void applyMatrix(Gates::KernelType kernel, CFP_t *data,
+                     Gates::MatrixOperation mat_op, size_t num_qubits,
+                     const std::vector<std::complex<PrecisionT>> &matrix,
+                     const std::vector<size_t> &wires, bool inverse) const {
+        if (matrix.size() != Util::exp2(2 * wires.size())) {
+            throw std::invalid_argument(
+                "The size of matrix does not match with the given "
+                "number of wires");
+        }
+        applyMatrix(kernel, data, mat_op, num_qubits, matrix.data(), wires,
+                    inverse);
+    }
+
+    /**
+     * @brief Apply a single generator to the state-vector using the given
+     * kernel.
+     * @param kernel Kernel to run the generator operation.
+     * @param data Pointer to data.
+     * @param num_qubits Number of qubits.
+     * @param gntr_op Generator operation to apply.
+     * @param wires Wires to apply the generator to.
+     * @param adj Indicates whether to use adjoint of the generator.
+     * @return Value returned by the registered generator function.
+     */
+    auto applyGenerator(Gates::KernelType kernel, CFP_t *data,
+                        size_t num_qubits, Gates::GeneratorOperation gntr_op,
+                        const std::vector<size_t> &wires, bool adj) const
+        -> PrecisionT {
+        using Gates::Constant::generator_names;
+        const auto iter = generators_.find(std::make_pair(gntr_op, kernel));
+        if (iter == generators_.cend()) {
+            throw std::invalid_argument(
+                "Cannot find a gate with a given name \"" +
+                std::string(Util::lookup(generator_names, gntr_op)) + "\".");
+        }
+        return (iter->second)(data, num_qubits, wires, adj);
+    }
     /**
      * @brief Apply a single generator to the state-vector using the given
      * kernel.
diff --git a/pennylane_lightning/src/simulator/StateVectorBase.hpp b/pennylane_lightning/src/simulator/StateVectorBase.hpp
index e2b3ac32e8..9861ec0c39 100644
--- a/pennylane_lightning/src/simulator/StateVectorBase.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorBase.hpp
@@ -98,11 +98,12 @@ namespace Pennylane {
  * @tparam PrecisionT Floating point precision of underlying statevector data.
  * @tparam Derived Type of a derived class
  */
-template <class PrecisionT, class Derived> class StateVectorBase {
+template <class T, class Derived> class StateVectorBase {
   public:
     /**
      * @brief StateVector complex precision type.
      */
+    using PrecisionT = T;
     using ComplexPrecisionT = std::complex<PrecisionT>;
 
   private:
@@ -153,6 +154,12 @@ template <class PrecisionT, class Derived> class StateVectorBase {
             gntr_op);
     }
 
+    [[nodiscard]] inline auto
+    getKernelForMatrix(Gates::MatrixOperation mat_op) const
+        -> Gates::KernelType {
+        return static_cast<const Derived *>(this)->getKernelForMatrix(mat_op);
+    }
+
     /**
      * @brief Compare two statevectors.
      *
@@ -288,31 +295,6 @@ template <class PrecisionT, class Derived> class StateVectorBase {
             num_qubits_, opName, wires, adj);
     }
 
-    /**
-     * @brief Apply a given matrix directly to the statevector read directly
-     * from numpy data. Data can be in 1D or 2D format.
-     *
-     * @param matrix Pointer to the array data.
-     * @param wires Wires the gate applies to.
-     * @param inverse Indicate whether inverse should be taken.
-     */
-    template <Gates::KernelType kernel>
-    inline void applyMatrix_(const ComplexPrecisionT *matrix,
-                             const std::vector<size_t> &wires,
-                             bool inverse = false) {
-        auto *arr = getData();
-        Gates::SelectKernel<kernel>::applyMatrix(arr, num_qubits_, matrix,
-                                                 wires, inverse);
-    }
-    template <Gates::KernelType kernel>
-    inline void applyMatrix_(const std::vector<ComplexPrecisionT> &matrix,
-                             const std::vector<size_t> &wires,
-                             bool inverse = false) {
-        auto *arr = getData();
-        Gates::SelectKernel<kernel>::applyMatrix(arr, num_qubits_, matrix,
-                                                 wires, inverse);
-    }
-
     /**
      * @brief Apply a given matrix directly to the statevector read directly
      * from numpy data. Data can be in 1D or 2D format.
@@ -325,33 +307,52 @@ template <class PrecisionT, class Derived> class StateVectorBase {
                             const std::vector<size_t> &wires,
                             bool inverse = false) {
         namespace Constant = Gates::Constant;
-        using Gates::GateOperation;
+        using Gates::MatrixOperation;
         using Gates::SelectKernel;
         using Gates::static_lookup;
 
-        constexpr auto kernel = static_lookup<GateOperation::Matrix>(
-            Constant::default_kernel_for_gates);
-        static_assert(
-            Util::array_has_elt(SelectKernel<kernel>::implemented_gates,
-                                GateOperation::Matrix),
-            "The default kernel for applyMatrix does not implement it.");
-        applyMatrix_<kernel>(matrix, wires, inverse);
+        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
+        auto *arr = getData();
+
+        if (wires.empty()) {
+            throw std::invalid_argument(
+                "Number of wires must be larger than 0");
+        }
+
+        switch (wires.size()) {
+        case 1:
+            dispatcher.applyMatrix(
+                getKernelForMatrix(MatrixOperation::SingleQubitOp), arr,
+                MatrixOperation::SingleQubitOp, num_qubits_, matrix, wires,
+                inverse);
+            return;
+        case 2:
+            dispatcher.applyMatrix(
+                getKernelForMatrix(MatrixOperation::TwoQubitOp), arr,
+                MatrixOperation::TwoQubitOp, num_qubits_, matrix, wires,
+                inverse);
+            return;
+        default:
+            dispatcher.applyMatrix(
+                getKernelForMatrix(MatrixOperation::MultiQubitOp), arr,
+                MatrixOperation::MultiQubitOp, num_qubits_, matrix, wires,
+                inverse);
+            return;
+        }
+        PL_UNREACHABLE;
     }
-    inline void applyMatrix(const std::vector<ComplexPrecisionT> &matrix,
+
+    template <typename Alloc>
+    inline void applyMatrix(const std::vector<ComplexPrecisionT, Alloc> &matrix,
                             const std::vector<size_t> &wires,
                             bool inverse = false) {
-        namespace Constant = Gates::Constant;
-        using Gates::GateOperation;
-        using Gates::SelectKernel;
-        using Gates::static_lookup;
+        if (matrix.size() != Util::exp2(2 * wires.size())) {
+            throw std::invalid_argument(
+                "The size of matrix does not match with the given "
+                "number of wires");
+        }
 
-        constexpr auto kernel = static_lookup<GateOperation::Matrix>(
-            Constant::default_kernel_for_gates);
-        static_assert(
-            Util::array_has_elt(SelectKernel<kernel>::implemented_gates,
-                                GateOperation::Matrix),
-            "The default kernel for applyMatrix does not implement it.");
-        applyMatrix_<kernel>(matrix, wires, inverse);
+        applyMatrix(matrix.data(), wires, inverse);
     }
 
     /**
diff --git a/pennylane_lightning/src/simulator/StateVectorCPU.hpp b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
index 89ff8d40a4..bb08bf30e1 100644
--- a/pennylane_lightning/src/simulator/StateVectorCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
@@ -32,16 +32,19 @@ class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
   public:
     using ComplexPrecisionT = std::complex<PrecisionT>;
 
+  protected:
+    const Threading threading_;
+    const CPUMemoryModel memory_model_;
+
   private:
     using BaseType = StateVectorBase<PrecisionT, Derived>;
 
-    Threading threading_;
-    CPUMemoryModel memory_model_;
-
     std::unordered_map<Gates::GateOperation, Gates::KernelType>
         kernel_for_gates_;
     std::unordered_map<Gates::GeneratorOperation, Gates::KernelType>
         kernel_for_generators_;
+    std::unordered_map<Gates::MatrixOperation, Gates::KernelType>
+        kernel_for_matrices_;
 
     void setKernels(size_t num_qubits, Threading threading,
                     CPUMemoryModel memory_model) {
@@ -50,6 +53,8 @@ class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
             num_qubits, threading, memory_model);
         kernel_for_generators_ = default_kernels.getGeneratorKernelMap(
             num_qubits, threading, memory_model);
+        kernel_for_matrices_ = default_kernels.getMatrixKernelMap(
+            num_qubits, threading, memory_model);
     }
 
   protected:
@@ -71,6 +76,14 @@ class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
         -> Gates::KernelType {
         return kernel_for_generators_.at(gntr_op);
     }
-};
 
+    [[nodiscard]] inline auto
+    getKernelForMatrix(Gates::MatrixOperation mat_op) const
+        -> Gates::KernelType {
+        return kernel_for_matrices_.at(mat_op);
+    }
+
+    inline CPUMemoryModel memoryModel() const { return memory_model_; }
+    inline Threading threading() const { return threading_; }
+};
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
index b36aac7f6d..599cb9b91b 100644
--- a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
@@ -11,6 +11,7 @@
 #pragma once
 
 #include "BitUtil.hpp"
+#include "CPUMemoryModel.hpp"
 #include "DispatchKeys.hpp"
 #include "Gates.hpp"
 #include "KernelType.hpp"
@@ -49,9 +50,7 @@ class StateVectorManagedCPU
         : BaseType{num_qubits, threading, memory_model} {
 
         size_t length = BaseType::getLength();
-        // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
-        data_ = std::unique_ptr<ComplexPrecisionT[]>{
-            new (std::align_val_t{64}) ComplexPrecisionT[length]};
+        data_ = allocateMemory<ComplexPrecisionT>(memory_model, length);
         std::fill(data_.get(), data_.get() + length,
                   ComplexPrecisionT{0.0, 0.0});
         data_[0] = {1, 0};
@@ -59,19 +58,14 @@ class StateVectorManagedCPU
 
     template <class OtherDerived>
     explicit StateVectorManagedCPU(
-        const StateVectorBase<PrecisionT, OtherDerived> &other,
-        Threading threading = bestThreading(),
-        CPUMemoryModel memory_model = bestCPUMemoryModel())
-        : BaseType(other.getNumQubits(), threading, memory_model) {
+        const StateVectorCPU<PrecisionT, OtherDerived> &other)
+        : BaseType(other.getNumQubits(), other.threading(),
+                   other.memoryModel()) {
 
         size_t length = BaseType::getLength();
-        // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
-        data_ = std::unique_ptr<ComplexPrecisionT[]>{
-            new (std::align_val_t{64}) ComplexPrecisionT[length]};
+        data_ = allocateMemory<ComplexPrecisionT>(other.memoryModel(), length);
 
         std::copy(other.getData(), other.getData() + length, data_.get());
-
-        setKernels(BaseType::getNumQubits(), threading, memory_model);
     }
 
     StateVectorManagedCPU(const ComplexPrecisionT *other_data,
@@ -83,9 +77,7 @@ class StateVectorManagedCPU
         PL_ABORT_IF_NOT(Util::isPerfectPowerOf2(other_size),
                         "The size of provided data must be a power of 2.");
 
-        // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
-        data_ = std::unique_ptr<ComplexPrecisionT[]>{
-            new (std::align_val_t{64}) ComplexPrecisionT[other_size]};
+        data_ = allocateMemory<ComplexPrecisionT>(memory_model, other_size);
         updateData(other_data);
     }
 
@@ -101,9 +93,7 @@ class StateVectorManagedCPU
 
     StateVectorManagedCPU(const StateVectorManagedCPU &rhs) : BaseType(rhs) {
         size_t length = BaseType::getLength();
-        // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
-        data_ = std::unique_ptr<ComplexPrecisionT[]>{
-            new (std::align_val_t{64}) ComplexPrecisionT[length]};
+        data_ = allocateMemory<ComplexPrecisionT>(rhs.memory_model_, length);
         std::copy(rhs.getData(), rhs.getData() + length, data_.get());
     }
 
@@ -130,5 +120,4 @@ class StateVectorManagedCPU
         std::copy(data, data + BaseType::getLength(), data_.get());
     }
 };
-
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp b/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp
index aadc5426d0..663c76e51c 100644
--- a/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp
+++ b/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp
@@ -30,3 +30,74 @@ TEST_CASE("Test default kernels for generators are well defined",
             }
         });
 }
+
+TEST_CASE("Test unallowed kernel", "[Test_DefaultKernelsForStateVector]") {
+    using Gates::GateOperation;
+    using Gates::GeneratorOperation;
+    using Gates::KernelType;
+    auto &instance = DefaultKernelsForStateVector::getInstance();
+    REQUIRE_THROWS(instance.assignKernelForGate(
+        GateOperation::PauliX, Threading::SingleThread,
+        CPUMemoryModel::Unaligned, 0, Util::full_domain<size_t>(),
+        KernelType::None));
+}
+
+TEST_CASE("Test few limiting cases of default kernels",
+          "[Test_DefaultKernelsForStateVector]") {
+    auto &instance = DefaultKernelsForStateVector::getInstance();
+    SECTION("Single thread, large number of qubits") {
+        // For large N, a single thread uses "LM" for MultiRZ and all single-
+        // and two-qubit gates. For three-qubit gates, we use "PI".
+        auto gate_map = instance.getGateKernelMap(24, Threading::SingleThread,
+                                                  CPUMemoryModel::Unaligned);
+        Util::for_each_enum<Gates::GateOperation>(
+            [&gate_map](Gates::GateOperation gate_op) {
+                INFO(Util::lookup(Gates::Constant::gate_names, gate_op));
+                if (gate_op == Gates::GateOperation::MultiRZ) {
+                    REQUIRE(gate_map[gate_op] == Gates::KernelType::LM);
+                } else if (Util::lookup(Gates::Constant::gate_wires, gate_op) !=
+                           3) {
+                    REQUIRE(gate_map[gate_op] == Gates::KernelType::LM);
+                } else {
+                    REQUIRE(gate_map[gate_op] == Gates::KernelType::PI);
+                }
+            });
+    }
+    SECTION("Single thread, N = 14") {
+        // For N = 14, IsingXX with "PI" is slightly faster
+        auto gate_map = instance.getGateKernelMap(14, Threading::SingleThread,
+                                                  CPUMemoryModel::Unaligned);
+        REQUIRE(gate_map[Gates::GateOperation::IsingXX] ==
+                Gates::KernelType::PI);
+    }
+}
+
+TEST_CASE("Test priority works", "[Test_DefaultKernelsForStateVector]") {
+    using Gates::GateOperation;
+    using Gates::GeneratorOperation;
+    using Gates::KernelType;
+    auto &instance = DefaultKernelsForStateVector::getInstance();
+    SECTION("Test assignKernelForGate") {
+        auto original_kernel = instance.getGateKernelMap(
+            24, Threading::SingleThread,
+            CPUMemoryModel::Unaligned)[GateOperation::PauliX];
+
+        instance.assignKernelForGate(
+            GateOperation::PauliX, Threading::SingleThread,
+            CPUMemoryModel::Unaligned, 100, Util::full_domain<size_t>(),
+            KernelType::PI);
+
+        REQUIRE(instance.getGateKernelMap(
+                    24, Threading::SingleThread,
+                    CPUMemoryModel::Unaligned)[GateOperation::PauliX] ==
+                KernelType::PI);
+
+        instance.removeKernelForGenerator(GateOperation::PauliX,
+                                          Threading::SingleThread,
+                                          CPUMemoryModel::Unaligned, 100);
+        REQUIRE(instance.getGateKernelMap(
+                    24, Threading::SingleThread,
+                    CPUMemoryModel::Unaligned)[GateOperation::PauliX] ==
+                original_kernel);
+    }
+}
diff --git a/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp b/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
index 0146f99c35..3511a12da9 100644
--- a/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
+++ b/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
@@ -81,12 +81,10 @@ constexpr void testAllGatesForKernelIter(RandomEngine &re,
     if constexpr (idx < static_cast<int>(GateOperation::END)) {
         constexpr auto gate_op = static_cast<GateOperation>(idx);
 
-        if constexpr (gate_op != GateOperation::Matrix) { // ignore Matrix
-            for (size_t num_qubits = 3; num_qubits <= max_num_qubits;
-                 num_qubits++) {
-                testDispatchForKernel<PrecisionT, ParamT, GateImplementation>::
-                    template test<gate_op>(re, num_qubits);
-            }
+        for (size_t num_qubits = 3; num_qubits <= max_num_qubits;
+             num_qubits++) {
+            testDispatchForKernel<PrecisionT, ParamT, GateImplementation>::
+                template test<gate_op>(re, num_qubits);
         }
 
         testAllGatesForKernelIter<PrecisionT, ParamT, GateImplementation,
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
index aea72009e9..eb65520c7b 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
@@ -113,41 +113,36 @@ void testApplyGate(RandomEngine &re, size_t num_qubits) {
     INFO("PrecisionT, ParamT = " << PrecisionToName<PrecisionT>::value << ", "
                                  << PrecisionToName<ParamT>::value);
 
-    if constexpr (gate_op != GateOperation::Matrix) {
-        const auto all_wires = crateAllWires(num_qubits, gate_op, true);
-        for (const auto &wires : all_wires) {
-            const auto params = createParams<ParamT>(gate_op);
-            const auto gate_name = lookup(Constant::gate_names, gate_op);
-            DYNAMIC_SECTION(
-                "Test gate "
-                << gate_name
-                << " with inverse = false") { // Test with inverse = false
-                const auto results = Util::tuple_to_array(
-                    applyGateForImplemetingKernels<gate_op, PrecisionT, ParamT,
-                                                   Kernels>(
-                        ini, num_qubits, wires, false, params,
-                        std::make_index_sequence<length<Kernels>()>()));
-
-                for (size_t i = 0; i < results.size() - 1; i++) {
-                    REQUIRE(results[i] ==
-                            PLApprox(results[i + 1]).margin(1e-7));
-                }
+    const auto all_wires = crateAllWires(num_qubits, gate_op, true);
+    for (const auto &wires : all_wires) {
+        const auto params = createParams<ParamT>(gate_op);
+        const auto gate_name = lookup(Constant::gate_names, gate_op);
+        DYNAMIC_SECTION(
+            "Test gate "
+            << gate_name
+            << " with inverse = false") { // Test with inverse = false
+            const auto results = Util::tuple_to_array(
+                applyGateForImplemetingKernels<gate_op, PrecisionT, ParamT,
+                                               Kernels>(
+                    ini, num_qubits, wires, false, params,
+                    std::make_index_sequence<length<Kernels>()>()));
+
+            for (size_t i = 0; i < results.size() - 1; i++) {
+                REQUIRE(results[i] == PLApprox(results[i + 1]).margin(1e-7));
             }
+        }
 
-            DYNAMIC_SECTION(
-                "Test gate "
-                << gate_name
-                << " with inverse = true") { // Test with inverse = true
-                const auto results = Util::tuple_to_array(
-                    applyGateForImplemetingKernels<gate_op, PrecisionT, ParamT,
-                                                   Kernels>(
-                        ini, num_qubits, wires, true, params,
-                        std::make_index_sequence<length<Kernels>()>()));
-
-                for (size_t i = 0; i < results.size() - 1; i++) {
-                    REQUIRE(results[i] ==
-                            PLApprox(results[i + 1]).margin(1e-7));
-                }
+        DYNAMIC_SECTION("Test gate "
+                        << gate_name
+                        << " with inverse = true") { // Test with inverse = true
+            const auto results = Util::tuple_to_array(
+                applyGateForImplemetingKernels<gate_op, PrecisionT, ParamT,
+                                               Kernels>(
+                    ini, num_qubits, wires, true, params,
+                    std::make_index_sequence<length<Kernels>()>()));
+
+            for (size_t i = 0; i < results.size() - 1; i++) {
+                REQUIRE(results[i] == PLApprox(results[i + 1]).margin(1e-7));
             }
         }
     }
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
index 19ffb8535b..4869678201 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
@@ -26,29 +26,24 @@ using namespace Pennylane::Gates;
 template <typename PrecisionT, typename ParamT, class GateImplementation,
           GateOperation gate_op, class RandomEngine>
 void testInverseKernelGate(RandomEngine &re, size_t num_qubits) {
-    if constexpr (gate_op != GateOperation::Matrix) {
-        constexpr auto gate_name = static_lookup<gate_op>(Constant::gate_names);
-        DYNAMIC_SECTION("Test inverse of " << gate_name << " for kernel "
-                                           << GateImplementation::name) {
-            const auto ini_st = createRandomState<PrecisionT>(re, num_qubits);
+    constexpr auto gate_name = static_lookup<gate_op>(Constant::gate_names);
+    DYNAMIC_SECTION("Test inverse of " << gate_name << " for kernel "
+                                       << GateImplementation::name) {
+        const auto ini_st = createRandomState<PrecisionT>(re, num_qubits);
 
-            auto st = ini_st;
+        auto st = ini_st;
 
-            const auto func_ptr =
-                GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplementation,
-                                      gate_op>::value;
+        const auto func_ptr =
+            GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplementation,
+                                  gate_op>::value;
 
-            const auto wires = createWires(gate_op, num_qubits);
-            const auto params = createParams<ParamT>(gate_op);
+        const auto wires = createWires(gate_op, num_qubits);
+        const auto params = createParams<ParamT>(gate_op);
 
-            callGateOps(func_ptr, st.data(), num_qubits, wires, false, params);
-            callGateOps(func_ptr, st.data(), num_qubits, wires, true, params);
+        callGateOps(func_ptr, st.data(), num_qubits, wires, false, params);
+        callGateOps(func_ptr, st.data(), num_qubits, wires, true, params);
 
-            REQUIRE(st == PLApprox(ini_st).margin(1e-7));
-        }
-    } else {
-        static_cast<void>(re);
-        static_cast<void>(num_qubits);
+        REQUIRE(st == PLApprox(ini_st).margin(1e-7));
     }
 }
 
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp
index dfda96073f..72eba17f63 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp
@@ -13,20 +13,46 @@ using ApplyMatrixType = void (*)(std::complex<PrecisionT> *, size_t,
                                  const std::vector<size_t> &, bool);
 
 template <typename PrecisionT, class GateImplementation, class U = void>
-struct IsApplyMatrixDefined {
+struct IsApplySingleQubitOpDefined {
     constexpr static bool value = false;
 };
 template <typename PrecisionT, class GateImplementation>
-struct IsApplyMatrixDefined<
+struct IsApplySingleQubitOpDefined<
     PrecisionT, GateImplementation,
     std::enable_if_t<
         std::is_pointer_v<decltype(static_cast<ApplyMatrixType<PrecisionT>>(
-            &GateImplementation::template applyMatrix<PrecisionT>))>>> {
+            &GateImplementation::template applySingleQubitOp<PrecisionT>))>>> {
     constexpr static bool value = true;
 };
 
+template <typename PrecisionT, class GateImplementation, class U = void>
+struct IsApplyTwoQubitOpDefined {
+    constexpr static bool value = false;
+};
 template <typename PrecisionT, class GateImplementation>
-void testApplyMatrix() {
+struct IsApplyTwoQubitOpDefined<
+    PrecisionT, GateImplementation,
+    std::enable_if_t<
+        std::is_pointer_v<decltype(static_cast<ApplyMatrixType<PrecisionT>>(
+            &GateImplementation::template applyTwoQubitOp<PrecisionT>))>>> {
+    constexpr static bool value = true;
+};
+
+template <typename PrecisionT, class GateImplementation, class U = void>
+struct IsApplyMultiQubitOpDefined {
+    constexpr static bool value = false;
+};
+template <typename PrecisionT, class GateImplementation>
+struct IsApplyMultiQubitOpDefined<
+    PrecisionT, GateImplementation,
+    std::enable_if_t<
+        std::is_pointer_v<decltype(static_cast<ApplyMatrixType<PrecisionT>>(
+            &GateImplementation::template applyMultiQubitOp<PrecisionT>))>>> {
+    constexpr static bool value = true;
+};
+
+template <typename PrecisionT, class GateImplementation>
+void testApplySingleQubitOp() {
     using ComplexPrecisionT = std::complex<PrecisionT>;
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -80,8 +106,8 @@ void testApplyMatrix() {
         };
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
+        GateImplementation::applySingleQubitOp(st.data(), num_qubits,
+                                               matrix.data(), wires, false);
         REQUIRE(st == PLApprox(expected).margin(1e-5));
     }
 
@@ -136,8 +162,8 @@ void testApplyMatrix() {
         };
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
+        GateImplementation::applySingleQubitOp(st.data(), num_qubits,
+                                               matrix.data(), wires, false);
         REQUIRE(st == PLApprox(expected).margin(1e-5));
     }
 
@@ -192,10 +218,15 @@ void testApplyMatrix() {
         };
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
+        GateImplementation::applySingleQubitOp(st.data(), num_qubits,
+                                               matrix.data(), wires, false);
         REQUIRE(st == PLApprox(expected).margin(1e-5));
     }
+}
+
+template <typename PrecisionT, class GateImplementation>
+void testApplyTwoQubitOp() {
+    using ComplexPrecisionT = std::complex<PrecisionT>;
 
     DYNAMIC_SECTION(GateImplementation::name
                     << ", Matrix0,1 - " << PrecisionToName<PrecisionT>::value) {
@@ -260,8 +291,8 @@ void testApplyMatrix() {
         };
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
+        GateImplementation::applyTwoQubitOp(st.data(), num_qubits,
+                                            matrix.data(), wires, false);
         REQUIRE(st == PLApprox(expected).margin(1e-5));
     }
 
@@ -328,10 +359,15 @@ void testApplyMatrix() {
         };
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
+        GateImplementation::applyTwoQubitOp(st.data(), num_qubits,
+                                            matrix.data(), wires, false);
         REQUIRE(st == PLApprox(expected).margin(1e-5));
     }
+}
+
+template <typename PrecisionT, class GateImplementation>
+void testApplyMultiQubitOp() {
+    using ComplexPrecisionT = std::complex<PrecisionT>;
 
     DYNAMIC_SECTION(GateImplementation::name
                     << ", Matrix1,2,3 - "
@@ -445,8 +481,8 @@ void testApplyMatrix() {
         };
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
+        GateImplementation::applyMultiQubitOp(st.data(), num_qubits,
+                                              matrix.data(), wires, false);
         REQUIRE(st == PLApprox(expected).margin(1e-5));
     }
 
@@ -754,25 +790,57 @@ void testApplyMatrix() {
         };
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
+        GateImplementation::applyMultiQubitOp(st.data(), num_qubits,
+                                              matrix.data(), wires, false);
         REQUIRE(st == PLApprox(expected).margin(1e-5));
     }
 }
 
 template <typename PrecisionT, typename TypeList>
-void testApplyMatrixForKernels() {
+void testApplySingleQubitOpForKernels() {
+    if constexpr (!std::is_same_v<TypeList, void>) {
+        using GateImplementation = typename TypeList::Type;
+
+        if constexpr (IsApplySingleQubitOpDefined<PrecisionT,
+                                                  GateImplementation>::value) {
+            testApplySingleQubitOp<PrecisionT, GateImplementation>();
+        } else {
+            SUCCEED(
+                "Member function applySingleQubitOp is not defined in kernel"
+                << GateImplementation::name);
+        }
+        testApplySingleQubitOpForKernels<PrecisionT, typename TypeList::Next>();
+    }
+}
+
+template <typename PrecisionT, typename TypeList>
+void testApplyTwoQubitOpForKernels() {
     if constexpr (!std::is_same_v<TypeList, void>) {
         using GateImplementation = typename TypeList::Type;
 
-        if constexpr (IsApplyMatrixDefined<PrecisionT,
-                                           GateImplementation>::value) {
-            testApplyMatrix<PrecisionT, GateImplementation>();
+        if constexpr (IsApplyTwoQubitOpDefined<PrecisionT,
+                                               GateImplementation>::value) {
+            testApplyTwoQubitOp<PrecisionT, GateImplementation>();
         } else {
-            SUCCEED("Member function applyMatrix is not defined in kernel"
+            SUCCEED("Member function applyTwoQubitOp is not defined in kernel"
                     << GateImplementation::name);
         }
-        testApplyMatrixForKernels<PrecisionT, typename TypeList::Next>();
+        testApplyTwoQubitOpForKernels<PrecisionT, typename TypeList::Next>();
+    }
+}
+template <typename PrecisionT, typename TypeList>
+void testApplyMultiQubitOpForKernels() {
+    if constexpr (!std::is_same_v<TypeList, void>) {
+        using GateImplementation = typename TypeList::Type;
+
+        if constexpr (IsApplyMultiQubitOpDefined<PrecisionT,
+                                                 GateImplementation>::value) {
+            testApplyMultiQubitOp<PrecisionT, GateImplementation>();
+        } else {
+            SUCCEED("Member function applyMultiQubitOp is not defined in kernel"
+                    << GateImplementation::name);
+        }
+        testApplyMultiQubitOpForKernels<PrecisionT, typename TypeList::Next>();
     }
 }
 
@@ -780,11 +848,13 @@ TEMPLATE_TEST_CASE("GateImplementation::applyMatrix, inverse = false",
                    "[GateImplementations_Matrix]", float, double) {
     using PrecisionT = TestType;
 
-    testApplyMatrixForKernels<PrecisionT, AvailableKernels>();
+    testApplySingleQubitOpForKernels<PrecisionT, AvailableKernels>();
+    testApplyTwoQubitOpForKernels<PrecisionT, AvailableKernels>();
+    testApplyMultiQubitOpForKernels<PrecisionT, AvailableKernels>();
 }
 
 template <typename PrecisionT, class GateImplementation>
-void testApplyMatrixInverse() {
+void testApplySingleQubitOpInverse() {
     std::mt19937 re{1337};
     const int num_qubits = 4;
 
@@ -798,10 +868,10 @@ void testApplyMatrixInverse() {
         const auto matrix = randomUnitary<PrecisionT>(re, wires.size());
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, true);
+        GateImplementation::applySingleQubitOp(st.data(), num_qubits,
+                                               matrix.data(), wires, false);
+        GateImplementation::applySingleQubitOp(st.data(), num_qubits,
+                                               matrix.data(), wires, true);
         REQUIRE(st == PLApprox(ini_st).margin(1e-5));
     }
 
@@ -815,10 +885,10 @@ void testApplyMatrixInverse() {
         const auto matrix = randomUnitary<PrecisionT>(re, wires.size());
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, true);
+        GateImplementation::applySingleQubitOp(st.data(), num_qubits,
+                                               matrix.data(), wires, false);
+        GateImplementation::applySingleQubitOp(st.data(), num_qubits,
+                                               matrix.data(), wires, true);
 
         REQUIRE(st == PLApprox(ini_st).margin(1e-5));
     }
@@ -833,10 +903,10 @@ void testApplyMatrixInverse() {
         const auto matrix = randomUnitary<PrecisionT>(re, wires.size());
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, true);
+        GateImplementation::applySingleQubitOp(st.data(), num_qubits,
+                                               matrix.data(), wires, false);
+        GateImplementation::applySingleQubitOp(st.data(), num_qubits,
+                                               matrix.data(), wires, true);
 
         REQUIRE(st == PLApprox(ini_st).margin(1e-5));
     }
@@ -851,13 +921,19 @@ void testApplyMatrixInverse() {
         const auto matrix = randomUnitary<PrecisionT>(re, wires.size());
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, true);
+        GateImplementation::applySingleQubitOp(st.data(), num_qubits,
+                                               matrix.data(), wires, false);
+        GateImplementation::applySingleQubitOp(st.data(), num_qubits,
+                                               matrix.data(), wires, true);
 
         REQUIRE(st == PLApprox(ini_st).margin(1e-5));
     }
+}
+
+template <typename PrecisionT, class GateImplementation>
+void testApplyTwoQubitOpInverse() {
+    std::mt19937 re{1337};
+    const int num_qubits = 4;
 
     DYNAMIC_SECTION(GateImplementation::name
                     << ", wires = {0,1} - "
@@ -869,10 +945,10 @@ void testApplyMatrixInverse() {
         const auto matrix = randomUnitary<PrecisionT>(re, wires.size());
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, true);
+        GateImplementation::applyTwoQubitOp(st.data(), num_qubits,
+                                            matrix.data(), wires, false);
+        GateImplementation::applyTwoQubitOp(st.data(), num_qubits,
+                                            matrix.data(), wires, true);
 
         REQUIRE(st == PLApprox(ini_st).margin(1e-5));
     }
@@ -885,10 +961,10 @@ void testApplyMatrixInverse() {
         const auto matrix = randomUnitary<PrecisionT>(re, wires.size());
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, true);
+        GateImplementation::applyTwoQubitOp(st.data(), num_qubits,
+                                            matrix.data(), wires, false);
+        GateImplementation::applyTwoQubitOp(st.data(), num_qubits,
+                                            matrix.data(), wires, true);
 
         REQUIRE(st == PLApprox(ini_st).margin(1e-5));
     }
@@ -901,13 +977,19 @@ void testApplyMatrixInverse() {
         const auto matrix = randomUnitary<PrecisionT>(re, wires.size());
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, true);
+        GateImplementation::applyTwoQubitOp(st.data(), num_qubits,
+                                            matrix.data(), wires, false);
+        GateImplementation::applyTwoQubitOp(st.data(), num_qubits,
+                                            matrix.data(), wires, true);
 
         REQUIRE(st == PLApprox(ini_st).margin(1e-5));
     }
+}
+
+template <typename PrecisionT, class GateImplementation>
+void testApplyMultiQubitOpInverse() {
+    std::mt19937 re{1337};
+    const int num_qubits = 4;
 
     DYNAMIC_SECTION(GateImplementation::name
                     << ", wires = {1,2,3} - "
@@ -918,10 +1000,10 @@ void testApplyMatrixInverse() {
         const auto matrix = randomUnitary<PrecisionT>(re, wires.size());
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, true);
+        GateImplementation::applyMultiQubitOp(st.data(), num_qubits,
+                                              matrix.data(), wires, false);
+        GateImplementation::applyMultiQubitOp(st.data(), num_qubits,
+                                              matrix.data(), wires, true);
 
         REQUIRE(st == PLApprox(ini_st).margin(1e-5));
     }
@@ -934,27 +1016,61 @@ void testApplyMatrixInverse() {
         const auto matrix = randomUnitary<PrecisionT>(re, wires.size());
 
         auto st = ini_st;
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, false);
-        GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
-                                        wires, true);
+        GateImplementation::applyMultiQubitOp(st.data(), num_qubits,
+                                              matrix.data(), wires, false);
+        GateImplementation::applyMultiQubitOp(st.data(), num_qubits,
+                                              matrix.data(), wires, true);
 
         REQUIRE(st == PLApprox(ini_st).margin(1e-5));
     }
 }
 
 template <typename PrecisionT, typename TypeList>
-void testApplyMatrixInverseForKernels() {
+void testApplySingleQubitOpInverseForKernels() {
+    if constexpr (!std::is_same_v<TypeList, void>) {
+        using GateImplementation = typename TypeList::Type;
+        if constexpr (IsApplySingleQubitOpDefined<PrecisionT,
+                                                  GateImplementation>::value) {
+            testApplySingleQubitOpInverse<PrecisionT, GateImplementation>();
+        } else {
+            SUCCEED(
+                "Member function applySingleQubitOp is not defined in kernel"
+                << GateImplementation::name);
+        }
+        testApplySingleQubitOpInverseForKernels<PrecisionT,
+                                                typename TypeList::Next>();
+    }
+}
+
+template <typename PrecisionT, typename TypeList>
+void testApplyTwoQubitOpInverseForKernels() {
+    if constexpr (!std::is_same_v<TypeList, void>) {
+        using GateImplementation = typename TypeList::Type;
+        if constexpr (IsApplyTwoQubitOpDefined<PrecisionT,
+                                               GateImplementation>::value) {
+            testApplyTwoQubitOpInverse<PrecisionT, GateImplementation>();
+        } else {
+            SUCCEED("Member function applyTwoQubitOp is not defined in kernel"
+                    << GateImplementation::name);
+        }
+        testApplyTwoQubitOpInverseForKernels<PrecisionT,
+                                             typename TypeList::Next>();
+    }
+}
+
+template <typename PrecisionT, typename TypeList>
+void testApplyMultiQubitOpInverseForKernels() {
     if constexpr (!std::is_same_v<TypeList, void>) {
         using GateImplementation = typename TypeList::Type;
-        if constexpr (IsApplyMatrixDefined<PrecisionT,
-                                           GateImplementation>::value) {
-            testApplyMatrixInverse<PrecisionT, GateImplementation>();
+        if constexpr (IsApplyMultiQubitOpDefined<PrecisionT,
+                                                 GateImplementation>::value) {
+            testApplyMultiQubitOpInverse<PrecisionT, GateImplementation>();
         } else {
-            SUCCEED("Member function applyMatrix is not defined in kernel"
+            SUCCEED("Member function applyMultiQubitOp is not defined in kernel"
                     << GateImplementation::name);
         }
-        testApplyMatrixInverseForKernels<PrecisionT, typename TypeList::Next>();
+        testApplyMultiQubitOpInverseForKernels<PrecisionT,
+                                               typename TypeList::Next>();
     }
 }
 
@@ -962,5 +1078,7 @@ TEMPLATE_TEST_CASE("GateImplementation::applyMatrix, inverse = true",
                    "[GateImplementations_Matrix]", float, double) {
     using PrecisionT = TestType;
 
-    testApplyMatrixInverseForKernels<PrecisionT, AvailableKernels>();
+    testApplySingleQubitOpInverseForKernels<PrecisionT, AvailableKernels>();
+    testApplyTwoQubitOpInverseForKernels<PrecisionT, AvailableKernels>();
+    testApplyMultiQubitOpInverseForKernels<PrecisionT, AvailableKernels>();
 }
diff --git a/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp b/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
index a46a7387f6..6f3f5bdd4f 100644
--- a/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
+++ b/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
@@ -23,11 +23,9 @@ template <class PrecisionT, class ParamT, class GateImplemenation,
 constexpr bool testAllGatesImplementedIter() {
     if constexpr (gate_idx < static_cast<uint32_t>(GateOperation::END)) {
         constexpr auto gate_op = static_cast<GateOperation>(gate_idx);
-        if constexpr (gate_op != GateOperation::Matrix) {
-            static_cast<void>(
-                GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplemenation,
-                                      gate_op>::value);
-        }
+        static_cast<void>(
+            GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplemenation,
+                                  gate_op>::value);
         return testAllGatesImplementedIter<PrecisionT, ParamT,
                                            GateImplemenation, gate_idx + 1>();
     } else {
@@ -153,8 +151,7 @@ static_assert(testAllGatesImplemeted<float, float, DummyImplementation>(),
 
 struct ImplementedGates {
     constexpr static auto value = DummyImplementation::implemented_gates;
-    constexpr static std::array<GateOperation, 1> ignore_list = {
-        GateOperation::Matrix};
+    constexpr static std::array<GateOperation, 0> ignore_list = {};
 
     template <typename PrecisionT, typename ParamT, GateOperation op>
     constexpr static auto func_ptr =
diff --git a/pennylane_lightning/src/util/ConstantUtil.hpp b/pennylane_lightning/src/util/ConstantUtil.hpp
index 532b49ee01..aec36602e1 100644
--- a/pennylane_lightning/src/util/ConstantUtil.hpp
+++ b/pennylane_lightning/src/util/ConstantUtil.hpp
@@ -207,4 +207,11 @@ constexpr auto reverse_pairs(const std::array<std::pair<T, U>, size> &arr)
     return Internal::reverse_pairs_helper(arr,
                                           std::make_index_sequence<size>{});
 }
+
+constexpr auto constIsPerfectPowerOf2(size_t value) -> bool {
+    // A power of two has exactly one set bit, so clearing its lowest set bit
+    // with value & (value - 1) leaves zero. Zero has no set bit and is
+    // rejected explicitly; a shift-until-odd loop would never end on it.
+    return (value != 0U) && ((value & (value - 1U)) == 0U);
+}
 } // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/IntegerInterval.hpp b/pennylane_lightning/src/util/IntegerInterval.hpp
new file mode 100644
index 0000000000..24f14959b9
--- /dev/null
+++ b/pennylane_lightning/src/util/IntegerInterval.hpp
@@ -0,0 +1,94 @@
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/** @file */
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <type_traits>
+
+namespace Pennylane::Util {
+
+/**
+ * @brief Half-open interval [min_, max_) over an unsigned integer type.
+ */
+template <typename IntegerType> class IntegerInterval {
+  private:
+    static_assert(std::is_integral_v<IntegerType> &&
+                  std::is_unsigned_v<IntegerType>);
+
+    IntegerType min_; // inclusive lower bound
+    IntegerType max_; // exclusive upper bound
+
+  public:
+    /// Build [min, max); an empty or reversed range trips the assertion.
+    constexpr IntegerInterval(IntegerType min, IntegerType max)
+        : min_{min}, max_{max} {
+        assert(min < max);
+    }
+    /// Membership test: true iff min_ <= test_val < max_.
+    [[nodiscard]] constexpr bool operator()(IntegerType test_val) const {
+        return (min_ <= test_val) && (test_val < max_);
+    }
+    [[nodiscard]] constexpr IntegerType min() const { return min_; }
+    [[nodiscard]] constexpr IntegerType max() const { return max_; }
+};
+
+template <typename IntegerType>
+auto larger_than(IntegerType from) -> IntegerInterval<IntegerType> {
+    return IntegerInterval<IntegerType>{
+        from + 1, std::numeric_limits<IntegerType>::max()};
+}
+template <typename IntegerType>
+auto larger_than_equal_to(IntegerType from) -> IntegerInterval<IntegerType> {
+    return IntegerInterval<IntegerType>{
+        from, std::numeric_limits<IntegerType>::max()};
+}
+template <typename IntegerType>
+auto less_than(IntegerType to) -> IntegerInterval<IntegerType> {
+    return IntegerInterval<IntegerType>{0, to};
+}
+template <typename IntegerType>
+auto less_than_equal_to(IntegerType to) -> IntegerInterval<IntegerType> {
+    return IntegerInterval<IntegerType>{0, to + 1};
+}
+template <typename IntegerType>
+auto in_between_closed(IntegerType from, IntegerType to)
+    -> IntegerInterval<IntegerType> {
+    return IntegerInterval<IntegerType>{from, to + 1};
+}
+template <typename IntegerType>
+auto full_domain() -> IntegerInterval<IntegerType> {
+    return IntegerInterval<IntegerType>{
+        0, std::numeric_limits<IntegerType>::max()};
+}
+
+template <typename IntegerType>
+bool is_disjoint(const IntegerInterval<IntegerType> &interval1,
+                 const IntegerInterval<IntegerType> &interval2) {
+    return (interval1.max() <= interval2.min()) ||
+           (interval2.max() <= interval1.min());
+}
+
+template <typename IntegerType>
+auto union_interval(const IntegerInterval<IntegerType> &interval1,
+                    const IntegerInterval<IntegerType> &interval2)
+    -> IntegerInterval<IntegerType> {
+    return IntegerInterval<IntegerType>{
+        std::min(interval1.min(), interval2.min()),
+        std::max(interval1.max(), interval2.max())};
+}
+
+} // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
index 6a6df1f1ce..223d977c0a 100644
--- a/pennylane_lightning/src/util/Memory.hpp
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -16,19 +16,12 @@
 #include <memory>
 #include <new>
 
+#include "ConstantUtil.hpp"
 #include "TypeList.hpp"
 
 namespace Pennylane {
-
-constexpr auto constIsPerfectPowerOf2(size_t value) -> bool {
-    while ((value & 1U) == 0) {
-        value >>= 1U;
-    }
-    return value == 1;
-}
-
 template <class T, uint32_t alignment> struct AlignedAllocator {
-    static_assert(constIsPerfectPowerOf2(alignment),
+    static_assert(Util::constIsPerfectPowerOf2(alignment),
                   "Template parameter alignment must be power of 2.");
     using value_type = T;
 
@@ -101,4 +94,5 @@ template <typename TypeList>
 template <class T, uint32_t alignment>
 using PLAllocator = std::conditional_t<alignment == 4, std::allocator<T>,
                                        AlignedAllocator<T, alignment>>;
+
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/util/Util.hpp b/pennylane_lightning/src/util/Util.hpp
index ca029c0609..2da023e2ea 100644
--- a/pennylane_lightning/src/util/Util.hpp
+++ b/pennylane_lightning/src/util/Util.hpp
@@ -404,6 +404,16 @@ auto chunkData(const Container<T> &data, std::size_t num_chunks)
 // type alias
 template <class T> using remove_cvref_t = typename remove_cvref<T>::type;
 
+/**
+ * @brief Hash for std::pair. The mix is order-sensitive; a plain XOR of
+ * the two hashes would be symmetric and cancel equal hashes to 0. */
+struct PairHash {
+    template <typename T, typename U>
+    size_t operator()(const std::pair<T, U> &p) const {
+        return std::hash<T>()(p.first) * 31U + std::hash<U>()(p.second);
+    }
+};
+
 /**
  * @brief Iterate over all enum values (if BEGIN and END are defined).
  *
diff --git a/tests/test_adjoint_jacobian.py b/tests/test_adjoint_jacobian.py
index 8d3d31d62c..4872212506 100644
--- a/tests/test_adjoint_jacobian.py
+++ b/tests/test_adjoint_jacobian.py
@@ -168,16 +168,16 @@ def test_unsupported_hermitian_expectation(self, dev):
     )
     @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
     def test_unsupported_complex_type(self, dev):
-        dev._state = dev._asarray(dev._state, np.complex256)
+        with pytest.raises(TypeError, match="Unsupported .*"):
+            dev._state = dev._asarray(dev._state, np.complex256)
 
-        with qml.tape.JacobianTape() as tape:
-            qml.QubitStateVector(np.array([1.0, -1.0]) / np.sqrt(2), wires=0)
-            qml.RX(0.3, wires=[0])
-            qml.expval(qml.PauliZ(0))
+            with qml.tape.JacobianTape() as tape:
+                qml.QubitStateVector(np.array([1.0, -1.0]) / np.sqrt(2), wires=0)
+                qml.RX(0.3, wires=[0])
+                qml.expval(qml.PauliZ(0))
 
-        tape.trainable_params = {1}
+            tape.trainable_params = {1}
 
-        with pytest.raises(TypeError, match="Unsupported complex Type: complex256"):
             dev.adjoint_jacobian(tape)
 
     @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7))
diff --git a/tests/test_apply.py b/tests/test_apply.py
index d3ab603077..ee3af3144c 100644
--- a/tests/test_apply.py
+++ b/tests/test_apply.py
@@ -643,44 +643,6 @@ def test_load_default_qubit_device(self):
         assert dev.shots is None
         assert dev.short_name == "lightning.qubit"
 
-    @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
-    def test_load_default_qubit_device_with_valid_kernel(self):
-        """Test that lightning.qubit works with valid kernel_for_ops argument."""
-        for gate in ["PauliX", "CRot", "CSWAP", "Matrix"]:
-            dev = qml.device("lightning.qubit", kernel_for_ops={gate: "PI"}, wires=2)
-
-            assert dev.num_wires == 2
-            assert dev.shots is None
-            assert dev.short_name == "lightning.qubit"
-
-    @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
-    def test_load_default_qubit_device_with_invalid_kernel(self):
-        """Test that lightning.qubit raises error for unsupported gate/kernel pair."""
-
-        for gate in ["PauliX", "CRot", "CSWAP", "Matrix"]:
-            with pytest.raises(
-                ValueError, match=f"The given kernel Unknown does not implement {gate} gate."
-            ):
-                dev = qml.device("lightning.qubit", kernel_for_ops={gate: "Unknown"}, wires=2)
-
-    @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
-    def test_load_default_qubit_device_with_invalid_param(self):
-        """Test that lightning.qubit does not support kernel_for_ops type list."""
-        with pytest.raises(ValueError, match=f"Argument kernel_for_ops must be a dictionary."):
-            dev = qml.device("lightning.qubit", kernel_for_ops=["I am a list"], wires=2)
-
-    @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
-    def test_all_exported_gates_are_available(self):
-        """Test all exported gates from lightning_qubit_ops are accessible"""
-        from pennylane_lightning import lightning_qubit_ops
-        from pennylane_lightning.lightning_qubit_ops import StateVectorC128 as SV
-
-        for kernel, gate_op in lightning_qubit_ops.EXPORTED_KERNEL_OPS:
-            if gate_op != "Matrix":
-                assert getattr(SV, f"{gate_op}_{kernel}", None) is not None
-            else:
-                assert getattr(SV, f"applyMatrix_{kernel}", None) is not None
-
     def test_no_backprop(self):
         """Test that lightning.qubit does not support the backprop
         differentiation method."""
diff --git a/tests/test_array.py b/tests/test_array.py
new file mode 100644
index 0000000000..b154ac2a4d
--- /dev/null
+++ b/tests/test_array.py
@@ -0,0 +1,34 @@
+# Copyright 2022 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Unit tests for ``allocate_aligned_array`` from :mod:`pennylane_lightning.lightning_qubit_ops`.
+"""
+import numpy as np
+import pennylane as qml
+import pytest
+from pennylane import DeviceError
+
+from pennylane_lightning.lightning_qubit import CPP_BINARY_AVAILABLE
+
+try:
+    from pennylane_lightning.lightning_qubit_ops import allocate_aligned_array
+except (ImportError, ModuleNotFoundError):
+    pytest.skip("No binary module found. Skipping.", allow_module_level=True)
+
+
+@pytest.mark.skipif(not CPP_BINARY_AVAILABLE, reason="Lightning binary required")
+@pytest.mark.parametrize("dt", [np.dtype(np.complex64), np.dtype(np.complex128)])
+def test_allocate_aligned_array(dt):
+    arr = allocate_aligned_array(1024, dt)
+    assert arr.dtype == dt
diff --git a/tests/test_serialize.py b/tests/test_serialize.py
index f62795f4d5..2cff781428 100644
--- a/tests/test_serialize.py
+++ b/tests/test_serialize.py
@@ -22,7 +22,6 @@
     _serialize_obs,
     _serialize_ops,
     _obs_has_kernel,
-    _is_lightning_gate,
 )
 import pytest
 from unittest import mock
@@ -36,45 +35,6 @@
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
 
-class TestIsLightningGate:
-    """Tests for the _is_lightning_gate"""
-
-    def test_gates(self):
-        """Test if returns true for some gates"""
-        for gate in [
-            "PauliX",
-            "PauliY",
-            "PauliZ",
-            "Hadamard",
-            "S",
-            "T",
-            "PhaseShift",
-            "RX",
-            "RY",
-            "RZ",
-            "Rot",
-            "CNOT",
-            "CY",
-            "CZ",
-            "SWAP",
-            "ControlledPhaseShift",
-            "CRX",
-            "CRY",
-            "CRZ",
-            "CRot",
-            "Toffoli",
-            "CSWAP",
-        ]:
-            assert _is_lightning_gate(gate)
-
-    def test_matrix(self):
-        assert not _is_lightning_gate("Matrix")
-
-    def test_non_gates(self):
-        for gate in ["Quantum", "computing", "in", "2022", "with", "Pennylane", "Lightning"]:
-            assert not _is_lightning_gate(gate)
-
-
 class TestObsHasKernel:
     """Tests for the _obs_has_kernel function"""
 
diff --git a/tests/test_vjp.py b/tests/test_vjp.py
index 3ab352a6d9..2aa97c6faa 100644
--- a/tests/test_vjp.py
+++ b/tests/test_vjp.py
@@ -39,12 +39,14 @@ def dev(self):
         not hasattr(np, "complex256"), reason="Numpy only defines complex256 in Linux-like system"
     )
     def test_unsupported_complex_type(self, dev):
-        dev._state = dev._asarray(dev._state, np.complex256)
+        with pytest.raises(TypeError, match="Unsupported .*"):
+            dev._state = dev._asarray(dev._state, np.complex256)
 
-        dy = np.array([[1.0, 2.0], [3.0, 4.0]])
-        jac = np.array([[[1.0, 0.1, 0.2], [0.2, 0.6, 0.1]], [[0.4, -0.7, 1.2], [-0.5, -0.6, 0.7]]])
+            dy = np.array([[1.0, 2.0], [3.0, 4.0]])
+            jac = np.array(
+                [[[1.0, 0.1, 0.2], [0.2, 0.6, 0.1]], [[0.4, -0.7, 1.2], [-0.5, -0.6, 0.7]]]
+            )
 
-        with pytest.raises(TypeError, match="Unsupported complex Type: complex256"):
             dev.compute_vjp(dy, jac)
 
     @pytest.mark.parametrize("C", [np.complex64, np.complex128])
@@ -120,21 +122,21 @@ def dev(self):
         not hasattr(np, "complex256"), reason="Numpy only defines complex256 in Linux-like system"
     )
     def test_unsupported_complex_type(self, dev):
-        dev._state = dev._asarray(dev._state, np.complex256)
+        with pytest.raises(TypeError, match="Unsupported .*"):
+            dev._state = dev._asarray(dev._state, np.complex256)
 
-        x, y, z = [0.5, 0.3, -0.7]
+            x, y, z = [0.5, 0.3, -0.7]
 
-        with qml.tape.JacobianTape() as tape:
-            qml.RX(0.4, wires=[0])
-            qml.Rot(x, y, z, wires=[0])
-            qml.RY(-0.2, wires=[0])
-            qml.expval(qml.PauliZ(0))
+            with qml.tape.JacobianTape() as tape:
+                qml.RX(0.4, wires=[0])
+                qml.Rot(x, y, z, wires=[0])
+                qml.RY(-0.2, wires=[0])
+                qml.expval(qml.PauliZ(0))
 
-        tape.trainable_params = {1, 2, 3}
+            tape.trainable_params = {1, 2, 3}
 
-        dy = np.array([1.0])
+            dy = np.array([1.0])
 
-        with pytest.raises(TypeError, match="Unsupported complex Type: complex256"):
             dev.vjp(tape, dy)(tape)
 
     @pytest.mark.parametrize("C", [np.complex64, np.complex128])
@@ -468,26 +470,26 @@ def dev(self):
         not hasattr(np, "complex256"), reason="Numpy only defines complex256 in Linux-like system"
     )
     def test_unsupported_complex_type(self, dev):
-        dev._state = dev._asarray(dev._state, np.complex256)
+        with pytest.raises(TypeError, match="Unsupported .*"):
+            dev._state = dev._asarray(dev._state, np.complex256)
 
-        with qml.tape.QuantumTape() as tape1:
-            qml.RX(0.4, wires=0)
-            qml.CNOT(wires=[0, 1])
-            qml.expval(qml.PauliZ(0))
+            with qml.tape.QuantumTape() as tape1:
+                qml.RX(0.4, wires=0)
+                qml.CNOT(wires=[0, 1])
+                qml.expval(qml.PauliZ(0))
 
-        with qml.tape.JacobianTape() as tape2:
-            qml.RX(0.4, wires=0)
-            qml.RX(0.6, wires=0)
-            qml.CNOT(wires=[0, 1])
-            qml.expval(qml.PauliZ(0))
+            with qml.tape.JacobianTape() as tape2:
+                qml.RX(0.4, wires=0)
+                qml.RX(0.6, wires=0)
+                qml.CNOT(wires=[0, 1])
+                qml.expval(qml.PauliZ(0))
 
-        tape1.trainable_params = {0}
-        tape2.trainable_params = {0, 1}
+            tape1.trainable_params = {0}
+            tape2.trainable_params = {0, 1}
 
-        tapes = [tape1, tape2]
-        dys = [np.array([1.0]), np.array([1.0])]
+            tapes = [tape1, tape2]
+            dys = [np.array([1.0]), np.array([1.0])]
 
-        with pytest.raises(TypeError, match="Unsupported complex Type: complex256"):
             dev.batch_vjp(tapes, dys)
 
     @pytest.mark.parametrize("C", [np.complex64, np.complex128])

From 12299f625246de099f8d966113168e63cec61de1 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Tue, 1 Mar 2022 20:30:11 -0500
Subject: [PATCH 04/94] Fix examples

---
 .../src/examples/CMakeLists.txt               |  27 ++-
 .../src/examples/benchmark_gate.cpp           | 204 ++++++++++++++++
 .../src/examples/benchmark_gate_list.cpp      | 223 -----------------
 .../src/examples/benchmark_generator.cpp      | 202 ++++++++++++++++
 .../src/examples/benchmark_matrix.cpp         | 144 +++++++++++
 .../src/examples/benchmark_multi_rz.cpp       |  77 ------
 .../src/examples/benchmark_operation.cpp      | 210 ----------------
 .../src/examples/gate_benchmark_oplist.cpp    | 227 ------------------
 ...ot_gate_benchmark.py => plot_benchmark.py} |   0
 .../src/examples/run_benchmark.sh             |  39 +++
 .../src/examples/run_gate_benchmark.sh        |  55 -----
 pennylane_lightning/src/examples/test.sh      |   1 +
 .../cpu_kernels/GateImplementationsLM.hpp     |  60 +++--
 .../cpu_kernels/GateImplementationsPI.hpp     |   8 +-
 .../src/simulator/DynamicDispatcher.hpp       |  10 -
 .../src/simulator/StateVectorBase.hpp         | 107 ++++++++-
 pennylane_lightning/src/tests/TestHelpers.hpp |  66 +----
 .../src/tests/Test_AdjDiff.cpp                |  14 +-
 .../tests/Test_GateImplementations_Matrix.cpp |   1 +
 .../src/tests/Test_Internal.cpp               |  32 ---
 pennylane_lightning/src/tests/Test_Util.cpp   |  32 +++
 .../src/util/LinearAlgebra.hpp                |  67 ++++++
 22 files changed, 866 insertions(+), 940 deletions(-)
 create mode 100644 pennylane_lightning/src/examples/benchmark_gate.cpp
 delete mode 100644 pennylane_lightning/src/examples/benchmark_gate_list.cpp
 create mode 100644 pennylane_lightning/src/examples/benchmark_generator.cpp
 create mode 100644 pennylane_lightning/src/examples/benchmark_matrix.cpp
 delete mode 100644 pennylane_lightning/src/examples/benchmark_multi_rz.cpp
 delete mode 100644 pennylane_lightning/src/examples/benchmark_operation.cpp
 delete mode 100644 pennylane_lightning/src/examples/gate_benchmark_oplist.cpp
 rename pennylane_lightning/src/examples/{plot_gate_benchmark.py => plot_benchmark.py} (100%)
 create mode 100755 pennylane_lightning/src/examples/run_benchmark.sh
 delete mode 100755 pennylane_lightning/src/examples/run_gate_benchmark.sh
 create mode 100644 pennylane_lightning/src/examples/test.sh

diff --git a/pennylane_lightning/src/examples/CMakeLists.txt b/pennylane_lightning/src/examples/CMakeLists.txt
index d58bcce5ba..addb786ad6 100644
--- a/pennylane_lightning/src/examples/CMakeLists.txt
+++ b/pennylane_lightning/src/examples/CMakeLists.txt
@@ -21,25 +21,28 @@ target_link_libraries(lightning_examples INTERFACE lightning_compile_options
                                                    lightning_simulator
                                                    lightning_utils)
 
-add_executable(benchmark_operation benchmark_operation.cpp)
-target_link_libraries(benchmark_operation PRIVATE lightning_examples)
+add_executable(benchmark_gate benchmark_gate.cpp)
+target_link_libraries(benchmark_gate PRIVATE lightning_examples)
 
-add_executable(benchmark_operation_float benchmark_operation.cpp)
-target_compile_options(benchmark_operation_float PRIVATE "-DUSE_SINGLE_PRECISION")
-target_link_libraries(benchmark_operation_float PRIVATE lightning_examples)
+add_executable(benchmark_generator benchmark_generator.cpp)
+target_link_libraries(benchmark_generator PRIVATE lightning_examples)
 
-add_executable(benchmark_multi_rz benchmark_multi_rz.cpp)
-target_link_libraries(benchmark_multi_rz PRIVATE lightning_examples)
+add_executable(benchmark_matrix benchmark_matrix.cpp)
+target_link_libraries(benchmark_matrix PRIVATE lightning_examples)
+
+# add_executable(benchmark_operation_float benchmark_operation.cpp)
+# target_compile_options(benchmark_operation_float PRIVATE "-DUSE_SINGLE_PRECISION")
+# target_link_libraries(benchmark_operation_float PRIVATE lightning_examples)
 
 configure_file("compiler_info.in" "compiler_info.txt")
 
-add_custom_command(TARGET benchmark_operation POST_BUILD 
+add_custom_command(TARGET benchmark_gate POST_BUILD 
                    COMMAND ${CMAKE_COMMAND} -E copy
-                           ${PROJECT_SOURCE_DIR}/run_gate_benchmark.sh
-                           ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/run_gate_benchmark.sh
+                           ${PROJECT_SOURCE_DIR}/run_benchmark.sh
+                           ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/run_benchmark.sh
                    COMMAND ${CMAKE_COMMAND} -E create_symlink
-                           ${PROJECT_SOURCE_DIR}/plot_gate_benchmark.py
-                           ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/plot_gate_benchmark.py
+                           ${PROJECT_SOURCE_DIR}/plot_benchmark.py
+                           ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/plot_benchmark.py
                    COMMAND ${CMAKE_COMMAND} -E rename
                            ${CMAKE_CURRENT_BINARY_DIR}/compiler_info.txt
                            ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/compiler_info.txt)
diff --git a/pennylane_lightning/src/examples/benchmark_gate.cpp b/pennylane_lightning/src/examples/benchmark_gate.cpp
new file mode 100644
index 0000000000..00545b1988
--- /dev/null
+++ b/pennylane_lightning/src/examples/benchmark_gate.cpp
@@ -0,0 +1,204 @@
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <iostream>
+#include <map>
+#include <random>
+#include <stdexcept>
+#include <string>
+
+#include "Constant.hpp"
+#include "ExampleUtil.hpp"
+#include "StateVectorManagedCPU.hpp"
+
+#ifdef USE_SINGLE_PRECISION
+using PrecisionT = float;
+#pragma message "Using single precision"
+#else
+using PrecisionT = double;
+#endif
+
+using namespace Pennylane;
+using namespace Pennylane::Gates;
+using namespace Pennylane::Util;
+
+struct GateDesc {
+    std::string name;
+    std::vector<size_t> wires;
+    bool inverse;
+    std::vector<PrecisionT> params;
+
+    template <typename Arg0, typename Arg1, typename Arg2, typename Arg3>
+    GateDesc(Arg0 &&arg0, Arg1 &&arg1, Arg2 &&arg2, Arg3 &&arg3)
+        : name{std::forward<Arg0>(arg0)}, wires{std::forward<Arg1>(arg1)},
+          inverse{std::forward<Arg2>(arg2)}, params{std::forward<Arg3>(arg3)} {}
+};
+
+// Streams "name, wires,inverse,params" plus a newline; takes a const ref.
+std::ostream &operator<<(std::ostream &os, const GateDesc &desc) {
+    return os << desc.name << ", " << desc.wires << "," << desc.inverse
+              << "," << desc.params << std::endl;
+}
+
+// Build num_reps random applications of gate_name: each gets a random inverse
+// flag, wires from generateNeighboringWires and angles drawn in [0, 2*pi).
+template <class RandomEngine>
+auto generateGateSequence(RandomEngine &re, const std::string &gate_name,
+                          const size_t num_reps, const size_t num_qubits,
+                          const size_t num_wires_for_multi_qubit)
+    -> std::vector<GateDesc> {
+    using Gates::Constant::multi_qubit_gates;
+
+    const GateOperation gate_op = Util::lookup(
+        Util::reverse_pairs(Constant::gate_names), std::string_view(gate_name));
+    const size_t num_wires = [=]() {
+        if (Util::array_has_elt(multi_qubit_gates, gate_op)) {
+            // Listed in multi_qubit_gates: use the caller-supplied wire count.
+            return num_wires_for_multi_qubit;
+        }
+        return Util::lookup(Constant::gate_wires, gate_op);
+    }();
+    const size_t num_params = Util::lookup(Constant::gate_num_params, gate_op);
+
+    std::vector<GateDesc> gate_seq;
+    gate_seq.reserve(num_reps); // final size is known up front
+    std::uniform_int_distribution<size_t> inverse_dist(0, 1);
+    std::uniform_real_distribution<PrecisionT> param_dist(0.0, 2 * M_PI);
+
+    for (size_t k = 0; k < num_reps; k++) { // same width as num_reps
+        std::vector<PrecisionT> params;
+        params.reserve(num_params);
+        bool inverse = static_cast<bool>(inverse_dist(re));
+        auto wires = generateNeighboringWires(re, num_qubits, num_wires);
+        for (size_t idx = 0; idx < num_params; idx++) {
+            params.emplace_back(param_dist(re));
+        }
+        gate_seq.emplace_back(gate_name, std::move(wires), inverse,
+                              std::move(params));
+    }
+    return gate_seq;
+}
+
+double benchmarkGate(KernelType kernel, const size_t num_qubits,
+                     const std::vector<GateDesc> &gate_seq) {
+    // Time the application of every gate in gate_seq, in order, to a newly
+    // constructed state vector; the result is wall time in milliseconds.
+    using Clock = std::chrono::high_resolution_clock;
+    StateVectorManagedCPU<PrecisionT> state{num_qubits};
+
+    const auto begin = Clock::now();
+    for (const auto &desc : gate_seq) {
+        state.applyOperation(kernel, desc.name, desc.wires, desc.inverse,
+                             desc.params);
+    }
+    const auto finish = Clock::now();
+
+    return std::chrono::duration<double, std::milli>(finish - begin).count();
+}
+
+template <typename RandomEngine>
+double runBenchmarkGate(RandomEngine &re, KernelType kernel,
+                        const std::string &gate_name, size_t num_reps,
+                        size_t num_qubits, size_t num_wires_for_multi_qubit) {
+    const auto sequence = generateGateSequence(
+        re, gate_name, num_reps, num_qubits, num_wires_for_multi_qubit);
+
+    // Dump the sequence to stderr when the LOG environment variable parses
+    // to a non-zero integer; any exception raised here just disables the dump.
+    try {
+        const char *log_flag = std::getenv("LOG");
+        if (log_flag != nullptr && std::stoi(log_flag) != 0) {
+            for (const auto &desc : sequence) {
+                std::cerr << desc.name << ", " << desc.wires << ","
+                          << desc.inverse << "," << desc.params << std::endl;
+            }
+        }
+    } catch (const std::exception &) {
+    }
+
+    return benchmarkGate(kernel, num_qubits, sequence);
+}
+
+/**
+ * @brief Benchmark Pennylane-Lightning for a given gate
+ *
+ * @param argc Number of arguments
+ * @param argv Command line arguments
+ * @return 0 on success; -1 or 1 when the command line is rejected
+ */
+int main(int argc, char *argv[]) {
+    namespace Constant = Gates::Constant;
+    // Handle input: four mandatory arguments plus an optional wire count
+    if (argc != 5 && argc != 6) { // NOLINT(readability-magic-numbers)
+        std::cerr
+            << "Wrong number of inputs. User provided " << argc - 1
+            << " inputs. \n"
+            << "Usage: " + std::string(argv[0]) +
+                   " num_reps num_qubits kernel gate [num_wires]\n"
+                   "Examples: \n"
+            << "\t" << argv[0] << " 1000 10 PI PauliX\n"
+            << "\t" << argv[0] << " 1000 10 LM CRX\n"
+            << "\t" << argv[0] << " 1000 10 LM MultiRZ 3\n";
+        return -1;
+    }
+
+    size_t num_reps;
+    size_t num_qubits;
+
+    try {
+        num_reps = std::stoi(argv[1]);
+        num_qubits = std::stoi(argv[2]);
+    } catch (const std::exception &) {
+        std::cerr << "Arguments num_reps and num_qubits must be integers."
+                  << std::endl;
+        return -1;
+    }
+
+    std::string_view kernel_name = argv[3];
+    KernelType kernel = string_to_kernel(kernel_name);
+    if (kernel == KernelType::None) {
+        std::cerr << "Kernel " << kernel_name << " is unknown." << std::endl;
+        return 1;
+    }
+
+    std::string_view gate_name = argv[4];
+    if (!Util::array_has_elt(Util::second_elts_of(Constant::gate_names),
+                             gate_name)) {
+        std::cerr << "Unknown gate name " << gate_name << " is provided"
+                  << std::endl;
+        return 1;
+    }
+
+    Gates::GateOperation gate_op =
+        Util::lookup(Util::reverse_pairs(Constant::gate_names), gate_name);
+
+    size_t num_wires_for_multi_qubit = 0;
+    if (Util::array_has_elt(Constant::multi_qubit_gates, gate_op)) {
+        // User provided a multi-qubit gate; the wire count is then mandatory
+        if (argc != 6) {
+            std::cerr << "One should provide the number of wires when using "
+                         "multi qubit gates."
+                      << std::endl;
+            return 1;
+        }
+
+        try {
+            num_wires_for_multi_qubit = std::stoi(argv[5]);
+        } catch (const std::exception &) {
+            std::cerr << "Number of wires must be an integer" << std::endl;
+            return 1;
+        }
+    }
+
+    std::random_device rd;
+    std::mt19937 re(rd());
+
+    double walltime =
+        runBenchmarkGate(re, kernel, std::string(gate_name), num_reps,
+                         num_qubits, num_wires_for_multi_qubit);
+
+    // Output walltime in csv format (Num Qubits, Time (milliseconds))
+    std::cout << num_qubits << ", " << walltime / static_cast<double>(num_reps)
+              << std::endl;
+    return 0;
+}
diff --git a/pennylane_lightning/src/examples/benchmark_gate_list.cpp b/pennylane_lightning/src/examples/benchmark_gate_list.cpp
deleted file mode 100644
index 5910ad0884..0000000000
--- a/pennylane_lightning/src/examples/benchmark_gate_list.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-#include <algorithm>
-#include <chrono>
-#include <cstdlib>
-#include <iostream>
-#include <map>
-#include <random>
-#include <stdexcept>
-#include <string>
-
-#include "Constant.hpp"
-#include "ExampleUtil.hpp"
-#include "StateVectorManaged.hpp"
-
-using namespace Pennylane;
-using namespace Pennylane::Gates;
-using namespace Pennylane::Util;
-
-std::string_view strip(std::string_view str) {
-    auto start = str.find_first_not_of(" \t");
-    auto end = str.find_last_not_of(" \t");
-    return str.substr(start, end - start + 1);
-}
-
-struct GateDesc {
-    size_t n_wires;  // number of wires the gate applies to
-    size_t n_params; // number of parameters the gate requires
-};
-
-std::vector<std::pair<std::string, GateDesc>>
-parseGateLists(std::string_view arg) {
-    namespace Constant = Gates::Constant;
-    std::map<std::string, GateDesc> available_gates_wires;
-
-    for (const auto &[gate_op, gate_name] : Constant::gate_names) {
-        if (!array_has_elt(Constant::multi_qubit_gates, gate_op)) {
-            // We do not support multi qubit gates yet
-            size_t n_wires = Util::lookup(Constant::gate_wires, gate_op);
-            size_t n_params = Util::lookup(Constant::gate_num_params, gate_op);
-            available_gates_wires.emplace(gate_name,
-                                          GateDesc{n_wires, n_params});
-        }
-    }
-
-    if (arg.empty()) {
-        return {};
-    }
-
-    std::vector<std::pair<std::string, GateDesc>> ops;
-
-    if (auto pos = arg.find_first_of('['); pos != std::string_view::npos) {
-        // arg is a list "[...]"
-        auto start = pos + 1;
-        auto end = arg.find_last_of(']');
-        if (end == std::string_view::npos) {
-            throw std::invalid_argument(
-                "Argument must contain operators within square brackets [].");
-        }
-        arg = arg.substr(start, end - start);
-    }
-
-    size_t start;
-    size_t end = 0;
-    while ((start = arg.find_first_not_of(',', end)) != std::string::npos) {
-        end = arg.find(',', start);
-        auto op_name = strip(arg.substr(start, end - start));
-
-        auto iter = available_gates_wires.find(std::string(op_name));
-
-        if (iter == available_gates_wires.end()) {
-            std::ostringstream ss;
-            ss << "Given gate " << op_name
-               << " is not availabe"; // TODO: Change to std::format in C++20
-            throw std::invalid_argument(ss.str());
-        }
-        ops.emplace_back(*iter);
-    }
-    return ops;
-}
-
-/**
- * @brief Benchmark Pennylane-Lightning for a given gate set
- *
- * Example usage:
- *
- *     $ gate_benchmark_oplist 10 22 # Benchmark using 10 random gates (sampled
- * evenly from all possible gates) for 22 qubits
- *     $ gate_benchmark_oplist 100 20 [PauliX, CNOT] # Benchmark using 100
- * random gates (where each gate is PauliX or CNOT) for 20 qubits
- *
- * The whole supported gates are PauliX, PauliY, PauliZ, Hadamard, S, T, RX, RY,
- * RZ, Rot, PhaseShift, CNOT, SWAP, ControlledPhaseShift, CRX, CRY, CRZ, CRot,
- * Toffoli and CSWAP.
- *
- * @param argc Number of arguments
- * @param argv Command line arguments
- * @return Returns 0 is completed successfully
- */
-int main(int argc, char *argv[]) {
-    using TestType = double;
-
-    // Handle input
-    if (argc < 4) {
-        std::cerr << "Wrong number of inputs. User provided " << argc - 1
-                  << " inputs. "
-                  << "Usage: " + std::string(argv[0]) +
-                         " num_gate_reps num_qubits kernel [gate_lists]\n"
-                         "\tExample: "
-                  << argv[0] << " 1000 10 PI [PauliX, CNOT]"
-                  << std::endl; // Change to std::format in C++20
-        return -1;
-    }
-
-    size_t num_gate_reps;
-    size_t num_qubits;
-
-    try {
-        num_gate_reps = std::stoi(argv[1]);
-        num_qubits = std::stoi(argv[2]);
-    } catch (std::exception &e) {
-        std::cerr << "Arguments num_gate_reps and num_qubits must be integers."
-                  << std::endl;
-        return -1;
-    }
-
-    std::string_view kernel_name = argv[3];
-    KernelType kernel = string_to_kernel(kernel_name);
-    if (kernel == KernelType::None) {
-        std::cerr << "Kernel " << kernel_name << " is unknown." << std::endl;
-        return 1;
-    }
-
-    // Gate list is provided
-    std::string op_list_s;
-    {
-        std::ostringstream ss;
-        for (int idx = 4; idx < argc; idx++) {
-            ss << argv[idx] << " ";
-        }
-        op_list_s = ss.str();
-    }
-
-    std::vector<std::pair<std::string, GateDesc>> op_list;
-    try {
-        op_list = parseGateLists(op_list_s);
-    } catch (std::exception &e) {
-        std::cerr << e.what() << std::endl;
-        return 1;
-    }
-
-    if (op_list.empty()) {
-        std::cerr << "Please provide a gate list." << std::endl;
-        return 1;
-    }
-
-    // Generate random gate sequences
-    std::random_device rd;
-    std::mt19937 re(rd());
-
-    std::vector<std::string_view> random_gate_names;
-    std::vector<std::vector<size_t>> random_gate_wires;
-    std::vector<bool> random_inverses;
-    std::vector<std::vector<TestType>> random_gate_parameters;
-
-    std::uniform_int_distribution<size_t> gate_dist(0, op_list.size() - 1);
-    std::uniform_int_distribution<size_t> inverse_dist(0, 1);
-    std::uniform_real_distribution<TestType> param_dist(0.0, 2 * M_PI);
-    std::uniform_int_distribution<size_t> wire_dist(0, num_qubits - 1);
-
-    auto gen_param = [&param_dist, &re]() { return param_dist(re); };
-
-    for (uint32_t k = 0; k < num_gate_reps; k++) {
-        const auto &[op_name, gate_desc] = op_list[gate_dist(re)];
-
-        std::vector<TestType> gate_params(gate_desc.n_params, 0.0);
-        std::generate(gate_params.begin(), gate_params.end(), gen_param);
-
-        random_gate_names.emplace_back(op_name);
-        random_inverses.emplace_back(static_cast<bool>(inverse_dist(re)));
-        // random_gate_wires.emplace_back(generateDistinctWires(re, num_qubits,
-        // gate_desc.n_wires));
-        random_gate_wires.emplace_back(
-            generateNeighboringWires(re, num_qubits, gate_desc.n_wires));
-        random_gate_parameters.emplace_back(std::move(gate_params));
-    }
-
-    // Log generated sequence if LOG is turned on
-    const char *env_p = std::getenv("LOG");
-    try {
-        if (env_p != nullptr && std::stoi(env_p) != 0) {
-            for (size_t gate_rep = 0; gate_rep < num_gate_reps; gate_rep++) {
-                std::cerr << random_gate_names[gate_rep] << ", "
-                          << random_gate_wires[gate_rep] << ", "
-                          << random_gate_parameters[gate_rep] << std::endl;
-            }
-        }
-    } catch (std::exception &e) {
-        // Just do not print log
-    }
-
-    // Run benchmark. Total num_gate_reps number of gates is used.
-    Pennylane::StateVectorManaged<TestType> svdat{num_qubits};
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_start;
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_end;
-    t_start = std::chrono::high_resolution_clock::now();
-
-    for (size_t gate_rep = 0; gate_rep < num_gate_reps; gate_rep++) {
-        svdat.applyOperation(kernel, std::string(random_gate_names[gate_rep]),
-                             random_gate_wires[gate_rep],
-                             random_inverses[gate_rep],
-                             random_gate_parameters[gate_rep]);
-    }
-
-    t_end = std::chrono::high_resolution_clock::now();
-
-    // Output walltime in csv format (Num Qubits, Time (milliseconds))
-    const auto walltime =
-        0.001 * ((std::chrono::duration_cast<std::chrono::microseconds>(
-                      t_end - t_start))
-                     .count());
-    std::cout << num_qubits << ", "
-              << walltime / static_cast<double>(num_gate_reps) << std::endl;
-    return 0;
-}
diff --git a/pennylane_lightning/src/examples/benchmark_generator.cpp b/pennylane_lightning/src/examples/benchmark_generator.cpp
new file mode 100644
index 0000000000..0753b57e6a
--- /dev/null
+++ b/pennylane_lightning/src/examples/benchmark_generator.cpp
@@ -0,0 +1,202 @@
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <iostream>
+#include <map>
+#include <random>
+#include <stdexcept>
+#include <string>
+
+#include "Constant.hpp"
+#include "DynamicDispatcher.hpp"
+#include "ExampleUtil.hpp"
+#include "StateVectorManagedCPU.hpp"
+
+#ifdef USE_SINGLE_PRECISION
+using PrecisionT = float;
+#pragma message "Using single precision"
+#else
+using PrecisionT = double;
+#endif
+
+using namespace Pennylane;
+using namespace Pennylane::Gates;
+using namespace Pennylane::Util;
+
+auto generatorOp(const std::string_view &name) -> Gates::GeneratorOperation {
+    auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance(); // dispatcher for PrecisionT
+    return dispatcher.strToGeneratorOp(std::string(name)); // name -> enum; main() treats a throw as "unknown generator"
+}
+
+struct GeneratorDesc {
+    std::string name; // generator name handed to applyGenerator
+    std::vector<size_t> wires; // wires the generator is applied to
+    bool inverse; // inverse flag handed to applyGenerator
+
+    template <typename Arg0, typename Arg1, typename Arg2>
+    GeneratorDesc(Arg0 &&arg0, Arg1 &&arg1, Arg2 &&arg2) // perfect-forwards (name, wires, inverse)
+        : name{std::forward<Arg0>(arg0)}, wires{std::forward<Arg1>(arg1)},
+          inverse{std::forward<Arg2>(arg2)} {}
+};
+
+std::ostream &operator<<(std::ostream &os, const GeneratorDesc &desc) {
+    os << desc.name << ", " << desc.wires << "," << desc.inverse << std::endl;
+    return os; // writes one "name, wires,inverse" line; const& so const descriptors can be streamed too
+}
+
+template <class RandomEngine>
+auto generateGeneratorSequence(RandomEngine &re,
+                               const GeneratorOperation gntr_op,
+                               const size_t num_reps, const size_t num_qubits,
+                               const size_t num_wires_for_multi_qubit)
+    -> std::vector<GeneratorDesc> {
+    namespace Constant = Gates::Constant;
+    using Gates::GeneratorOperation;
+
+    // Drop the first 9 characters (presumably the "Generator" prefix).
+    const auto gntr_name =
+        Util::lookup(Constant::generator_names, gntr_op).substr(9);
+    const size_t num_wires = [=]() {
+        if (Util::array_has_elt(Constant::multi_qubit_generators, gntr_op)) {
+            // multi-qubit generator: the wire count is chosen by the caller
+            return num_wires_for_multi_qubit;
+        }
+        return Util::lookup(Constant::generator_wires, gntr_op);
+    }();
+
+    std::vector<GeneratorDesc> gntr_seq;
+    gntr_seq.reserve(num_reps); // final size is known: allocate once
+    std::uniform_int_distribution<size_t> inverse_dist(0, 1);
+
+    // size_t index: same type as num_reps, so the comparison cannot wrap.
+    for (size_t k = 0; k < num_reps; k++) {
+        bool inverse = static_cast<bool>(inverse_dist(re));
+        auto wires = generateNeighboringWires(re, num_qubits, num_wires);
+        gntr_seq.emplace_back(gntr_name, std::move(wires), inverse);
+    }
+    return gntr_seq;
+}
+
+double benchmarkGenerator(KernelType kernel, const size_t num_qubits,
+                          const std::vector<GeneratorDesc> &gntr_seq) {
+    // Time applying every entry of gntr_seq, in order, to one state vector.
+    StateVectorManagedCPU<PrecisionT> svdat{num_qubits}; // constructed before the clock starts
+
+    std::chrono::time_point<std::chrono::high_resolution_clock> t_start =
+        std::chrono::high_resolution_clock::now();
+    for (const auto &gntr : gntr_seq) {
+        [[maybe_unused]] PrecisionT scale = // returned value is not used here
+            svdat.applyGenerator(kernel, gntr.name, gntr.wires, gntr.inverse);
+    }
+    std::chrono::time_point<std::chrono::high_resolution_clock> t_end =
+        std::chrono::high_resolution_clock::now();
+
+    return std::chrono::duration<double, std::milli>(t_end - t_start).count(); // total loop time in milliseconds
+}
+
+template <typename RandomEngine>
+double runBenchmarkGenerator(RandomEngine &re, KernelType kernel,
+                             const GeneratorOperation gntr_op, size_t num_reps,
+                             size_t num_qubits,
+                             size_t num_wires_for_multi_qubit) {
+    auto gntr_seq = generateGeneratorSequence(re, gntr_op, num_reps, num_qubits,
+                                              num_wires_for_multi_qubit);
+
+    // Dump the sequence to stderr when env var LOG parses to a non-zero int
+    const char *env_p = std::getenv("LOG");
+    try {
+        if (env_p != nullptr && std::stoi(env_p) != 0) {
+            for (const auto &gntr : gntr_seq) {
+                std::cerr << gntr.name << ", " << gntr.wires << ","
+                          << gntr.inverse << std::endl;
+            }
+        }
+    } catch (std::exception &e) {
+        // e.g. std::stoi rejected the LOG value: logging is skipped on purpose
+    }
+
+    return benchmarkGenerator(kernel, num_qubits, gntr_seq); // total elapsed milliseconds
+}
+
+/**
+ * @brief Benchmark Pennylane-Lightning for a given generator
+ *
+ * @param argc Number of arguments
+ * @param argv Command line arguments
+ * @return 0 on success; -1 for a bad argument count or non-integer num_reps/num_qubits; 1 for other input errors
+ */
+int main(int argc, char *argv[]) {
+    namespace Constant = Gates::Constant;
+    // Handle input
+    if (argc != 5 && argc != 6) { // NOLINT(readability-magic-numbers)
+        std::cerr
+            << "Wrong number of inputs. User provided " << argc - 1
+            << " inputs. \n"
+            << "Usage: " + std::string(argv[0]) +
+                   " num_reps num_qubits kernel [generator|gate] [num_wires]\n"
+                   "Examples: \n"
+            << "\t" << argv[0] << " 1000 10 PI GeneratorCRX\n"
+            << "\t" << argv[0] << " 1000 10 LM CRX\n"
+            << "\t" << argv[0] << " 1000 10 LM MultiRZ 3\n";
+        return -1;
+    }
+
+    size_t num_reps;
+    size_t num_qubits;
+
+    try {
+        num_reps = std::stoi(argv[1]);
+        num_qubits = std::stoi(argv[2]);
+    } catch (const std::exception &) {
+        std::cerr << "Arguments num_reps and num_qubits must be integers."
+                  << std::endl;
+        return -1;
+    }
+
+    std::string_view kernel_name = argv[3];
+    KernelType kernel = string_to_kernel(kernel_name);
+    if (kernel == KernelType::None) {
+        std::cerr << "Kernel " << kernel_name << " is unknown." << std::endl;
+        return 1;
+    }
+
+    std::string_view gntr_name = argv[4];
+    Gates::GeneratorOperation gntr_op;
+
+    try {
+        gntr_op = generatorOp(gntr_name);
+    } catch (const std::exception &) {
+        std::cerr << "Unknown generator " + std::string(gntr_name) + " provided"
+                  << std::endl; // stderr: stdout carries only the CSV result
+        return 1;
+    }
+
+    size_t num_wires_for_multi_qubit = 0;
+    if (Util::array_has_elt(Constant::multi_qubit_generators, gntr_op)) {
+        // Multi-qubit generator: the wire count is a mandatory 5th argument
+        if (argc != 6) {
+            std::cerr << "One should provide the number of wires when using "
+                         "multi qubit generators."
+                      << std::endl;
+            return 1;
+        }
+
+        try {
+            num_wires_for_multi_qubit = std::stoi(argv[5]);
+        } catch (const std::exception &) {
+            std::cerr << "Number of wires must be an integer" << std::endl;
+            return 1;
+        }
+    }
+
+    std::random_device rd;
+    std::mt19937 re(rd());
+
+    double walltime = runBenchmarkGenerator(
+        re, kernel, gntr_op, num_reps, num_qubits, num_wires_for_multi_qubit);
+
+    // Output walltime in csv format (Num Qubits, Time (milliseconds))
+    std::cout << num_qubits << ", " << walltime / static_cast<double>(num_reps)
+              << std::endl;
+    return 0;
+}
diff --git a/pennylane_lightning/src/examples/benchmark_matrix.cpp b/pennylane_lightning/src/examples/benchmark_matrix.cpp
new file mode 100644
index 0000000000..9d297db91c
--- /dev/null
+++ b/pennylane_lightning/src/examples/benchmark_matrix.cpp
@@ -0,0 +1,144 @@
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <iostream>
+#include <map>
+#include <random>
+#include <stdexcept>
+#include <string>
+
+#include "Constant.hpp"
+#include "ExampleUtil.hpp"
+#include "LinearAlgebra.hpp"
+#include "StateVectorManagedCPU.hpp"
+
+#ifdef USE_SINGLE_PRECISION
+using PrecisionT = float;
+#pragma message "Using single precision"
+#else
+using PrecisionT = double;
+#endif
+
+using namespace Pennylane;
+using namespace Pennylane::Gates;
+using namespace Pennylane::Util;
+
+struct MatOpDesc {
+    std::vector<size_t> wires; // wires handed to applyMatrix
+    bool inverse; // inverse flag handed to applyMatrix
+    std::vector<std::complex<PrecisionT>> mat; // matrix entries; applyMatrix receives mat.data()
+
+    template <typename Arg0, typename Arg1, typename Arg2>
+    MatOpDesc(Arg0 &&arg0, Arg1 &&arg1, Arg2 &&arg2) // perfect-forwards (wires, inverse, mat)
+        : wires{std::forward<Arg0>(arg0)}, inverse{std::forward<Arg1>(arg1)},
+          mat{std::forward<Arg2>(arg2)} {}
+};
+
+template <class RandomEngine>
+auto generateMatrixSequence(RandomEngine &re, const size_t num_reps,
+                            const size_t num_qubits, const size_t num_wires)
+    -> std::vector<MatOpDesc> {
+    // Build num_reps random operations: wires, inverse flag and a matrix.
+    std::vector<MatOpDesc> matrix_seq;
+    matrix_seq.reserve(num_reps);
+    std::uniform_int_distribution<size_t> inverse_dist(0, 1);
+    for (size_t k = 0; k < num_reps; k++) { // size_t: same type as num_reps
+        bool inverse = static_cast<bool>(inverse_dist(re));
+        auto wires = generateNeighboringWires(re, num_qubits, num_wires);
+        // The matrix is produced by Util::randomUnitary for num_wires wires.
+        matrix_seq.emplace_back(std::move(wires), inverse,
+                                Util::randomUnitary<PrecisionT>(re, num_wires));
+    }
+    return matrix_seq;
+}
+
+double benchmarkMatrix(KernelType kernel, const size_t num_qubits,
+                       const std::vector<MatOpDesc> &mat_seq) {
+    // Time applying every entry of mat_seq, in order, to one state vector.
+    StateVectorManagedCPU<PrecisionT> svdat{num_qubits}; // constructed before the clock starts
+
+    std::chrono::time_point<std::chrono::high_resolution_clock> t_start =
+        std::chrono::high_resolution_clock::now();
+    for (const auto &mat_desc : mat_seq) {
+        svdat.applyMatrix(kernel, mat_desc.mat.data(), mat_desc.wires,
+                          mat_desc.inverse);
+    }
+    std::chrono::time_point<std::chrono::high_resolution_clock> t_end =
+        std::chrono::high_resolution_clock::now();
+
+    return std::chrono::duration<double, std::milli>(t_end - t_start).count(); // total loop time in milliseconds
+}
+
+template <typename RandomEngine>
+double runBenchmarkMatrix(RandomEngine &re, KernelType kernel, size_t num_reps,
+                          size_t num_qubits, size_t num_wires) {
+    auto mat_seq = generateMatrixSequence(re, num_reps, num_qubits, num_wires);
+
+    // Dump the sequence to stderr when env var LOG parses to a non-zero int
+    const char *env_p = std::getenv("LOG");
+    try {
+        if (env_p != nullptr && std::stoi(env_p) != 0) {
+            for (const auto &mat_desc : mat_seq) {
+                std::cerr << mat_desc.wires << ", " << mat_desc.inverse << ", "
+                          << mat_desc.mat << std::endl;
+            }
+        }
+    } catch (std::exception &e) {
+        // e.g. std::stoi rejected the LOG value: logging is skipped on purpose
+    }
+
+    return benchmarkMatrix(kernel, num_qubits, mat_seq); // total elapsed milliseconds
+}
+
+/**
+ * @brief Benchmark Pennylane-Lightning for random matrix operations
+ *
+ * @param argc Number of arguments
+ * @param argv Command line arguments
+ * @return 0 on success; -1 for a bad argument count or non-integer arguments; 1 for an unknown kernel
+ */
+int main(int argc, char *argv[]) {
+    namespace Constant = Gates::Constant;
+    // Handle input
+    if (argc != 5) { // NOLINT(readability-magic-numbers)
+        std::cerr << "Wrong number of inputs. User provided " << argc - 1
+                  << " inputs. \n"
+                  << "Usage: " + std::string(argv[0]) +
+                         " num_reps num_qubits kernel num_wires\n"
+                         "Examples: \n"
+                  << "\t" << argv[0] << " 1000 10 PI 4\n";
+        return -1;
+    }
+
+    size_t num_reps;
+    size_t num_qubits;
+    size_t num_wires;
+
+    try {
+        num_reps = std::stoi(argv[1]);
+        num_qubits = std::stoi(argv[2]);
+        num_wires = std::stoi(argv[4]);
+    } catch (const std::exception &) {
+        std::cerr << "Arguments num_reps, num_qubits and num_wires must be "
+                     "integers." << std::endl;
+        return -1;
+    }
+
+    std::string_view kernel_name = argv[3];
+    KernelType kernel = string_to_kernel(kernel_name);
+    if (kernel == KernelType::None) {
+        std::cerr << "Kernel " << kernel_name << " is unknown." << std::endl;
+        return 1;
+    }
+
+    std::random_device rd;
+    std::mt19937 re(rd());
+
+    double walltime =
+        runBenchmarkMatrix(re, kernel, num_reps, num_qubits, num_wires);
+
+    // Output walltime in csv format (Num Qubits, Time (milliseconds))
+    std::cout << num_qubits << ", " << walltime / static_cast<double>(num_reps)
+              << std::endl;
+    return 0;
+}
diff --git a/pennylane_lightning/src/examples/benchmark_multi_rz.cpp b/pennylane_lightning/src/examples/benchmark_multi_rz.cpp
deleted file mode 100644
index 49bac2ead2..0000000000
--- a/pennylane_lightning/src/examples/benchmark_multi_rz.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-#include "ExampleUtil.hpp"
-#include "StateVectorCPU.hpp"
-
-#include <chrono>
-#include <cstdio>
-#include <iostream>
-#include <random>
-
-using namespace Pennylane;
-using namespace Pennylane::Gates;
-
-constexpr uint32_t seed = 1337;
-
-int main(int argc, char *argv[]) {
-    using TestType = double;
-
-    if (argc != 5) { // NOLINT(readability-magic-numbers)
-        std::cout << "Usage: " << argv[0]
-                  << " num_gate_reps num_qubits num_wires kernel" << std::endl;
-        return 1;
-    }
-
-    size_t num_gate_reps;
-    size_t num_qubits;
-    size_t num_wires;
-
-    try {
-        num_gate_reps = std::stoi(argv[1]);
-        num_qubits = std::stoi(argv[2]);
-        num_wires = std::stoi(argv[3]);
-    } catch (std::exception &e) {
-        std::cerr << "Arguments must be integers." << std::endl;
-        return 1;
-    }
-
-    std::string_view kernel_name = argv[4];
-    KernelType kernel = string_to_kernel(kernel_name);
-    if (kernel == KernelType::None) {
-        std::cerr << "Kernel " << kernel_name << " is unknown." << std::endl;
-        return 1;
-    }
-
-    std::mt19937 re{seed}; // NOLINT(readability-magic-number)
-    std::uniform_real_distribution<double> param_dist(-M_PI, M_PI);
-
-    std::vector<std::vector<size_t>> wires;
-    std::vector<double> params;
-
-    wires.reserve(num_gate_reps);
-    params.reserve(num_gate_reps);
-
-    for (size_t gate_rep = 0; gate_rep < num_gate_reps; gate_rep++) {
-        wires.emplace_back(generateDistinctWires(re, num_qubits, num_wires));
-        params.emplace_back(param_dist(re));
-    }
-
-    StateVectorCPU<TestType> sv{num_qubits};
-
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_start =
-        std::chrono::high_resolution_clock::now();
-
-    for (size_t gate_rep = 0; gate_rep < num_gate_reps; gate_rep++) {
-        sv.applyOperation(kernel, "MultiRZ", wires[gate_rep], false,
-                          {params[gate_rep]});
-    }
-
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_end =
-        std::chrono::high_resolution_clock::now();
-    const auto walltime =
-        0.001 * ((std::chrono::duration_cast<std::chrono::microseconds>(
-                      t_end - t_start))
-                     .count());
-    std::cout << num_qubits << ", "
-              << walltime / static_cast<double>(num_gate_reps) << std::endl;
-
-    return 0;
-}
diff --git a/pennylane_lightning/src/examples/benchmark_operation.cpp b/pennylane_lightning/src/examples/benchmark_operation.cpp
deleted file mode 100644
index 0978a90550..0000000000
--- a/pennylane_lightning/src/examples/benchmark_operation.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-#include <algorithm>
-#include <chrono>
-#include <cstdlib>
-#include <iostream>
-#include <map>
-#include <random>
-#include <stdexcept>
-#include <string>
-
-#include "Constant.hpp"
-#include "ExampleUtil.hpp"
-#include "StateVectorCPU.hpp"
-
-#ifdef USE_SINGLE_PRECISION
-using PrecisionT = float;
-#pragma message "Using single precision"
-#else
-using PrecisionT = double;
-#endif
-
-using namespace Pennylane;
-using namespace Pennylane::Gates;
-using namespace Pennylane::Util;
-
-std::string_view strip(std::string_view str) {
-    auto start = str.find_first_not_of(" \t");
-    auto end = str.find_last_not_of(" \t");
-    return str.substr(start, end - start + 1);
-}
-
-template <class RandomEngine>
-double benchmark_gate(RandomEngine &re, KernelType kernel,
-                      const std::string &gate_name, const size_t num_reps,
-                      const size_t num_qubits) {
-    const GateOperation gate_op = Util::lookup(
-        Util::reverse_pairs(Constant::gate_names), std::string_view(gate_name));
-    const size_t num_wires = Util::lookup(Constant::gate_wires, gate_op);
-    const size_t num_params = Util::lookup(Constant::gate_num_params, gate_op);
-
-    // Generate random generator sequences
-    std::vector<std::vector<size_t>> random_wires;
-    std::vector<bool> random_inverses;
-    std::vector<std::vector<PrecisionT>> random_params;
-    random_wires.reserve(num_reps);
-    random_inverses.reserve(num_reps);
-    random_params.reserve(num_reps);
-
-    std::uniform_int_distribution<size_t> inverse_dist(0, 1);
-    std::uniform_real_distribution<PrecisionT> param_dist(0.0, 2 * M_PI);
-
-    for (uint32_t k = 0; k < num_reps; k++) {
-        std::vector<PrecisionT> gate_params;
-        gate_params.reserve(num_params);
-
-        random_inverses.emplace_back(static_cast<bool>(inverse_dist(re)));
-        random_wires.emplace_back(
-            generateNeighboringWires(re, num_qubits, num_wires));
-
-        for (size_t idx = 0; idx < num_params; idx++) {
-            gate_params.emplace_back(param_dist(re));
-        }
-        random_params.emplace_back(std::move(gate_params));
-    }
-
-    // Log generated sequence if LOG is turned on
-    const char *env_p = std::getenv("LOG");
-    try {
-        if (env_p != nullptr && std::stoi(env_p) != 0) {
-            for (size_t gate_rep = 0; gate_rep < num_reps; gate_rep++) {
-                std::cerr << gate_name << ", " << random_wires[gate_rep] << ","
-                          << random_inverses[gate_rep] << ","
-                          << random_params[gate_rep] << std::endl;
-            }
-        }
-    } catch (std::exception &e) {
-        // Just do not print log
-    }
-
-    // Run benchmark. Total num_reps number of gates is used.
-    StateVectorCPU<PrecisionT> svdat{num_qubits};
-
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_start =
-        std::chrono::high_resolution_clock::now();
-    for (size_t gate_rep = 0; gate_rep < num_reps; gate_rep++) {
-        svdat.applyOperation(kernel, gate_name, random_wires[gate_rep],
-                             random_inverses[gate_rep],
-                             random_params[gate_rep]);
-    }
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_end =
-        std::chrono::high_resolution_clock::now();
-
-    return std::chrono::duration<double, std::milli>(t_end - t_start).count();
-}
-
-template <class RandomEngine>
-double benchmark_generator(RandomEngine &re, KernelType kernel,
-                           const std::string &gntr_name, const size_t num_reps,
-                           const size_t num_qubits) {
-    const auto gntr_name_without_prefix = gntr_name.substr(9);
-    const GeneratorOperation gntr_op =
-        Util::lookup(Util::reverse_pairs(Constant::generator_names),
-                     std::string_view(gntr_name));
-    const size_t num_wires = Util::lookup(Constant::generator_wires, gntr_op);
-
-    // Generate random generator sequences
-    std::vector<std::vector<size_t>> random_wires;
-    std::vector<bool> random_inverses;
-    random_wires.reserve(num_reps);
-    random_inverses.reserve(num_reps);
-
-    std::uniform_int_distribution<size_t> inverse_dist(0, 1);
-
-    for (uint32_t k = 0; k < num_reps; k++) {
-        random_inverses.emplace_back(static_cast<bool>(inverse_dist(re)));
-        random_wires.emplace_back(
-            generateNeighboringWires(re, num_qubits, num_wires));
-    }
-
-    // Log generated sequence if LOG is turned on
-    const char *env_p = std::getenv("LOG");
-    try {
-        if (env_p != nullptr && std::stoi(env_p) != 0) {
-            for (size_t gate_rep = 0; gate_rep < num_reps; gate_rep++) {
-                std::cerr << gntr_name << ", " << random_wires[gate_rep] << ","
-                          << random_inverses[gate_rep] << std::endl;
-            }
-        }
-    } catch (std::exception &e) {
-        // Just do not print log
-    }
-
-    // Run benchmark. Total num_reps number of gates is used.
-    StateVectorCPU<PrecisionT> svdat{num_qubits};
-
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_start =
-        std::chrono::high_resolution_clock::now();
-    for (size_t gate_rep = 0; gate_rep < num_reps; gate_rep++) {
-        [[maybe_unused]] auto scale = svdat.applyGenerator(
-            kernel, gntr_name_without_prefix, random_wires[gate_rep],
-            random_inverses[gate_rep]);
-    }
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_end =
-        std::chrono::high_resolution_clock::now();
-
-    return std::chrono::duration<double, std::milli>(t_end - t_start).count();
-}
-
-/**
- * @brief Benchmark Pennylane-Lightning for a given generator
- *
- * @param argc Number of arguments
- * @param argv Command line arguments
- * @return Returns 0 is completed successfully
- */
-int main(int argc, char *argv[]) {
-    // Handle input
-    if (argc < 5) { // NOLINT(readability-magic-numbers)
-        std::cerr << "Wrong number of inputs. User provided " << argc - 1
-                  << " inputs. \n"
-                  << "Usage: " + std::string(argv[0]) +
-                         " num_reps num_qubits kernel [generator|gate]\n"
-                         "Examples: \n"
-                         "\t"
-                  << argv[0] << " 1000 10 PI GeneratorCRX\n"
-                  << "\t" << argv[0] << " 1000 10 LM CRX"
-                  << std::endl; // Change to std::format in C++20
-        return -1;
-    }
-
-    size_t num_reps;
-    size_t num_qubits;
-
-    try {
-        num_reps = std::stoi(argv[1]);
-        num_qubits = std::stoi(argv[2]);
-    } catch (std::exception &e) {
-        std::cerr << "Arguments num_reps and num_qubits must be integers."
-                  << std::endl;
-        return -1;
-    }
-
-    std::string_view kernel_name = argv[3];
-    KernelType kernel = string_to_kernel(kernel_name);
-    if (kernel == KernelType::None) {
-        std::cerr << "Kernel " << kernel_name << " is unknown." << std::endl;
-        return 1;
-    }
-
-    const std::string_view gate_or_gntr_name = argv[4];
-    const std::string_view generator_prefix = "Generator";
-
-    std::random_device rd;
-    std::mt19937 re(rd());
-
-    double walltime;
-
-    if (gate_or_gntr_name.substr(0, generator_prefix.length()) ==
-        generator_prefix) { // generators
-        walltime = benchmark_generator(
-            re, kernel, std::string(gate_or_gntr_name), num_reps, num_qubits);
-    } else {
-        walltime = benchmark_gate(re, kernel, std::string(gate_or_gntr_name),
-                                  num_reps, num_qubits);
-    }
-
-    // Output walltime in csv format (Num Qubits, Time (milliseconds))
-    std::cout << num_qubits << ", " << walltime / static_cast<double>(num_reps)
-              << std::endl;
-    return 0;
-}
diff --git a/pennylane_lightning/src/examples/gate_benchmark_oplist.cpp b/pennylane_lightning/src/examples/gate_benchmark_oplist.cpp
deleted file mode 100644
index 3ec93a272b..0000000000
--- a/pennylane_lightning/src/examples/gate_benchmark_oplist.cpp
+++ /dev/null
@@ -1,227 +0,0 @@
-#include <algorithm>
-#include <chrono>
-#include <cstdlib>
-#include <iostream>
-#include <map>
-#include <random>
-#include <stdexcept>
-#include <string>
-
-#include "Constant.hpp"
-#include "ExampleUtil.hpp"
-#include "StateVectorManaged.hpp"
-
-using namespace Pennylane;
-using namespace Pennylane::Gates;
-using namespace Pennylane::Util;
-
-std::string_view strip(std::string_view str) {
-    auto start = str.find_first_not_of(" \t");
-    auto end = str.find_last_not_of(" \t");
-    return str.substr(start, end - start + 1);
-}
-
-struct GateDesc {
-    size_t n_wires;  // number of wires the gate applies to
-    size_t n_params; // number of parameters the gate requires
-};
-
-std::vector<std::pair<std::string, GateDesc>>
-parseGateLists(std::string_view arg) {
-    namespace Constant = Gates::Constant;
-    std::map<std::string, GateDesc> available_gates_wires;
-
-    for (const auto &[gate_op, gate_name] : Constant::gate_names) {
-        if (!array_has_elt(Constant::multi_qubit_gates, gate_op)) {
-            // We do not support multi qubit gates yet
-            size_t n_wires = Util::lookup(Constant::gate_wires, gate_op);
-            size_t n_params = Util::lookup(Constant::gate_num_params, gate_op);
-            available_gates_wires.emplace(gate_name,
-                                          GateDesc{n_wires, n_params});
-        }
-    }
-
-    if (arg.empty()) {
-        /*
-        return std::vector<std::pair<std::string_view, GateDesc>>(
-            available_gates_wires.begin(), available_gates_wires.end());
-        */
-        return {};
-    }
-
-    std::vector<std::pair<std::string, GateDesc>> ops;
-
-    if (auto pos = arg.find_first_of('['); pos != std::string_view::npos) {
-        // arg is a list "[...]"
-        auto start = pos + 1;
-        auto end = arg.find_last_of(']');
-        if (end == std::string_view::npos) {
-            throw std::invalid_argument(
-                "Argument must contain operators within square brackets [].");
-        }
-        arg = arg.substr(start, end - start);
-    }
-
-    size_t start;
-    size_t end = 0;
-    while ((start = arg.find_first_not_of(',', end)) != std::string::npos) {
-        end = arg.find(',', start);
-        auto op_name = strip(arg.substr(start, end - start));
-
-        auto iter = available_gates_wires.find(std::string(op_name));
-
-        if (iter == available_gates_wires.end()) {
-            std::ostringstream ss;
-            ss << "Given gate " << op_name
-               << " is not availabe"; // TODO: Change to std::format in C++20
-            throw std::invalid_argument(ss.str());
-        }
-        ops.emplace_back(*iter);
-    }
-    return ops;
-}
-
-/**
- * @brief Benchmark Pennylane-Lightning for a given gate set
- *
- * Example usage:
- *
- *     $ gate_benchmark_oplist 10 22 # Benchmark using 10 random gates (sampled
- * evenly from all possible gates) for 22 qubits
- *     $ gate_benchmark_oplist 100 20 [PauliX, CNOT] # Benchmark using 100
- * random gates (where each gate is PauliX or CNOT) for 20 qubits
- *
- * The whole supported gates are PauliX, PauliY, PauliZ, Hadamard, S, T, RX, RY,
- * RZ, Rot, PhaseShift, CNOT, SWAP, ControlledPhaseShift, CRX, CRY, CRZ, CRot,
- * Toffoli and CSWAP.
- *
- * @param argc Number of arguments
- * @param argv Command line arguments
- * @return Returns 0 is completed successfully
- */
-int main(int argc, char *argv[]) {
-    using TestType = double;
-
-    // Handle input
-    if (argc < 4) {
-        std::cerr << "Wrong number of inputs. User provided " << argc - 1
-                  << " inputs. "
-                  << "Usage: " + std::string(argv[0]) +
-                         " num_gate_reps num_qubits kernel [gate_lists]\n"
-                         "\tExample: "
-                  << argv[0] << " 1000 10 PI [PauliX, CNOT]"
-                  << std::endl; // Change to std::format in C++20
-        return -1;
-    }
-
-    size_t num_gate_reps;
-    size_t num_qubits;
-
-    try {
-        num_gate_reps = std::stoi(argv[1]);
-        num_qubits = std::stoi(argv[2]);
-    } catch (std::exception &e) {
-        std::cerr << "Arguments num_gate_reps and num_qubits must be integers."
-                  << std::endl;
-        return -1;
-    }
-
-    std::string_view kernel_name = argv[3];
-    KernelType kernel = string_to_kernel(kernel_name);
-    if (kernel == KernelType::None) {
-        std::cerr << "Kernel " << kernel_name << " is unknown." << std::endl;
-        return 1;
-    }
-
-    // Gate list is provided
-    std::string op_list_s;
-    {
-        std::ostringstream ss;
-        for (int idx = 4; idx < argc; idx++) {
-            ss << argv[idx] << " ";
-        }
-        op_list_s = ss.str();
-    }
-
-    std::vector<std::pair<std::string, GateDesc>> op_list;
-    try {
-        op_list = parseGateLists(op_list_s);
-    } catch (std::exception &e) {
-        std::cerr << e.what() << std::endl;
-        return 1;
-    }
-
-    if (op_list.empty()) {
-        std::cerr << "Please provide a gate list." << std::endl;
-        return 1;
-    }
-
-    // Generate random gate sequences
-    std::random_device rd;
-    std::mt19937 re(rd());
-
-    std::vector<std::string_view> random_gate_names;
-    std::vector<std::vector<size_t>> random_gate_wires;
-    std::vector<bool> random_inverses;
-    std::vector<std::vector<TestType>> random_gate_parameters;
-
-    std::uniform_int_distribution<size_t> gate_dist(0, op_list.size() - 1);
-    std::uniform_int_distribution<size_t> inverse_dist(0, 1);
-    std::uniform_real_distribution<TestType> param_dist(0.0, 2 * M_PI);
-    std::uniform_int_distribution<size_t> wire_dist(0, num_qubits - 1);
-
-    auto gen_param = [&param_dist, &re]() { return param_dist(re); };
-
-    for (uint32_t k = 0; k < num_gate_reps; k++) {
-        const auto &[op_name, gate_desc] = op_list[gate_dist(re)];
-
-        std::vector<TestType> gate_params(gate_desc.n_params, 0.0);
-        std::generate(gate_params.begin(), gate_params.end(), gen_param);
-
-        random_gate_names.emplace_back(op_name);
-        random_inverses.emplace_back(static_cast<bool>(inverse_dist(re)));
-        // random_gate_wires.emplace_back(generateDistinctWires(re, num_qubits,
-        // gate_desc.n_wires));
-        random_gate_wires.emplace_back(
-            generateNeighboringWires(re, num_qubits, gate_desc.n_wires));
-        random_gate_parameters.emplace_back(std::move(gate_params));
-    }
-
-    // Log generated sequence if LOG is turned on
-    const char *env_p = std::getenv("LOG");
-    try {
-        if (env_p != nullptr && std::stoi(env_p) != 0) {
-            for (size_t gate_rep = 0; gate_rep < num_gate_reps; gate_rep++) {
-                std::cerr << random_gate_names[gate_rep] << ", "
-                          << random_gate_wires[gate_rep] << ", "
-                          << random_gate_parameters[gate_rep] << std::endl;
-            }
-        }
-    } catch (std::exception &e) {
-        // Just do not print log
-    }
-
-    // Run benchmark. Total num_gate_reps number of gates is used.
-    Pennylane::StateVectorManaged<TestType> svdat{num_qubits};
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_start;
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_end;
-    t_start = std::chrono::high_resolution_clock::now();
-
-    for (size_t gate_rep = 0; gate_rep < num_gate_reps; gate_rep++) {
-        svdat.applyOperation(kernel, std::string(random_gate_names[gate_rep]),
-                             random_gate_wires[gate_rep],
-                             random_inverses[gate_rep],
-                             random_gate_parameters[gate_rep]);
-    }
-
-    t_end = std::chrono::high_resolution_clock::now();
-
-    // Output walltime in csv format (Num Qubits, Time (milliseconds))
-    const auto walltime =
-        0.001 * ((std::chrono::duration_cast<std::chrono::microseconds>(
-                      t_end - t_start))
-                     .count());
-    std::cout << num_qubits << ", "
-              << walltime / static_cast<double>(num_gate_reps) << std::endl;
-    return 0;
-}
diff --git a/pennylane_lightning/src/examples/plot_gate_benchmark.py b/pennylane_lightning/src/examples/plot_benchmark.py
similarity index 100%
rename from pennylane_lightning/src/examples/plot_gate_benchmark.py
rename to pennylane_lightning/src/examples/plot_benchmark.py
diff --git a/pennylane_lightning/src/examples/run_benchmark.sh b/pennylane_lightning/src/examples/run_benchmark.sh
new file mode 100755
index 0000000000..e8f7daf657
--- /dev/null
+++ b/pennylane_lightning/src/examples/run_benchmark.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+currdir=$(pwd)
+
+if [ "$#" -lt 2 ]; then
+	echo "Usage: $0 Kernel Gate [Number of wires (for MultiRZ)]"
+	exit 1
+fi
+
+# Parameter initialization
+min_num_qubits=8
+max_num_qubits=24
+num_qubits_increment=2
+num_gate_reps=1000
+kernel="$1"
+gate="$2"
+path_to_binary="./benchmark_gate"
+
+compiler_info=$(<compiler_info.txt)
+
+resdir="$currdir/res_${compiler_info}"
+mkdir -p "$resdir"
+data_file_name="benchmark_${kernel}_${gate}.csv"
+path_to_csv="$resdir/$data_file_name"
+echo "Creating $path_to_csv"
+echo "Num Qubits, Time (milliseconds)" > "$path_to_csv"
+
+# Optional third argument (e.g. number of wires) is forwarded to the binary
+extra_args=()
+if [[ "$#" -eq 3 ]]; then
+	extra_args=("$3")
+fi
+
+# Generate data; arguments are passed as separate quoted words, not re-split
+for ((num_qubits=$min_num_qubits; num_qubits<$max_num_qubits+1; num_qubits+=$num_qubits_increment)); do
+	echo "Gate repetition=$num_gate_reps, num_qubits=$num_qubits, kernel=$kernel, gate=$gate"
+	"$path_to_binary" "$num_gate_reps" "$num_qubits" "$kernel" "$gate" \
+		"${extra_args[@]}" >> "$path_to_csv"
+done
diff --git a/pennylane_lightning/src/examples/run_gate_benchmark.sh b/pennylane_lightning/src/examples/run_gate_benchmark.sh
deleted file mode 100755
index 315c3ebdda..0000000000
--- a/pennylane_lightning/src/examples/run_gate_benchmark.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-currdir=$(pwd)
-
-if [ "$#" -lt 2 ]; then
-	echo "Usage: $0 Kernel Gate [Number of wires (for MultiRZ)]"
-	exit 1
-fi
-
-# Parameter initialization
-min_num_qubits=8
-max_num_qubits=24
-num_qubits_increment=2
-num_gate_reps=1000
-kernel="$1"
-gate="$2"
-
-compiler_info=$(<compiler_info.txt)
-
-if [[ "$gate" != "MultiRZ" ]]; then
-	# Creating data file
-	binary_name="./benchmark_operation"
-	path_to_binary="$currdir/$binary_name"
-
-	resdir="$currdir/res_${compiler_info}"
-	mkdir -p $resdir
-	data_file_name="benchmark_${kernel}_${gate}.csv"
-	path_to_csv="$resdir/$data_file_name"
-	echo "Creating $path_to_csv"
-	echo "Num Qubits, Time (milliseconds)" > $path_to_csv
-
-	# Generate data
-	for ((num_qubits=$min_num_qubits; num_qubits<$max_num_qubits+1; num_qubits+=$num_qubits_increment)); do
-		echo "Gate repetition=$num_gate_reps, num_qubits=$num_qubits, kernel=$kernel, gate=$gate"
-		$path_to_binary ${num_gate_reps} ${num_qubits} ${kernel} ${gate} >> $path_to_csv
-	done
-else
-	num_wires="$3"
-	# Creating data file
-	binary_name="./benchmark_multi_rz"
-	path_to_binary="$currdir/$binary_name"
-
-	resdir="$currdir/res_${compiler_info}"
-	mkdir -p $resdir
-	data_file_name="benchmark_${kernel}_${gate}_${num_wires}.csv"
-	path_to_csv="$resdir/$data_file_name"
-	echo "Creating $path_to_csv"
-	echo "Num Qubits, Time (milliseconds)" > $path_to_csv
-
-	# Generate data
-	for ((num_qubits=$min_num_qubits; num_qubits<$max_num_qubits+1; num_qubits+=$num_qubits_increment)); do
-		echo "Gate repetition=$num_gate_reps, num_qubits=$num_qubits, kernel=$kernel, gate=$gate"
-		$path_to_binary ${num_gate_reps} ${num_qubits} ${num_wires} ${kernel} >> $path_to_csv
-	done
-fi
diff --git a/pennylane_lightning/src/examples/test.sh b/pennylane_lightning/src/examples/test.sh
new file mode 100644
index 0000000000..bb03f7a040
--- /dev/null
+++ b/pennylane_lightning/src/examples/test.sh
@@ -0,0 +1 @@
+echo $#  # prints the number of positional arguments passed to the script
diff --git a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
index 5618bffc7b..2a0e75c529 100644
--- a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
+++ b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
@@ -256,28 +256,56 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
         size_t dim = 1U << wires.size();
         std::vector<size_t> indices;
         indices.resize(dim);
+        std::vector<std::complex<PrecisionT>> coeffs_in(dim, 0.0);
 
-        for (size_t k = 0; k < Util::exp2(num_qubits); k += dim) {
-            std::vector<std::complex<PrecisionT>> coeffs_in(dim);
-            std::vector<std::complex<PrecisionT>> coeffs_out(dim);
+        if (inverse) {
+            for (size_t k = 0; k < Util::exp2(num_qubits); k += dim) {
+
+                for (size_t inner_idx = 0; inner_idx < dim; inner_idx++) {
+                    size_t idx = k | inner_idx;
+                    size_t n_wires = wires.size();
+                    for (size_t pos = 0; pos < n_wires; pos++) {
+                        idx = bitswap(idx, n_wires - pos - 1,
+                                      num_qubits - wires[pos] - 1);
+                    }
+                    indices[inner_idx] = idx;
+                    coeffs_in[inner_idx] = arr[idx];
+                }
+
+                for (size_t i = 0; i < dim; i++) {
+                    const auto idx = indices[i];
+                    arr[idx] = 0.0;
 
-            for (size_t inner_idx = 0; inner_idx < dim; inner_idx++) {
-                size_t idx = k | inner_idx;
-                size_t n_wires = wires.size();
-                for (size_t pos = 0; pos < n_wires; pos++) {
-                    idx = bitswap(idx, n_wires - pos - 1,
-                                  num_qubits - wires[pos] - 1);
+                    for (size_t j = 0; j < dim; j++) {
+                        const size_t base_idx = j * dim;
+                        arr[idx] +=
+                            std::conj(matrix[base_idx + i]) * coeffs_in[j];
+                    }
                 }
-                indices[inner_idx] = idx;
-                coeffs_in[inner_idx] = arr[idx];
             }
+        } else {
+            for (size_t k = 0; k < Util::exp2(num_qubits); k += dim) {
+
+                for (size_t inner_idx = 0; inner_idx < dim; inner_idx++) {
+                    size_t idx = k | inner_idx;
+                    size_t n_wires = wires.size();
+                    for (size_t pos = 0; pos < n_wires; pos++) {
+                        idx = bitswap(idx, n_wires - pos - 1,
+                                      num_qubits - wires[pos] - 1);
+                    }
+                    indices[inner_idx] = idx;
+                    coeffs_in[inner_idx] = arr[idx];
+                }
 
-            Util::matrixVecProd(matrix, coeffs_in.data(), coeffs_out.data(),
-                                dim, dim,
-                                inverse ? Trans::Adjoint : Trans::NoTranspose);
+                for (size_t i = 0; i < dim; i++) {
+                    const auto idx = indices[i];
+                    arr[idx] = 0.0;
+                    const size_t base_idx = i * dim;
 
-            for (size_t inner_idx = 0; inner_idx < dim; inner_idx++) {
-                arr[indices[inner_idx]] = coeffs_out[inner_idx];
+                    for (size_t j = 0; j < dim; j++) {
+                        arr[idx] += matrix[base_idx + j] * coeffs_in[j];
+                    }
+                }
             }
         }
     }
diff --git a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp
index b4314411f5..c2558b5021 100644
--- a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp
+++ b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp
@@ -206,23 +206,23 @@ class GateImplementationsPI : public PauliGenerator<GateImplementationsPI> {
                 const std::complex<PrecisionT> v10 = shiftedState[indices[2]];
                 const std::complex<PrecisionT> v11 = shiftedState[indices[3]];
 
-                // NOLINTNEXTLINE(readability-magic-numbers)
                 shiftedState[indices[0]] =
+                    // NOLINTNEXTLINE(readability-magic-numbers)
                     matrix[0b0000] * v00 + matrix[0b0001] * v01 +
                     // NOLINTNEXTLINE(readability-magic-numbers)
                     matrix[0b0010] * v10 + matrix[0b0011] * v11;
-                // NOLINTNEXTLINE(readability-magic-numbers)
                 shiftedState[indices[1]] =
+                    // NOLINTNEXTLINE(readability-magic-numbers)
                     matrix[0b0100] * v00 + matrix[0b0101] * v01 +
                     // NOLINTNEXTLINE(readability-magic-numbers)
                     matrix[0b0110] * v10 + matrix[0b0111] * v11;
-                // NOLINTNEXTLINE(readability-magic-numbers)
                 shiftedState[indices[2]] =
+                    // NOLINTNEXTLINE(readability-magic-numbers)
                     matrix[0b1000] * v00 + matrix[0b1001] * v01 +
                     // NOLINTNEXTLINE(readability-magic-numbers)
                     matrix[0b1010] * v10 + matrix[0b1011] * v11;
-                // NOLINTNEXTLINE(readability-magic-numbers)
                 shiftedState[indices[3]] =
+                    // NOLINTNEXTLINE(readability-magic-numbers)
                     matrix[0b1100] * v00 + matrix[0b1101] * v01 +
                     // NOLINTNEXTLINE(readability-magic-numbers)
                     matrix[0b1110] * v10 + matrix[0b1111] * v11;
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index 46fc68ab81..e761cdeca4 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -295,16 +295,6 @@ template <typename PrecisionT> class DynamicDispatcher {
                      const std::vector<size_t> &wires, bool inverse) const {
         assert(num_qubits >= wires.size());
 
-        switch (mat_op) {
-        case Gates::MatrixOperation::SingleQubitOp:
-            assert(wires.size() == 1);
-            break;
-        case Gates::MatrixOperation::TwoQubitOp:
-            assert(wires.size() == 2);
-            break;
-        default:
-            break;
-        }
         const auto iter = matrices_.find(std::make_pair(mat_op, kernel));
         if (iter == matrices_.end()) {
             throw std::invalid_argument(
diff --git a/pennylane_lightning/src/simulator/StateVectorBase.hpp b/pennylane_lightning/src/simulator/StateVectorBase.hpp
index 9861ec0c39..4853754815 100644
--- a/pennylane_lightning/src/simulator/StateVectorBase.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorBase.hpp
@@ -295,6 +295,110 @@ template <class T, class Derived> class StateVectorBase {
             num_qubits_, opName, wires, adj);
     }
 
+    /**
+     * @brief Apply a general single-qubit matrix to the given wire.
+     *
+     * @param kernel Kernel used to run the operation.
+     * @param matrix Pointer to the matrix data, forwarded to the dispatcher.
+     * @param wires Wire to apply the matrix to; exactly one (asserted).
+     * @param inverse Indicate whether inverse should be taken.
+     */
+    inline void applySingleQubitOp(Gates::KernelType kernel,
+                                   const ComplexPrecisionT *matrix,
+                                   const std::vector<size_t> &wires,
+                                   bool inverse = false) {
+        using Gates::MatrixOperation;
+
+        assert(wires.size() == 1); // arity check, active only without NDEBUG
+
+        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
+        auto *arr = getData();
+        dispatcher.applyMatrix(kernel, arr, MatrixOperation::SingleQubitOp,
+                               num_qubits_, matrix, wires, inverse);
+    }
+
+    /**
+     * @brief Apply a general two-qubit matrix to the given wires.
+     *
+     * @param kernel Kernel used to run the operation.
+     * @param matrix Pointer to the matrix data, forwarded to the dispatcher.
+     * @param wires Wires to apply the matrix to; exactly two (asserted).
+     * @param inverse Indicate whether inverse should be taken.
+     */
+    inline void applyTwoQubitOp(Gates::KernelType kernel,
+                                const ComplexPrecisionT *matrix,
+                                const std::vector<size_t> &wires,
+                                bool inverse = false) {
+        using Gates::MatrixOperation;
+
+        assert(wires.size() == 2); // arity check, active only without NDEBUG
+
+        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
+        auto *arr = getData();
+        dispatcher.applyMatrix(kernel, arr, MatrixOperation::TwoQubitOp,
+                               num_qubits_, matrix, wires, inverse);
+    }
+
+    /**
+     * @brief Apply a general multi-qubit matrix to the given wires.
+     *
+     * @param kernel Kernel used to run the operation.
+     * @param matrix Pointer to the matrix data, forwarded to the dispatcher.
+     * @param wires Wires to apply the matrix to; count is not checked here.
+     * @param inverse Indicate whether inverse should be taken.
+     */
+    inline void applyMultiQubitOp(Gates::KernelType kernel,
+                                  const ComplexPrecisionT *matrix,
+                                  const std::vector<size_t> &wires,
+                                  bool inverse = false) {
+        using Gates::MatrixOperation;
+
+        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
+        auto *arr = getData();
+        dispatcher.applyMatrix(kernel, arr, MatrixOperation::MultiQubitOp,
+                               num_qubits_, matrix, wires, inverse);
+    }
+
+    /**
+     * @brief Apply a given matrix directly to the statevector read directly
+     * from numpy data. Data can be in 1D or 2D format.
+     *
+     * @param kernel Kernel used to run the operation.
+     * @param matrix Pointer to the array data.
+     * @param wires Non-empty wire list (std::invalid_argument otherwise).
+     * @param inverse Indicate whether inverse should be taken.
+     */
+    inline void applyMatrix(Gates::KernelType kernel,
+                            const ComplexPrecisionT *matrix,
+                            const std::vector<size_t> &wires,
+                            bool inverse = false) {
+        using Gates::MatrixOperation;
+
+        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
+        auto *arr = getData();
+
+        if (wires.empty()) {
+            throw std::invalid_argument(
+                "Number of wires must be larger than 0");
+        }
+
+        switch (wires.size()) { // pick the matrix operation by wire count
+        case 1:
+            dispatcher.applyMatrix(kernel, arr, MatrixOperation::SingleQubitOp,
+                                   num_qubits_, matrix, wires, inverse);
+            return;
+        case 2:
+            dispatcher.applyMatrix(kernel, arr, MatrixOperation::TwoQubitOp,
+                                   num_qubits_, matrix, wires, inverse);
+            return;
+        default: // three or more wires (empty case was rejected above)
+            dispatcher.applyMatrix(kernel, arr, MatrixOperation::MultiQubitOp,
+                                   num_qubits_, matrix, wires, inverse);
+            return;
+        }
+        PL_UNREACHABLE; // every switch branch returns above
+    }
+
     /**
      * @brief Apply a given matrix directly to the statevector read directly
      * from numpy data. Data can be in 1D or 2D format.
@@ -306,10 +410,7 @@ template <class T, class Derived> class StateVectorBase {
     inline void applyMatrix(const ComplexPrecisionT *matrix,
                             const std::vector<size_t> &wires,
                             bool inverse = false) {
-        namespace Constant = Gates::Constant;
         using Gates::MatrixOperation;
-        using Gates::SelectKernel;
-        using Gates::static_lookup;
 
         auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
         auto *arr = getData();
diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp
index 723d03b10a..dc6056016b 100644
--- a/pennylane_lightning/src/tests/TestHelpers.hpp
+++ b/pennylane_lightning/src/tests/TestHelpers.hpp
@@ -210,24 +210,14 @@ auto createPlusState(size_t num_qubits)
     return res;
 }
 
-/**
- * @brief Calculate the squared norm of a vector
- */
-template <typename PrecisionT>
-auto squaredNorm(const std::complex<PrecisionT> *data, size_t data_size)
-    -> PrecisionT {
-    return std::transform_reduce(
-        data, data + data_size, PrecisionT{}, std::plus<PrecisionT>(),
-        static_cast<PrecisionT (*)(const std::complex<PrecisionT> &)>(
-            &std::norm<PrecisionT>));
-}
-
 /**
  * @brief create a random state
  */
 template <typename PrecisionT, class RandomEngine>
 auto createRandomState(RandomEngine &re, size_t num_qubits)
     -> TestVector<std::complex<PrecisionT>> {
+    using Util::squaredNorm;
+
     TestVector<std::complex<PrecisionT>> res(1U << num_qubits, {0.0, 0.0});
     std::uniform_real_distribution<PrecisionT> dist;
     for (size_t idx = 0; idx < (1U << num_qubits); idx++) {
@@ -321,58 +311,6 @@ auto createParams(Gates::GateOperation op) -> std::vector<PrecisionT> {
     }
     return {};
 }
-/**
- * @brief Generate random unitary matrix
- *
- * @return Generated unitary matrix in row-major format
- */
-template <typename PrecisionT, class RandomEngine>
-auto randomUnitary(RandomEngine &re, size_t num_qubits)
-    -> TestVector<std::complex<PrecisionT>> {
-    using ComplexPrecisionT = std::complex<PrecisionT>;
-    const size_t dim = (1U << num_qubits);
-    TestVector<ComplexPrecisionT> res(dim * dim, ComplexPrecisionT{});
-
-    std::normal_distribution<PrecisionT> dist;
-
-    auto generator = [&dist, &re]() -> ComplexPrecisionT {
-        return ComplexPrecisionT{dist(re), dist(re)};
-    };
-
-    std::generate(res.begin(), res.end(), generator);
-
-    // Simple algorithm to make rows orthogonal with Gram-Schmidt
-    // This algorithm is unstable but works for a small matrix.
-    // Use QR decomposition when we have LAPACK support.
-
-    for (size_t row2 = 0; row2 < dim; row2++) {
-        ComplexPrecisionT *row2_p = res.data() + row2 * dim;
-        for (size_t row1 = 0; row1 < row2; row1++) {
-            const ComplexPrecisionT *row1_p = res.data() + row1 * dim;
-            ComplexPrecisionT dot12 = Util::innerProdC(row1_p, row2_p, dim);
-            ComplexPrecisionT dot11 = squaredNorm(row1_p, dim);
-
-            // orthogonalize row2
-            std::transform(
-                row2_p, row2_p + dim, row1_p, row2_p,
-                [scale = dot12 / dot11](auto &elt2, const auto &elt1) {
-                    return elt2 - scale * elt1;
-                });
-        }
-    }
-
-    // Normalize each row
-    for (size_t row = 0; row < dim; row++) {
-        ComplexPrecisionT *row_p = res.data() + row * dim;
-        PrecisionT norm2 = std::sqrt(squaredNorm(row_p, dim));
-
-        // normalize row2
-        std::transform(row_p, row_p + dim, row_p, [norm2](const auto c) {
-            return (static_cast<PrecisionT>(1.0) / norm2) * c;
-        });
-    }
-    return res;
-}
 
 template <class PrecisionT> struct PrecisionToName;
 
diff --git a/pennylane_lightning/src/tests/Test_AdjDiff.cpp b/pennylane_lightning/src/tests/Test_AdjDiff.cpp
index 696d66d41d..d1f9e94136 100644
--- a/pennylane_lightning/src/tests/Test_AdjDiff.cpp
+++ b/pennylane_lightning/src/tests/Test_AdjDiff.cpp
@@ -50,7 +50,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RX, Obs=Z",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RX"}, {{p}}, {{0}}, {false});
 
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            std::vector<std::complex<double>> cdata(1U << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
 
             StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
@@ -82,7 +82,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RY, Obs=X",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RY"}, {{p}}, {{0}}, {false});
 
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            std::vector<std::complex<double>> cdata(1U << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
 
             StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
@@ -109,7 +109,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RX, Obs=[Z,Z]",
         const size_t num_obs = 2;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -140,7 +140,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[Z,Z,Z]",
         const size_t num_obs = 3;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -179,7 +179,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[Z,Z,Z], "
         std::vector<double> jacobian(num_obs * num_params, 0);
         std::vector<size_t> t_params{0, 2};
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -214,7 +214,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[ZZZ]",
         const size_t num_obs = 1;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -249,7 +249,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=Mixed, Obs=[XXX]",
         const size_t num_obs = 1;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRawCPU<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp
index 72eba17f63..512c33ee57 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp
@@ -6,6 +6,7 @@
 #include <catch2/catch.hpp>
 
 using namespace Pennylane;
+using Util::randomUnitary;
 
 template <typename PrecisionT>
 using ApplyMatrixType = void (*)(std::complex<PrecisionT> *, size_t,
diff --git a/pennylane_lightning/src/tests/Test_Internal.cpp b/pennylane_lightning/src/tests/Test_Internal.cpp
index 819f472586..b595d5daeb 100644
--- a/pennylane_lightning/src/tests/Test_Internal.cpp
+++ b/pennylane_lightning/src/tests/Test_Internal.cpp
@@ -85,38 +85,6 @@ TEMPLATE_TEST_CASE("createProductState", "[Test_Internal]", float, double) {
     }
 }
 
-/**
- * @brief Test randomUnitary is correct
- */
-TEMPLATE_TEST_CASE("randomUnitary", "[Test_Internal]", float, double) {
-    using PrecisionT = TestType;
-
-    std::mt19937 re{1337};
-
-    for (size_t num_qubits = 1; num_qubits <= 5; num_qubits++) {
-        const size_t dim = (1U << num_qubits);
-        const auto unitary = randomUnitary<PrecisionT>(re, num_qubits);
-
-        auto unitary_dagger = Util::Transpose(unitary, dim, dim);
-        std::transform(
-            unitary_dagger.begin(), unitary_dagger.end(),
-            unitary_dagger.begin(),
-            [](const std::complex<PrecisionT> &v) { return std::conj(v); });
-
-        std::vector<std::complex<PrecisionT>> mat(dim * dim);
-        Util::matrixMatProd(unitary.data(), unitary_dagger.data(), mat.data(),
-                            dim, dim, dim);
-
-        std::vector<std::complex<PrecisionT>> identity(
-            dim * dim, std::complex<PrecisionT>{});
-        for (size_t i = 0; i < dim; i++) {
-            identity[i * dim + i] = std::complex<PrecisionT>{1.0, 0.0};
-        }
-
-        REQUIRE(mat == PLApprox(identity).margin(1e-5));
-    }
-}
-
 size_t binomialCeff(size_t n, size_t r) {
     size_t num = 1;
     size_t dem = 1;
diff --git a/pennylane_lightning/src/tests/Test_Util.cpp b/pennylane_lightning/src/tests/Test_Util.cpp
index 8ac67d087a..adebf08c66 100644
--- a/pennylane_lightning/src/tests/Test_Util.cpp
+++ b/pennylane_lightning/src/tests/Test_Util.cpp
@@ -608,3 +608,35 @@ TEST_CASE("Utility array and tuples", "[Util]") {
                 std::pair<std::string_view, int>("Four", 4),
             });
 }
+
+/**
+ * @brief Test that randomUnitary output times its conjugate transpose is I
+ */
+TEMPLATE_TEST_CASE("randomUnitary", "[Util]", float, double) {
+    using PrecisionT = TestType;
+
+    std::mt19937 re{1337};
+
+    for (size_t num_qubits = 1; num_qubits <= 5; num_qubits++) {
+        const size_t dim = (1U << num_qubits);
+        const auto unitary = Util::randomUnitary<PrecisionT>(re, num_qubits);
+
+        auto unitary_dagger = Util::Transpose(unitary, dim, dim);
+        std::transform(
+            unitary_dagger.begin(), unitary_dagger.end(),
+            unitary_dagger.begin(),
+            [](const std::complex<PrecisionT> &v) { return std::conj(v); });
+
+        std::vector<std::complex<PrecisionT>> mat(dim * dim);
+        Util::matrixMatProd(unitary.data(), unitary_dagger.data(), mat.data(),
+                            dim, dim, dim);
+
+        std::vector<std::complex<PrecisionT>> identity(
+            dim * dim, std::complex<PrecisionT>{});
+        for (size_t i = 0; i < dim; i++) {
+            identity[i * dim + i] = std::complex<PrecisionT>{1.0, 0.0};
+        }
+
+        REQUIRE(mat == PLApprox(identity).margin(1e-5));
+    }
+}
diff --git a/pennylane_lightning/src/util/LinearAlgebra.hpp b/pennylane_lightning/src/util/LinearAlgebra.hpp
index bdf0e35f99..9f6941f891 100644
--- a/pennylane_lightning/src/util/LinearAlgebra.hpp
+++ b/pennylane_lightning/src/util/LinearAlgebra.hpp
@@ -17,9 +17,11 @@
  */
 #pragma once
 
+#include <algorithm>
 #include <complex>
 #include <cstdlib>
 #include <numeric>
+#include <random>
 #include <vector>
 
 #include "Util.hpp"
@@ -746,4 +748,69 @@ inline auto matrixMatProd(const std::vector<std::complex<T>> m_left,
 
     return m_out;
 }
+
+/**
+ * @brief Calculate the squared norm of a vector
+ */
+template <typename PrecisionT>
+auto squaredNorm(const std::complex<PrecisionT> *data, size_t data_size)
+    -> PrecisionT {
+    return std::transform_reduce(
+        data, data + data_size, PrecisionT{}, std::plus<PrecisionT>(),
+        static_cast<PrecisionT (*)(const std::complex<PrecisionT> &)>(
+            &std::norm<PrecisionT>));
+}
+
+/**
+ * @brief Generate a random unitary matrix of dimension 2^num_qubits
+ * Rows are filled with complex normal samples from re, then orthonormalized.
+ * @return Generated unitary matrix in row-major format
+ */
+template <typename PrecisionT, class RandomEngine>
+auto randomUnitary(RandomEngine &re, size_t num_qubits)
+    -> std::vector<std::complex<PrecisionT>> {
+    using ComplexPrecisionT = std::complex<PrecisionT>;
+    const size_t dim = (1U << num_qubits);
+    std::vector<ComplexPrecisionT> res(dim * dim, ComplexPrecisionT{});
+
+    std::normal_distribution<PrecisionT> dist;
+
+    auto generator = [&dist, &re]() -> ComplexPrecisionT {
+        return ComplexPrecisionT{dist(re), dist(re)};
+    };
+
+    std::generate(res.begin(), res.end(), generator);
+
+    // Simple algorithm to make rows orthogonal with Gram-Schmidt
+    // This algorithm is unstable but works for a small matrix.
+    // Use QR decomposition when we have LAPACK support.
+
+    for (size_t row2 = 0; row2 < dim; row2++) {
+        ComplexPrecisionT *row2_p = res.data() + row2 * dim;
+        for (size_t row1 = 0; row1 < row2; row1++) {
+            const ComplexPrecisionT *row1_p = res.data() + row1 * dim;
+            ComplexPrecisionT dot12 = Util::innerProdC(row1_p, row2_p, dim);
+            ComplexPrecisionT dot11 = squaredNorm(row1_p, dim);
+
+            // remove from row2 its component along row1
+            std::transform(
+                row2_p, row2_p + dim, row1_p, row2_p,
+                [scale = dot12 / dot11](auto &elt2, const auto &elt1) {
+                    return elt2 - scale * elt1;
+                });
+        }
+    }
+
+    // Normalize each row
+    for (size_t row = 0; row < dim; row++) {
+        ComplexPrecisionT *row_p = res.data() + row * dim;
+        PrecisionT norm2 = std::sqrt(squaredNorm(row_p, dim));
+
+        // rescale the row to unit norm (norm2 is the norm itself)
+        std::transform(row_p, row_p + dim, row_p, [norm2](const auto c) {
+            return (static_cast<PrecisionT>(1.0) / norm2) * c;
+        });
+    }
+    return res;
+}
 } // namespace Pennylane::Util

From a0df568b1df1f40445f74e46740637172d9186db Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Tue, 1 Mar 2022 23:24:52 -0500
Subject: [PATCH 05/94] Update benchmark

---
 .../src/examples/CMakeLists.txt               |  12 +-
 .../src/examples/run_benchmark.py             | 156 ++++++++++++++++++
 .../src/examples/run_benchmark.sh             |  23 ++-
 pennylane_lightning/src/examples/test.sh      |   2 +-
 .../DefaultKernelsForStateVector.hpp          |   2 +-
 5 files changed, 180 insertions(+), 15 deletions(-)
 create mode 100644 pennylane_lightning/src/examples/run_benchmark.py

diff --git a/pennylane_lightning/src/examples/CMakeLists.txt b/pennylane_lightning/src/examples/CMakeLists.txt
index addb786ad6..8c7d0a900b 100644
--- a/pennylane_lightning/src/examples/CMakeLists.txt
+++ b/pennylane_lightning/src/examples/CMakeLists.txt
@@ -9,10 +9,9 @@ project("gate_benchmark"
         LANGUAGES CXX
 )
 
-# add_executable(gate_benchmark gate_benchmark.cpp)
-# target_link_libraries(gate_benchmark lightning_utils lightning_simulator
-#                                      lightning_compile_options
-#                                      lightning_external_libs)
+################################################################################
+# II. Set dependencies
+################################################################################
 
 add_library(lightning_examples INTERFACE)
 target_link_libraries(lightning_examples INTERFACE lightning_compile_options
@@ -21,6 +20,11 @@ target_link_libraries(lightning_examples INTERFACE lightning_compile_options
                                                    lightning_simulator
                                                    lightning_utils)
 
+
+################################################################################
+# III. set executables
+################################################################################
+
 add_executable(benchmark_gate benchmark_gate.cpp)
 target_link_libraries(benchmark_gate PRIVATE lightning_examples)
 
diff --git a/pennylane_lightning/src/examples/run_benchmark.py b/pennylane_lightning/src/examples/run_benchmark.py
new file mode 100644
index 0000000000..1a7febfa27
--- /dev/null
+++ b/pennylane_lightning/src/examples/run_benchmark.py
@@ -0,0 +1,156 @@
+import subprocess
+import argparse
+import json
+from pathlib import Path
+from typing import final
+import abc
+
+MIN_NUM_QUBITS = 8
+MAX_NUM_QUBITS = 24
+STEP_NUM_QUBITS = 2
+NUM_GATE_REPS = 1000
+
+
+class BenchmarkRunner(abc.ABC):  # ABC base so @abc.abstractmethod is enforced
+    def __init__(self, kernel, operation):
+        self.kernel = kernel
+        self.operation = operation
+
+    @final
+    def benchmark(self, res_path):
+        result = []
+        ext_info = self.external_info()
+        if ext_info:
+            result.append(ext_info)
+        try:
+            for num_qubit in range(MIN_NUM_QUBITS, MAX_NUM_QUBITS + 1, STEP_NUM_QUBITS):
+                cmd = self.command(num_qubit)
+                print(f"Run N={num_qubit}, {self.kernel}, {self.operation}")
+                output = subprocess.run([str(c) for c in cmd], capture_output=True, check=True)
+                time = output.stdout.decode("utf-8").strip().split(",")[1]  # 2nd CSV field
+                result.append({"N": num_qubit, "time": time})
+        except subprocess.CalledProcessError as err:
+            print("Error from subprocess call. Message:")
+            print(err.stderr.decode("utf-8"))
+        except KeyboardInterrupt:
+            pass  # fall through so the partial results are still written below
+
+        res_path = Path(res_path)
+        if not res_path.exists():
+            res_path.mkdir(parents=True)
+
+        with res_path.joinpath(self.filename()).open("w") as f:
+            json.dump(result, f, indent=4)
+
+    @abc.abstractmethod
+    def command(self, num_qubits):
+        """Return the argv list (program first) for one run with num_qubits qubits."""
+
+    @abc.abstractmethod
+    def external_info(self):
+        """Return a dict stored as the first result entry, or a falsy value to skip it."""
+
+    @abc.abstractmethod
+    def filename(self):
+        """Return the name of the JSON file written under res_path."""
+
+
+class MatrixBenchmarkRunner(BenchmarkRunner):
+    def __init__(self, kernel, operation, num_wires):
+        super().__init__(kernel, operation)
+        self.num_wires = num_wires
+
+    def command(self, num_qubits):
+        return ["./benchmark_matrix", NUM_GATE_REPS, num_qubits, self.kernel, self.num_wires]
+
+    def external_info(self):
+        return {"num_wires": self.num_wires}
+
+    def filename(self):
+        return f"Matrix_{self.kernel}_{self.num_wires}.json"
+
+
+class GateBenchmarkRunner(BenchmarkRunner):
+    def __init__(self, kernel, operation, num_wires=None):
+        super().__init__(kernel, operation)
+        self.num_wires = num_wires
+
+    def command(self, num_qubits):
+        cmd = ["./benchmark_gate", NUM_GATE_REPS, num_qubits, self.kernel, self.operation]
+        if self.num_wires:
+            cmd.append(self.num_wires)
+        return cmd
+
+    def external_info(self):
+        if self.num_wires:
+            return {"num_wires": self.num_wires}
+        else:
+            return None
+
+    def filename(self):
+        if self.num_wires:
+            return f"{self.operation}_{self.kernel}_{self.num_wires}.json"
+        else:
+            return f"{self.operation}_{self.kernel}.json"
+
+
+class GeneratorBenchmarkRunner(BenchmarkRunner):
+    def __init__(self, kernel, operation, num_wires=None):
+        super().__init__(kernel, operation)
+        self.num_wires = num_wires
+
+    def command(self, num_qubits):
+        cmd = ["./benchmark_generator", NUM_GATE_REPS, num_qubits, self.kernel, self.operation[9:]]
+        if self.num_wires is not None:
+            cmd.append(self.num_wires)
+        return cmd
+
+    def external_info(self):
+        if self.num_wires:
+            return {"num_wires": self.num_wires}
+        else:
+            return None
+
+    def filename(self):
+        if self.num_wires:
+            return f"{self.operation}_{self.kernel}_{self.num_wires}.json"
+        else:
+            return f"{self.operation}_{self.kernel}.json"
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run C++ benchmarks")
+    parser.add_argument("kernel", help="Kernel to benchmark")
+    parser.add_argument("operation", help="Operation to benchmark.")
+
+    parser.add_argument(
+        "num_wires",
+        help="Number of wires (optional for multi-qubit operations).",
+        nargs="?",
+        default=None,
+        type=int,
+    )
+
+    args = parser.parse_args()
+
+    compiler_info_file = "compiler_info.txt"
+
+    try:
+        with open(compiler_info_file, "r") as f:
+            res_path = "res_" + f.readline().strip()
+    except OSError:
+        print("Encountered an error while opening '{}'".format(compiler_info_file))
+        raise SystemExit(1)  # what sys.exit(1) does; `sys` is not imported in this script
+
+    if args.operation == "Matrix":
+        if args.num_wires is None or args.num_wires <= 0:  # None when the argument is omitted
+            raise ValueError(
+                "Parameter num_wires must be provided and larger than 0 for matrix benchmark."
+            )
+        runner = MatrixBenchmarkRunner(args.kernel, args.operation, args.num_wires)
+    elif args.operation.startswith("Generator"):
+        runner = GeneratorBenchmarkRunner(args.kernel, args.operation, args.num_wires)
+    else:
+        runner = GateBenchmarkRunner(args.kernel, args.operation, args.num_wires)
+
+    runner.benchmark(res_path)
diff --git a/pennylane_lightning/src/examples/run_benchmark.sh b/pennylane_lightning/src/examples/run_benchmark.sh
index e8f7daf657..1818100452 100755
--- a/pennylane_lightning/src/examples/run_benchmark.sh
+++ b/pennylane_lightning/src/examples/run_benchmark.sh
@@ -13,24 +13,29 @@ max_num_qubits=24
 num_qubits_increment=2
 num_gate_reps=1000
 kernel="$1"
-gate="$2"
-path_to_binary="./benchmark_gate"
+
+if [[ "$2" == "Matrix" ]]; then
+	path_to_binary="./benchmark_matrix"
+	command_format="$path_to_binary ${num_gate_reps} $kernel ${@:3}"
+elif [[ "$2" =~ "Generator.*" ]]; then
+	path_to_binary="./benchmark_generator"
+	operation=$(echo "$2" | cut -c10-)
+	command_format="$path_to_binary ${num_gate_reps} %d $kernel $operation ${@:3}"
+else
+	path_to_binary="./benchmark_gate"
+	operation="$2"
+	command_format="$path_to_binary ${num_gate_reps} %d $kernel $operation ${@:3}"
+fi
+
 
 compiler_info=$(<compiler_info.txt)
 
 resdir="$currdir/res_${compiler_info}"
 mkdir -p $resdir
-data_file_name="benchmark_${kernel}_${gate}.csv"
 path_to_csv="$resdir/$data_file_name"
 echo "Creating $path_to_csv"
 echo "Num Qubits, Time (milliseconds)" > $path_to_csv
 
-if [[ "$#" -eq 3 ]]; then
-	command_format="$path_to_binary ${num_gate_reps} %d ${kernel} ${gate} $3"
-else
-	command_format="$path_to_binary ${num_gate_reps} %d ${kernel} ${gate}"
-fi
-
 # Generate data
 for ((num_qubits=$min_num_qubits; num_qubits<$max_num_qubits+1; num_qubits+=$num_qubits_increment)); do
 	echo "Gate repetition=$num_gate_reps, num_qubits=$num_qubits, kernel=$kernel, gate=$gate"
diff --git a/pennylane_lightning/src/examples/test.sh b/pennylane_lightning/src/examples/test.sh
index bb03f7a040..fec430ac67 100644
--- a/pennylane_lightning/src/examples/test.sh
+++ b/pennylane_lightning/src/examples/test.sh
@@ -1 +1 @@
-echo $#
+echo ${[]}
diff --git a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
index 5cb25cbd93..f3a33b91bf 100644
--- a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
+++ b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
@@ -312,7 +312,7 @@ class DefaultKernelsForStateVector {
                                        all_qubit_numbers, KernelType::LM);
         instance.assignKernelForMatrix(MatrixOperation::MultiQubitOp,
                                        all_threading, all_memory_model,
-                                       all_qubit_numbers, KernelType::LM);
+                                       all_qubit_numbers, KernelType::PI);
     }
 
     DefaultKernelsForStateVector() {

From d0265b3f2a8241fdf93fff5083c29461ed16d67c Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Tue, 1 Mar 2022 23:25:19 -0500
Subject: [PATCH 06/94] Update CMake

---
 pennylane_lightning/src/examples/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pennylane_lightning/src/examples/CMakeLists.txt b/pennylane_lightning/src/examples/CMakeLists.txt
index 8c7d0a900b..6754f60fee 100644
--- a/pennylane_lightning/src/examples/CMakeLists.txt
+++ b/pennylane_lightning/src/examples/CMakeLists.txt
@@ -42,8 +42,8 @@ configure_file("compiler_info.in" "compiler_info.txt")
 
 add_custom_command(TARGET benchmark_gate POST_BUILD 
                    COMMAND ${CMAKE_COMMAND} -E copy
-                           ${PROJECT_SOURCE_DIR}/run_benchmark.sh
-                           ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/run_benchmark.sh
+                           ${PROJECT_SOURCE_DIR}/run_benchmark.py
+                           ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/run_benchmark.py
                    COMMAND ${CMAKE_COMMAND} -E create_symlink
                            ${PROJECT_SOURCE_DIR}/plot_benchmark.py
                            ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/plot_benchmark.py

From d4b92d13da5db5e893a790dae1f7d8ce34a017cd Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Tue, 1 Mar 2022 23:39:08 -0500
Subject: [PATCH 07/94] Fix for tidy

---
 pennylane_lightning/src/examples/benchmark_gate.cpp      | 3 ++-
 pennylane_lightning/src/examples/benchmark_generator.cpp | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/pennylane_lightning/src/examples/benchmark_gate.cpp b/pennylane_lightning/src/examples/benchmark_gate.cpp
index 00545b1988..7dff1f507c 100644
--- a/pennylane_lightning/src/examples/benchmark_gate.cpp
+++ b/pennylane_lightning/src/examples/benchmark_gate.cpp
@@ -175,7 +175,7 @@ int main(int argc, char *argv[]) {
     size_t num_wires_for_multi_qubit = 0;
     if (Util::array_has_elt(Constant::multi_qubit_gates, gate_op)) {
         // User provided a multi-qubit gates
-        if (argc != 6) {
+        if (argc != 6) { // NOLINT(readability-magic-numbers)
             std::cerr << "One should provide the number of wires when using "
                          "multi qubit gates."
                       << std::endl;
@@ -183,6 +183,7 @@ int main(int argc, char *argv[]) {
         }
 
         try {
+            // NOLINTNEXTLINE(readability-magic-numbers)
             num_wires_for_multi_qubit = std::stoi(argv[5]);
         } catch (std::exception &e) {
             std::cerr << "Number of wires must be an integer" << std::endl;
diff --git a/pennylane_lightning/src/examples/benchmark_generator.cpp b/pennylane_lightning/src/examples/benchmark_generator.cpp
index 0753b57e6a..5e132d7e25 100644
--- a/pennylane_lightning/src/examples/benchmark_generator.cpp
+++ b/pennylane_lightning/src/examples/benchmark_generator.cpp
@@ -174,7 +174,7 @@ int main(int argc, char *argv[]) {
     size_t num_wires_for_multi_qubit = 0;
     if (Util::array_has_elt(Constant::multi_qubit_generators, gntr_op)) {
         // User provided a multi-qubit gates
-        if (argc != 6) {
+        if (argc != 6) { // NOLINT(readability-magic-numbers)
             std::cerr << "One should provide the number of wires when using "
                          "multi qubit generators."
                       << std::endl;
@@ -182,6 +182,7 @@ int main(int argc, char *argv[]) {
         }
 
         try {
+            // NOLINTNEXTLINE(readability-magic-numbers)
             num_wires_for_multi_qubit = std::stoi(argv[5]);
         } catch (std::exception &e) {
             std::cerr << "Number of wires must be an integer" << std::endl;

From aa8f3122249a3914d3fe5793da6e650be964ca63 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Tue, 1 Mar 2022 23:40:55 -0500
Subject: [PATCH 08/94] Fix for codefac

---
 .github/workflows/dev_version_script.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/dev_version_script.py b/.github/workflows/dev_version_script.py
index 2b3f526f8b..f3a2a7300b 100644
--- a/.github/workflows/dev_version_script.py
+++ b/.github/workflows/dev_version_script.py
@@ -34,10 +34,7 @@ def extract_version(package_path):
 
 def is_dev(version_str):
     m = rgx_dev_ver.fullmatch(version_str)
-    if m:
-        return True
-    else:
-        return False
+    return m is not None:
 
 def update_dev_version(package_path, version_str):
     m = rgx_dev_ver.fullmatch(version_str)

From 717c0683c92208fc1a760a3d320e898998c5e407 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Tue, 1 Mar 2022 23:53:20 -0500
Subject: [PATCH 09/94] Update for Codefactor

---
 pennylane_lightning/src/bindings/Bindings.hpp |  1 -
 .../src/examples/benchmark_generator.cpp      |  1 -
 .../src/examples/run_benchmark.py             | 12 ++---
 .../src/examples/run_benchmark.sh             | 44 -------------------
 pennylane_lightning/src/examples/test.sh      |  1 -
 .../cpu_kernels/GateImplementationsLM.hpp     |  2 -
 .../src/simulator/StateVectorManagedCPU.hpp   |  2 -
 .../src/tests/CreateAllWires.cpp              |  1 -
 .../src/tests/Test_Internal.cpp               |  1 -
 9 files changed, 4 insertions(+), 61 deletions(-)
 delete mode 100755 pennylane_lightning/src/examples/run_benchmark.sh
 delete mode 100644 pennylane_lightning/src/examples/test.sh

diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index 84ef5f806c..089aea48d7 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -107,7 +107,6 @@ void deallocateArray(void *ptr) { std::free(ptr); }
  * for capsule usage.
  */
 auto allocateAlignedArray(size_t size, pybind11::dtype dt) -> pybind11::array {
-
     auto memory_model = bestCPUMemoryModel();
 
     if (dt.is(pybind11::dtype::of<float>())) {
diff --git a/pennylane_lightning/src/examples/benchmark_generator.cpp b/pennylane_lightning/src/examples/benchmark_generator.cpp
index 5e132d7e25..c1ea726ec3 100644
--- a/pennylane_lightning/src/examples/benchmark_generator.cpp
+++ b/pennylane_lightning/src/examples/benchmark_generator.cpp
@@ -68,7 +68,6 @@ auto generateGeneratorSequence(RandomEngine &re,
     std::uniform_int_distribution<size_t> inverse_dist(0, 1);
 
     for (uint32_t k = 0; k < num_reps; k++) {
-
         bool inverse = static_cast<bool>(inverse_dist(re));
         auto wires = generateNeighboringWires(re, num_qubits, num_wires);
 
diff --git a/pennylane_lightning/src/examples/run_benchmark.py b/pennylane_lightning/src/examples/run_benchmark.py
index 1a7febfa27..f2a770d2c8 100644
--- a/pennylane_lightning/src/examples/run_benchmark.py
+++ b/pennylane_lightning/src/examples/run_benchmark.py
@@ -84,14 +84,12 @@ def command(self, num_qubits):
     def external_info(self):
         if self.num_wires:
             return {"num_wires": self.num_wires}
-        else:
-            return None
+        return None
 
     def filename(self):
         if self.num_wires:
             return f"{self.operation}_{self.kernel}_{self.num_wires}.json"
-        else:
-            return f"{self.operation}_{self.kernel}.json"
+        return f"{self.operation}_{self.kernel}.json"
 
 
 class GeneratorBenchmarkRunner(BenchmarkRunner):
@@ -108,14 +106,12 @@ def command(self, num_qubits):
     def external_info(self):
         if self.num_wires:
             return {"num_wires": self.num_wires}
-        else:
-            return None
+        return None
 
     def filename(self):
         if self.num_wires:
             return f"{self.operation}_{self.kernel}_{self.num_wires}.json"
-        else:
-            return f"{self.operation}_{self.kernel}.json"
+        return f"{self.operation}_{self.kernel}.json"
 
 
 if __name__ == "__main__":
diff --git a/pennylane_lightning/src/examples/run_benchmark.sh b/pennylane_lightning/src/examples/run_benchmark.sh
deleted file mode 100755
index 1818100452..0000000000
--- a/pennylane_lightning/src/examples/run_benchmark.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-currdir=$(pwd)
-
-if [ "$#" -lt 2 ]; then
-	echo "Usage: $0 Kernel Gate [Number of wires (for MultiRZ)]"
-	exit 1
-fi
-
-# Parameter initialization
-min_num_qubits=8
-max_num_qubits=24
-num_qubits_increment=2
-num_gate_reps=1000
-kernel="$1"
-
-if [[ "$2" == "Matrix" ]]; then
-	path_to_binary="./benchmark_matrix"
-	command_format="$path_to_binary ${num_gate_reps} $kernel ${@:3}"
-elif [[ "$2" =~ "Generator.*" ]]; then
-	path_to_binary="./benchmark_generator"
-	operation=$(echo "$2" | cut -c10-)
-	command_format="$path_to_binary ${num_gate_reps} %d $kernel $operation ${@:3}"
-else
-	path_to_binary="./benchmark_gate"
-	operation="$2"
-	command_format="$path_to_binary ${num_gate_reps} %d $kernel $operation ${@:3}"
-fi
-
-
-compiler_info=$(<compiler_info.txt)
-
-resdir="$currdir/res_${compiler_info}"
-mkdir -p $resdir
-path_to_csv="$resdir/$data_file_name"
-echo "Creating $path_to_csv"
-echo "Num Qubits, Time (milliseconds)" > $path_to_csv
-
-# Generate data
-for ((num_qubits=$min_num_qubits; num_qubits<$max_num_qubits+1; num_qubits+=$num_qubits_increment)); do
-	echo "Gate repetition=$num_gate_reps, num_qubits=$num_qubits, kernel=$kernel, gate=$gate"
-	command=$(printf "$command_format" "$num_qubits")
-	$command >> $path_to_csv
-done
diff --git a/pennylane_lightning/src/examples/test.sh b/pennylane_lightning/src/examples/test.sh
deleted file mode 100644
index fec430ac67..0000000000
--- a/pennylane_lightning/src/examples/test.sh
+++ /dev/null
@@ -1 +0,0 @@
-echo ${[]}
diff --git a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
index 2a0e75c529..651731f8d2 100644
--- a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
+++ b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
@@ -260,7 +260,6 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
 
         if (inverse) {
             for (size_t k = 0; k < Util::exp2(num_qubits); k += dim) {
-
                 for (size_t inner_idx = 0; inner_idx < dim; inner_idx++) {
                     size_t idx = k | inner_idx;
                     size_t n_wires = wires.size();
@@ -285,7 +284,6 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
             }
         } else {
             for (size_t k = 0; k < Util::exp2(num_qubits); k += dim) {
-
                 for (size_t inner_idx = 0; inner_idx < dim; inner_idx++) {
                     size_t idx = k | inner_idx;
                     size_t n_wires = wires.size();
diff --git a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
index 599cb9b91b..dd5d817db7 100644
--- a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
@@ -48,7 +48,6 @@ class StateVectorManagedCPU
         size_t num_qubits, Threading threading = bestThreading(),
         CPUMemoryModel memory_model = bestCPUMemoryModel())
         : BaseType{num_qubits, threading, memory_model} {
-
         size_t length = BaseType::getLength();
         data_ = allocateMemory<ComplexPrecisionT>(memory_model, length);
         std::fill(data_.get(), data_.get() + length,
@@ -61,7 +60,6 @@ class StateVectorManagedCPU
         const StateVectorCPU<PrecisionT, OtherDerived> &other)
         : BaseType(other.getNumQubits(), other.threading(),
                    other.memoryModel()) {
-
         size_t length = BaseType::getLength();
         data_ = allocateMemory<ComplexPrecisionT>(other.memoryModel(), length);
 
diff --git a/pennylane_lightning/src/tests/CreateAllWires.cpp b/pennylane_lightning/src/tests/CreateAllWires.cpp
index 4738554b54..ecea28089c 100644
--- a/pennylane_lightning/src/tests/CreateAllWires.cpp
+++ b/pennylane_lightning/src/tests/CreateAllWires.cpp
@@ -2,7 +2,6 @@
 namespace Pennylane {
 auto crateAllWires(size_t n_qubits, Gates::GateOperation gate_op, bool order)
     -> std::vector<std::vector<size_t>> {
-
     if (Util::array_has_elt(Gates::Constant::multi_qubit_gates, gate_op)) {
         // make all possible 2^N permutations
         std::vector<std::vector<size_t>> res;
diff --git a/pennylane_lightning/src/tests/Test_Internal.cpp b/pennylane_lightning/src/tests/Test_Internal.cpp
index b595d5daeb..9ef68bdc13 100644
--- a/pennylane_lightning/src/tests/Test_Internal.cpp
+++ b/pennylane_lightning/src/tests/Test_Internal.cpp
@@ -109,7 +109,6 @@ size_t permSize(size_t n, size_t r) {
  * @brief Test create all wires
  */
 TEST_CASE("createAllWires", "[Test_Internal]") {
-
     SECTION("order = false") {
         const std::vector<std::pair<size_t, size_t>> test_pairs{
             {4, 2},  {8, 3},  {12, 1}, {12, 2}, {12, 3},  {12, 4},  {12, 5},

From fc5dce9440f4b34f55ab76a988967a8bd17a3eec Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Tue, 1 Mar 2022 23:54:06 -0500
Subject: [PATCH 10/94] Fix version script

---
 .github/workflows/dev_version_script.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/dev_version_script.py b/.github/workflows/dev_version_script.py
index f3a2a7300b..82d47e6656 100644
--- a/.github/workflows/dev_version_script.py
+++ b/.github/workflows/dev_version_script.py
@@ -34,7 +34,7 @@ def extract_version(package_path):
 
 def is_dev(version_str):
     m = rgx_dev_ver.fullmatch(version_str)
-    return m is not None:
+    return m is not None
 
 def update_dev_version(package_path, version_str):
     m = rgx_dev_ver.fullmatch(version_str)

From c219a5c076da556d6769ea6af315063fc2970c4b Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Tue, 1 Mar 2022 23:55:21 -0500
Subject: [PATCH 11/94] More fix for codecov

---
 pennylane_lightning/src/examples/benchmark_matrix.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pennylane_lightning/src/examples/benchmark_matrix.cpp b/pennylane_lightning/src/examples/benchmark_matrix.cpp
index 9d297db91c..26d6ec45df 100644
--- a/pennylane_lightning/src/examples/benchmark_matrix.cpp
+++ b/pennylane_lightning/src/examples/benchmark_matrix.cpp
@@ -38,7 +38,6 @@ template <class RandomEngine>
 auto generateMatrixSequence(RandomEngine &re, const size_t num_reps,
                             const size_t num_qubits, const size_t num_wires)
     -> std::vector<MatOpDesc> {
-
     std::vector<MatOpDesc> matrix_seq;
     matrix_seq.reserve(num_reps);
     std::uniform_int_distribution<size_t> inverse_dist(0, 1);

From c882648dcd10ed27ec05e635ffcea4f5ce2966a1 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 2 Mar 2022 00:41:09 -0500
Subject: [PATCH 12/94] tidy test

---
 .github/workflows/format.yml           | 2 +-
 pennylane_lightning/src/CMakeLists.txt | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
index 347f111cae..319471b3ca 100644
--- a/.github/workflows/format.yml
+++ b/.github/workflows/format.yml
@@ -42,7 +42,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: 3.7
+          python-version: 3.8
 
       - name: Install dependencies
         run: sudo apt update && sudo apt -y install clang-tidy-12 cmake g++
diff --git a/pennylane_lightning/src/CMakeLists.txt b/pennylane_lightning/src/CMakeLists.txt
index b6776ac992..0385f947d6 100644
--- a/pennylane_lightning/src/CMakeLists.txt
+++ b/pennylane_lightning/src/CMakeLists.txt
@@ -11,8 +11,9 @@ if(ENABLE_CLANG_TIDY)
     if(NOT DEFINED CLANG_TIDY_BINARY)
         set(CLANG_TIDY_BINARY clang-tidy)
     endif()
+    message(STATUS "Using CLANG_TIDY_BINARY=${CLANG_TIDY_BINARY}")
     set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY_BINARY};
-                            -extra-arg=-std=c++17;
+                             -extra-arg=-std=c++17;
     )
 endif()
 

From 6a8c17953caf648f25e72741307ad546be14fba9 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 2 Mar 2022 10:01:53 -0500
Subject: [PATCH 13/94] Fix for clang

---
 .../src/simulator/DynamicDispatcher.hpp       |  2 +-
 pennylane_lightning/src/tests/TestHelpers.hpp |  2 +-
 .../Test_GateImplementations_Generator.cpp    |  2 --
 .../src/tests/Test_Internal.cpp               |  4 +--
 .../src/tests/Test_OpToMemberFuncPtr.cpp      | 10 ------
 pennylane_lightning/src/util/Memory.hpp       | 33 +++++++++++++++++--
 6 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index e761cdeca4..f25bddf95a 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -323,7 +323,7 @@ template <typename PrecisionT> class DynamicDispatcher {
                 "The size of matrix does not match with the given "
                 "number of wires");
         }
-        applyMatrix(kernel, data, num_qubits, matrix.data(), wires, inverse);
+        applyMatrix(kernel, data, mat_op, num_qubits, matrix.data(), wires, inverse);
     }
 
     /**
diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp
index dc6056016b..457d1eebd2 100644
--- a/pennylane_lightning/src/tests/TestHelpers.hpp
+++ b/pennylane_lightning/src/tests/TestHelpers.hpp
@@ -127,7 +127,7 @@ isApproxEqual(const std::vector<Data_t, AllocA> &data1,
               const typename Data_t::value_type eps =
                   std::numeric_limits<typename Data_t::value_type>::epsilon() *
                   100) {
-    return data1 == PLApprox(data2);
+    return data1 == PLApprox(data2).epsilon(eps);
 }
 
 /**
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
index 2e9cd9cdcb..4b80e5235b 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
@@ -35,8 +35,6 @@ constexpr std::string_view remove_prefix(const std::string_view &str,
     return {str.data() + len, str.length() - len};
 }
 
-constexpr auto gate_name_to_ops = Util::reverse_pairs(Constant::gate_names);
-
 template <GeneratorOperation gntr_op>
 constexpr auto findGateOpForGenerator() -> GateOperation {
     constexpr auto gntr_name =
diff --git a/pennylane_lightning/src/tests/Test_Internal.cpp b/pennylane_lightning/src/tests/Test_Internal.cpp
index 9ef68bdc13..99233b89d6 100644
--- a/pennylane_lightning/src/tests/Test_Internal.cpp
+++ b/pennylane_lightning/src/tests/Test_Internal.cpp
@@ -114,7 +114,7 @@ TEST_CASE("createAllWires", "[Test_Internal]") {
             {4, 2},  {8, 3},  {12, 1}, {12, 2}, {12, 3},  {12, 4},  {12, 5},
             {12, 6}, {12, 7}, {12, 8}, {12, 9}, {12, 10}, {12, 11}, {12, 12}};
 
-        for (const auto [n, r] : test_pairs) {
+        for (const auto& [n, r] : test_pairs) {
             std::vector<std::set<size_t>> vec;
             auto v = CombinationGenerator(n, r).all_perms();
 
@@ -139,7 +139,7 @@ TEST_CASE("createAllWires", "[Test_Internal]") {
         const std::vector<std::pair<size_t, size_t>> test_pairs{
             {4, 2}, {8, 3}, {12, 1}, {12, 2}, {12, 3}, {12, 4}, {12, 5}};
 
-        for (const auto [n, r] : test_pairs) {
+        for (const auto& [n, r] : test_pairs) {
             auto v = PermutationGenerator(n, r).all_perms();
 
             REQUIRE(v.size() == permSize(n, r));
diff --git a/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp b/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
index 6f3f5bdd4f..99f5494128 100644
--- a/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
+++ b/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
@@ -95,16 +95,6 @@ class DummyImplementation {
         allGateOps<GeneratorOperation>();
     constexpr static std::string_view name = "Dummy";
 
-    template <class PrecisionT>
-    static void applyMatrix(std::complex<PrecisionT> *arr, size_t num_qubits,
-                            const std::complex<PrecisionT> *matrix,
-                            const std::vector<size_t> &wires, bool inverse) {
-        static_cast<void>(arr);
-        static_cast<void>(num_qubits);
-        static_cast<void>(matrix);
-        static_cast<void>(inverse);
-    }
-
     PENNYLANE_TESTS_DEFINE_GATE_OP(PauliX, 0)
     PENNYLANE_TESTS_DEFINE_GATE_OP(PauliY, 0)
     PENNYLANE_TESTS_DEFINE_GATE_OP(PauliZ, 0)
diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
index 223d977c0a..0f5e3aa9f7 100644
--- a/pennylane_lightning/src/util/Memory.hpp
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -19,7 +19,35 @@
 #include "ConstantUtil.hpp"
 #include "TypeList.hpp"
 
+/* Apple clang does not support std::aligned_alloc in Mac 10.14 */
+
 namespace Pennylane {
+/**
+ * @brief Custom aligned allocate function. As appleclang does not support
+ * std::aligned_alloc in Mac OS 10.14, we use posix memalign
+ */
+inline auto alignedAlloc(uint32_t alignment, size_t bytes) -> void* {
+#if defined(__clang__) // probably AppleClang
+    void* p;
+    posix_memalign(&p, alignment, bytes);
+    return p;
+#elif  defined(_MSC_VER)
+    return _aligned_malloc(bytes, alignment);
+#else
+    return std::aligned_alloc(alignment, bytes);
+#endif
+}
+
+inline void alignedFree(void* p) {
+#if defined(__clang__)
+    return free(p);
+#elif  defined(_MSC_VER)
+    return _aligned_free(p);
+#else
+    return std::free(p);
+#endif
+}
+
 template <class T, uint32_t alignment> struct AlignedAllocator {
     static_assert(Util::constIsPerfectPowerOf2(alignment),
                   "Template parameter alignment must be power of 2.");
@@ -39,7 +67,7 @@ template <class T, uint32_t alignment> struct AlignedAllocator {
         if (size == 0) {
             return nullptr;
         }
-        void *p = std::aligned_alloc(alignment, sizeof(T) * size);
+        void *p = alignedAlloc(alignment, sizeof(T) * size);
         if (p == nullptr) {
             throw std::bad_alloc();
         }
@@ -47,8 +75,7 @@ template <class T, uint32_t alignment> struct AlignedAllocator {
     }
 
     void deallocate(T *p, [[maybe_unused]] std::size_t size) noexcept {
-        // NOLINTNEXTLINE(hicpp-no-malloc)
-        std::free(p);
+        alignedFree(p);
     }
 
     template <class U> void construct(U *ptr) { ::new ((void *)ptr) U(); }

From 338cf534711b863e4b4bbaf9c727ab543c5b138b Mon Sep 17 00:00:00 2001
From: Dev version update bot <chae-yeun@xanadu.ai>
Date: Wed, 2 Mar 2022 15:03:34 +0000
Subject: [PATCH 14/94] Auto update version

---
 pennylane_lightning/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index 50254fda3b..b362bf426f 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.22.0-dev12"
+__version__ = "0.22.0-dev13"

From 13ed29ebf8e16911effe936dc7dff52e3f382232 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 2 Mar 2022 10:12:53 -0500
Subject: [PATCH 15/94] More fix for appleclang

---
 pennylane_lightning/src/bindings/Bindings.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index 089aea48d7..0c99bd723d 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -97,7 +97,7 @@ auto getNumpyArrayAlignment(const pybind11::array &numpyArray)
     return getMemoryModel(numpyArray.request().ptr);
 }
 
-void deallocateArray(void *ptr) { std::free(ptr); }
+void deallocateArray(void *ptr) { alignedFree(ptr); }
 
 /**
  * @brief We return an numpy array whose underlying data is allocated by
@@ -110,20 +110,20 @@ auto allocateAlignedArray(size_t size, pybind11::dtype dt) -> pybind11::array {
     auto memory_model = bestCPUMemoryModel();
 
     if (dt.is(pybind11::dtype::of<float>())) {
-        void *ptr = std::aligned_alloc(getAlignment<float>(memory_model),
+        void *ptr = alignedAlloc(getAlignment<float>(memory_model),
                                        sizeof(float) * size);
         auto capsule = pybind11::capsule(ptr, &deallocateArray);
 
         return pybind11::array{dt, {size}, {sizeof(float)}, ptr, capsule};
     } else if (dt.is(pybind11::dtype::of<double>())) {
-        void *ptr = std::aligned_alloc(getAlignment<double>(memory_model),
+        void *ptr = alignedAlloc(getAlignment<double>(memory_model),
                                        sizeof(double) * size);
         auto capsule = pybind11::capsule(ptr, &deallocateArray);
 
         return pybind11::array{dt, {size}, {sizeof(double)}, ptr, capsule};
     } else if (dt.is(pybind11::dtype::of<std::complex<float>>())) {
         void *ptr =
-            std::aligned_alloc(getAlignment<std::complex<float>>(memory_model),
+            alignedAlloc(getAlignment<std::complex<float>>(memory_model),
                                sizeof(std::complex<float>) * size);
         auto capsule = pybind11::capsule(ptr, &deallocateArray);
 
@@ -131,7 +131,7 @@ auto allocateAlignedArray(size_t size, pybind11::dtype dt) -> pybind11::array {
             dt, {size}, {sizeof(std::complex<float>)}, ptr, capsule};
     } else if (dt.is(pybind11::dtype::of<std::complex<double>>())) {
         void *ptr =
-            std::aligned_alloc(getAlignment<std::complex<double>>(memory_model),
+            alignedAlloc(getAlignment<std::complex<double>>(memory_model),
                                sizeof(std::complex<double>) * size);
         auto capsule = pybind11::capsule(ptr, &deallocateArray);
 

From 09b471e760f21feb75a7017087d5932e0b85322a Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 2 Mar 2022 13:32:11 -0500
Subject: [PATCH 16/94] Fix aligned memory

---
 .../src/algorithms/JacobianTape.hpp           |  6 +-
 pennylane_lightning/src/bindings/Bindings.hpp |  8 +-
 .../src/simulator/CPUMemoryModel.hpp          | 29 ++----
 .../src/simulator/DynamicDispatcher.hpp       |  3 +-
 .../src/simulator/StateVectorManagedCPU.hpp   | 45 +++------
 pennylane_lightning/src/tests/TestHelpers.hpp | 19 ++--
 .../tests/Test_GateImplementations_Param.cpp  | 93 ++++++++++---------
 .../src/tests/Test_Internal.cpp               |  4 +-
 pennylane_lightning/src/util/Memory.hpp       | 59 ++++++------
 9 files changed, 126 insertions(+), 140 deletions(-)

diff --git a/pennylane_lightning/src/algorithms/JacobianTape.hpp b/pennylane_lightning/src/algorithms/JacobianTape.hpp
index ca7d0ac6f7..8a33e89f02 100644
--- a/pennylane_lightning/src/algorithms/JacobianTape.hpp
+++ b/pennylane_lightning/src/algorithms/JacobianTape.hpp
@@ -86,9 +86,9 @@ template <class T = double> class ObsDatum {
     }
 
   private:
-    const std::vector<std::string> obs_name_;
-    const std::vector<param_var_t> obs_params_;
-    const std::vector<std::vector<size_t>> obs_wires_;
+    const std::vector<std::string> obs_name_{};
+    const std::vector<param_var_t> obs_params_{};
+    const std::vector<std::vector<size_t>> obs_wires_{};
 };
 
 /**
diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index 0c99bd723d..9e2fe54114 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -111,20 +111,20 @@ auto allocateAlignedArray(size_t size, pybind11::dtype dt) -> pybind11::array {
 
     if (dt.is(pybind11::dtype::of<float>())) {
         void *ptr = alignedAlloc(getAlignment<float>(memory_model),
-                                       sizeof(float) * size);
+                                 sizeof(float) * size);
         auto capsule = pybind11::capsule(ptr, &deallocateArray);
 
         return pybind11::array{dt, {size}, {sizeof(float)}, ptr, capsule};
     } else if (dt.is(pybind11::dtype::of<double>())) {
         void *ptr = alignedAlloc(getAlignment<double>(memory_model),
-                                       sizeof(double) * size);
+                                 sizeof(double) * size);
         auto capsule = pybind11::capsule(ptr, &deallocateArray);
 
         return pybind11::array{dt, {size}, {sizeof(double)}, ptr, capsule};
     } else if (dt.is(pybind11::dtype::of<std::complex<float>>())) {
         void *ptr =
             alignedAlloc(getAlignment<std::complex<float>>(memory_model),
-                               sizeof(std::complex<float>) * size);
+                         sizeof(std::complex<float>) * size);
         auto capsule = pybind11::capsule(ptr, &deallocateArray);
 
         return pybind11::array{
@@ -132,7 +132,7 @@ auto allocateAlignedArray(size_t size, pybind11::dtype dt) -> pybind11::array {
     } else if (dt.is(pybind11::dtype::of<std::complex<double>>())) {
         void *ptr =
             alignedAlloc(getAlignment<std::complex<double>>(memory_model),
-                               sizeof(std::complex<double>) * size);
+                         sizeof(std::complex<double>) * size);
         auto capsule = pybind11::capsule(ptr, &deallocateArray);
 
         return pybind11::array{
diff --git a/pennylane_lightning/src/simulator/CPUMemoryModel.hpp b/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
index 97b60cf7f4..b6228401a0 100644
--- a/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
+++ b/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
@@ -18,6 +18,7 @@
  */
 #pragma once
 #include "Macros.hpp"
+#include "Memory.hpp"
 
 #include <cstdint>
 #include <memory>
@@ -52,11 +53,11 @@ constexpr inline auto bestCPUMemoryModel() -> CPUMemoryModel {
     return CPUMemoryModel::Unaligned;
 }
 
-template <class PrecisionT>
-constexpr inline auto getAlignment(CPUMemoryModel memory_model) -> size_t {
+template <class T>
+constexpr inline auto getAlignment(CPUMemoryModel memory_model) -> uint32_t {
     switch (memory_model) {
     case CPUMemoryModel::Unaligned:
-        return alignof(PrecisionT);
+        return alignof(T);
     case CPUMemoryModel::Aligned256:
         return 32U;
     case CPUMemoryModel::Aligned512:
@@ -67,23 +68,9 @@ constexpr inline auto getAlignment(CPUMemoryModel memory_model) -> size_t {
     PL_UNREACHABLE;
 }
 
-template <typename T>
-auto allocateMemory(CPUMemoryModel memory_model, size_t size)
-    // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
-    -> std::unique_ptr<T[]> {
-    switch (memory_model) {
-    case CPUMemoryModel::Unaligned:
-        // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
-        return std::unique_ptr<T[]>{new T[size]};
-    case CPUMemoryModel::Aligned256:
-        // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
-        return std::unique_ptr<T[]>{new (std::align_val_t(32)) T[size]};
-    case CPUMemoryModel::Aligned512:
-        // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
-        return std::unique_ptr<T[]>{new (std::align_val_t(64)) T[size]};
-    default:
-        break;
-    }
-    PL_UNREACHABLE;
+template <class T>
+constexpr auto getAllocator(CPUMemoryModel memory_model)
+    -> AlignedAllocator<T> {
+    return AlignedAllocator<T>{getAlignment<T>(memory_model)};
 }
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index f25bddf95a..71c92bf772 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -323,7 +323,8 @@ template <typename PrecisionT> class DynamicDispatcher {
                 "The size of matrix does not match with the given "
                 "number of wires");
         }
-        applyMatrix(kernel, data, mat_op, num_qubits, matrix.data(), wires, inverse);
+        applyMatrix(kernel, data, mat_op, num_qubits, matrix.data(), wires,
+                    inverse);
     }
 
     /**
diff --git a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
index dd5d817db7..ad21a48134 100644
--- a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
@@ -26,9 +26,6 @@ namespace Pennylane {
  * @brief StateVector class where data resides in CPU memory. Memory ownership
  * resides within class.
  *
- * We currently use std::unique_ptr to C-style array as we want to choose
- * allocator in runtime. This is impossible with std::vector.
- *
  * @tparam PrecisionT
  */
 template <class PrecisionT = double>
@@ -41,17 +38,15 @@ class StateVectorManagedCPU
     using BaseType = StateVectorCPU<PrecisionT, StateVectorManagedCPU>;
 
     // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
-    std::unique_ptr<ComplexPrecisionT[]> data_;
+    std::vector<ComplexPrecisionT, AlignedAllocator<ComplexPrecisionT>> data_;
 
   public:
     explicit StateVectorManagedCPU(
         size_t num_qubits, Threading threading = bestThreading(),
         CPUMemoryModel memory_model = bestCPUMemoryModel())
-        : BaseType{num_qubits, threading, memory_model} {
-        size_t length = BaseType::getLength();
-        data_ = allocateMemory<ComplexPrecisionT>(memory_model, length);
-        std::fill(data_.get(), data_.get() + length,
-                  ComplexPrecisionT{0.0, 0.0});
+        : BaseType{num_qubits, threading, memory_model},
+          data_{Util::exp2(num_qubits), ComplexPrecisionT{0.0, 0.0},
+                getAllocator<ComplexPrecisionT>(this->memory_model_)} {
         data_[0] = {1, 0};
     }
 
@@ -59,24 +54,19 @@ class StateVectorManagedCPU
     explicit StateVectorManagedCPU(
         const StateVectorCPU<PrecisionT, OtherDerived> &other)
         : BaseType(other.getNumQubits(), other.threading(),
-                   other.memoryModel()) {
-        size_t length = BaseType::getLength();
-        data_ = allocateMemory<ComplexPrecisionT>(other.memoryModel(), length);
-
-        std::copy(other.getData(), other.getData() + length, data_.get());
-    }
+                   other.memoryModel()),
+          data_{other.getData(), other.getData() + other.getLength(),
+                getAllocator<ComplexPrecisionT>(this->memory_model_)} {}
 
     StateVectorManagedCPU(const ComplexPrecisionT *other_data,
                           size_t other_size,
                           Threading threading = bestThreading(),
                           CPUMemoryModel memory_model = bestCPUMemoryModel())
-        : BaseType(Util::log2PerfectPower(other_size), threading,
-                   memory_model) {
+        : BaseType(Util::log2PerfectPower(other_size), threading, memory_model),
+          data_{other_data, other_data + other_size,
+                getAllocator<ComplexPrecisionT>(this->memory_model_)} {
         PL_ABORT_IF_NOT(Util::isPerfectPowerOf2(other_size),
                         "The size of provided data must be a power of 2.");
-
-        data_ = allocateMemory<ComplexPrecisionT>(memory_model, other_size);
-        updateData(other_data);
     }
 
     // Clang-tidy gives false positive for delegating constructor
@@ -89,24 +79,19 @@ class StateVectorManagedCPU
         : StateVectorManagedCPU(rhs.data(), rhs.size(), threading,
                                 memory_model) {}
 
-    StateVectorManagedCPU(const StateVectorManagedCPU &rhs) : BaseType(rhs) {
-        size_t length = BaseType::getLength();
-        data_ = allocateMemory<ComplexPrecisionT>(rhs.memory_model_, length);
-        std::copy(rhs.getData(), rhs.getData() + length, data_.get());
-    }
-
+    StateVectorManagedCPU(const StateVectorManagedCPU &rhs) = default;
     StateVectorManagedCPU(StateVectorManagedCPU &&) noexcept = default;
 
-    StateVectorManagedCPU &operator=(const StateVectorManagedCPU &) = delete;
+    StateVectorManagedCPU &operator=(const StateVectorManagedCPU &) = default;
     StateVectorManagedCPU &
     operator=(StateVectorManagedCPU &&) noexcept = default;
 
     ~StateVectorManagedCPU() = default;
 
-    [[nodiscard]] auto getData() -> ComplexPrecisionT * { return data_.get(); }
+    [[nodiscard]] auto getData() -> ComplexPrecisionT * { return data_.data(); }
 
     [[nodiscard]] auto getData() const -> const ComplexPrecisionT * {
-        return data_.get();
+        return data_.data();
     }
 
     /**
@@ -115,7 +100,7 @@ class StateVectorManagedCPU
      * @param new_data std::vector contains data.
      */
     void updateData(const ComplexPrecisionT *data) {
-        std::copy(data, data + BaseType::getLength(), data_.get());
+        std::copy(data, data + BaseType::getLength(), data_.data());
     }
 };
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp
index 457d1eebd2..facb6372c7 100644
--- a/pennylane_lightning/src/tests/TestHelpers.hpp
+++ b/pennylane_lightning/src/tests/TestHelpers.hpp
@@ -150,9 +150,10 @@ isApproxEqual(const Data_t &data1, const Data_t &data2,
 }
 
 template <typename T>
-using TestVector = std::vector<
-    T,
-    PLAllocator<T, Util::common_alignment_v<remove_complex_t<T>, TestKernels>>>;
+constexpr static auto test_allocator =
+    AlignedAllocator<T>{Util::common_alignment_v<T, TestKernels>};
+
+template <typename T> using TestVector = std::vector<T, AlignedAllocator<T>>;
 
 /**
  * @brief Multiplies every value in a dataset by a given complex scalar value.
@@ -192,7 +193,8 @@ void scaleVector(std::vector<std::complex<Data_t>, Alloc> &data,
 template <typename PrecisionT>
 auto createZeroState(size_t num_qubits)
     -> TestVector<std::complex<PrecisionT>> {
-    TestVector<std::complex<PrecisionT>> res(1U << num_qubits, {0.0, 0.0});
+    TestVector<std::complex<PrecisionT>> res(
+        1U << num_qubits, {0.0, 0.0}, test_allocator<std::complex<PrecisionT>>);
     res[0] = std::complex<PrecisionT>{1.0, 0.0};
     return res;
 }
@@ -203,7 +205,8 @@ auto createZeroState(size_t num_qubits)
 template <typename PrecisionT>
 auto createPlusState(size_t num_qubits)
     -> TestVector<std::complex<PrecisionT>> {
-    TestVector<std::complex<PrecisionT>> res(1U << num_qubits, {1.0, 0.0});
+    TestVector<std::complex<PrecisionT>> res(
+        1U << num_qubits, {1.0, 0.0}, test_allocator<std::complex<PrecisionT>>);
     for (auto &elt : res) {
         elt /= std::sqrt(1U << num_qubits);
     }
@@ -218,7 +221,8 @@ auto createRandomState(RandomEngine &re, size_t num_qubits)
     -> TestVector<std::complex<PrecisionT>> {
     using Util::squaredNorm;
 
-    TestVector<std::complex<PrecisionT>> res(1U << num_qubits, {0.0, 0.0});
+    TestVector<std::complex<PrecisionT>> res(
+        1U << num_qubits, {0.0, 0.0}, test_allocator<std::complex<PrecisionT>>);
     std::uniform_real_distribution<PrecisionT> dist;
     for (size_t idx = 0; idx < (1U << num_qubits); idx++) {
         res[idx] = {dist(re), dist(re)};
@@ -238,7 +242,8 @@ template <typename PrecisionT>
 auto createProductState(std::string_view str)
     -> TestVector<std::complex<PrecisionT>> {
     using Pennylane::Util::INVSQRT2;
-    TestVector<std::complex<PrecisionT>> st;
+    TestVector<std::complex<PrecisionT>> st(
+        test_allocator<std::complex<PrecisionT>>);
     st.resize(1U << str.length());
 
     std::vector<PrecisionT> zero{1.0, 0.0};
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
index 74e6f3a767..530f7916fb 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
@@ -164,8 +164,9 @@ void testApplyRY() {
             {0.10575112905629831, -0.47593196040758534},
             {-0.8711876098966215, -0.0577721051072477}}};
 
-    const TestVector<ComplexPrecisionT> init_state{{0.8775825618903728, 0.0},
-                                                   {0.0, -0.47942553860420306}};
+    const TestVector<ComplexPrecisionT> init_state{
+        {{0.8775825618903728, 0.0}, {0.0, -0.47942553860420306}},
+        test_allocator<ComplexPrecisionT>};
     DYNAMIC_SECTION(GateImplementation::name
                     << ", RY - " << PrecisionToName<PrecisionT>::value) {
         for (size_t index = 0; index < angles.size(); index++) {
@@ -373,15 +374,17 @@ void testApplyIsingXX() {
                     << PrecisionToName<PrecisionT>::value) {
         const size_t num_qubits = 3;
         const auto ini_st = TestVector<ComplexPrecisionT>{
-            ComplexPrecisionT{0.125681356503, 0.252712197380},
-            ComplexPrecisionT{0.262591068130, 0.370189000494},
-            ComplexPrecisionT{0.129300299863, 0.371057794075},
-            ComplexPrecisionT{0.392248682814, 0.195795523118},
-            ComplexPrecisionT{0.303908059240, 0.082981563244},
-            ComplexPrecisionT{0.189140284321, 0.179512645957},
-            ComplexPrecisionT{0.173146612336, 0.092249594834},
-            ComplexPrecisionT{0.298857179897, 0.269627836165},
-        };
+            {
+                ComplexPrecisionT{0.125681356503, 0.252712197380},
+                ComplexPrecisionT{0.262591068130, 0.370189000494},
+                ComplexPrecisionT{0.129300299863, 0.371057794075},
+                ComplexPrecisionT{0.392248682814, 0.195795523118},
+                ComplexPrecisionT{0.303908059240, 0.082981563244},
+                ComplexPrecisionT{0.189140284321, 0.179512645957},
+                ComplexPrecisionT{0.173146612336, 0.092249594834},
+                ComplexPrecisionT{0.298857179897, 0.269627836165},
+            },
+            test_allocator<ComplexPrecisionT>};
         const std::vector<size_t> wires = {0, 2};
         const ParamT angle = 0.267030328057308;
         std::vector<ComplexPrecisionT> expected{
@@ -507,23 +510,23 @@ void testApplyIsingYY() {
         const size_t num_qubits = 4;
 
         const auto ini_st = TestVector<ComplexPrecisionT>{
-            ComplexPrecisionT{0.276522701942, 0.192601873155},
-            ComplexPrecisionT{0.035951282872, 0.224882549474},
-            ComplexPrecisionT{0.142578003191, 0.016769549184},
-            ComplexPrecisionT{0.207510965432, 0.068085008177},
-            ComplexPrecisionT{0.231177902264, 0.039974505646},
-            ComplexPrecisionT{0.038587049391, 0.058503643276},
-            ComplexPrecisionT{0.023121176451, 0.294843178966},
-            ComplexPrecisionT{0.297936734810, 0.061981734524},
-            ComplexPrecisionT{0.140961289031, 0.061129422308},
-            ComplexPrecisionT{0.204531438234, 0.159178277448},
-            ComplexPrecisionT{0.143828437747, 0.031972463787},
-            ComplexPrecisionT{0.291528706380, 0.138875986482},
-            ComplexPrecisionT{0.297088897520, 0.179914971203},
-            ComplexPrecisionT{0.032991360504, 0.024025500927},
-            ComplexPrecisionT{0.121553926676, 0.263606060346},
-            ComplexPrecisionT{0.177173454285, 0.267447421480},
-        };
+            {ComplexPrecisionT{0.276522701942, 0.192601873155},
+             ComplexPrecisionT{0.035951282872, 0.224882549474},
+             ComplexPrecisionT{0.142578003191, 0.016769549184},
+             ComplexPrecisionT{0.207510965432, 0.068085008177},
+             ComplexPrecisionT{0.231177902264, 0.039974505646},
+             ComplexPrecisionT{0.038587049391, 0.058503643276},
+             ComplexPrecisionT{0.023121176451, 0.294843178966},
+             ComplexPrecisionT{0.297936734810, 0.061981734524},
+             ComplexPrecisionT{0.140961289031, 0.061129422308},
+             ComplexPrecisionT{0.204531438234, 0.159178277448},
+             ComplexPrecisionT{0.143828437747, 0.031972463787},
+             ComplexPrecisionT{0.291528706380, 0.138875986482},
+             ComplexPrecisionT{0.297088897520, 0.179914971203},
+             ComplexPrecisionT{0.032991360504, 0.024025500927},
+             ComplexPrecisionT{0.121553926676, 0.263606060346},
+             ComplexPrecisionT{0.177173454285, 0.267447421480}},
+            test_allocator<ComplexPrecisionT>};
 
         const std::vector<size_t> wires = {0, 1};
         const ParamT angle = 0.312;
@@ -661,23 +664,23 @@ void testApplyIsingZZ() {
         const size_t num_qubits = 4;
 
         TestVector<ComplexPrecisionT> ini_st{
-            ComplexPrecisionT{0.267462841882, 0.010768564798},
-            ComplexPrecisionT{0.228575129706, 0.010564590956},
-            ComplexPrecisionT{0.099492749900, 0.260849823392},
-            ComplexPrecisionT{0.093690204310, 0.189847108173},
-            ComplexPrecisionT{0.033390732374, 0.203836830144},
-            ComplexPrecisionT{0.226979395737, 0.081852150975},
-            ComplexPrecisionT{0.031235505729, 0.176933497281},
-            ComplexPrecisionT{0.294287602843, 0.145156781198},
-            ComplexPrecisionT{0.152742706049, 0.111628061129},
-            ComplexPrecisionT{0.012553863703, 0.120027860480},
-            ComplexPrecisionT{0.237156555364, 0.154658769755},
-            ComplexPrecisionT{0.117001120872, 0.228059505033},
-            ComplexPrecisionT{0.041495873225, 0.065934827444},
-            ComplexPrecisionT{0.089653239407, 0.221581340372},
-            ComplexPrecisionT{0.217892322429, 0.291261296999},
-            ComplexPrecisionT{0.292993251871, 0.186570798697},
-        };
+            {ComplexPrecisionT{0.267462841882, 0.010768564798},
+             ComplexPrecisionT{0.228575129706, 0.010564590956},
+             ComplexPrecisionT{0.099492749900, 0.260849823392},
+             ComplexPrecisionT{0.093690204310, 0.189847108173},
+             ComplexPrecisionT{0.033390732374, 0.203836830144},
+             ComplexPrecisionT{0.226979395737, 0.081852150975},
+             ComplexPrecisionT{0.031235505729, 0.176933497281},
+             ComplexPrecisionT{0.294287602843, 0.145156781198},
+             ComplexPrecisionT{0.152742706049, 0.111628061129},
+             ComplexPrecisionT{0.012553863703, 0.120027860480},
+             ComplexPrecisionT{0.237156555364, 0.154658769755},
+             ComplexPrecisionT{0.117001120872, 0.228059505033},
+             ComplexPrecisionT{0.041495873225, 0.065934827444},
+             ComplexPrecisionT{0.089653239407, 0.221581340372},
+             ComplexPrecisionT{0.217892322429, 0.291261296999},
+             ComplexPrecisionT{0.292993251871, 0.186570798697}},
+            test_allocator<ComplexPrecisionT>};
 
         const std::vector<size_t> wires = {0, 1};
         const ParamT angle = 0.312;
diff --git a/pennylane_lightning/src/tests/Test_Internal.cpp b/pennylane_lightning/src/tests/Test_Internal.cpp
index 99233b89d6..284bf9a77c 100644
--- a/pennylane_lightning/src/tests/Test_Internal.cpp
+++ b/pennylane_lightning/src/tests/Test_Internal.cpp
@@ -114,7 +114,7 @@ TEST_CASE("createAllWires", "[Test_Internal]") {
             {4, 2},  {8, 3},  {12, 1}, {12, 2}, {12, 3},  {12, 4},  {12, 5},
             {12, 6}, {12, 7}, {12, 8}, {12, 9}, {12, 10}, {12, 11}, {12, 12}};
 
-        for (const auto& [n, r] : test_pairs) {
+        for (const auto &[n, r] : test_pairs) {
             std::vector<std::set<size_t>> vec;
             auto v = CombinationGenerator(n, r).all_perms();
 
@@ -139,7 +139,7 @@ TEST_CASE("createAllWires", "[Test_Internal]") {
         const std::vector<std::pair<size_t, size_t>> test_pairs{
             {4, 2}, {8, 3}, {12, 1}, {12, 2}, {12, 3}, {12, 4}, {12, 5}};
 
-        for (const auto& [n, r] : test_pairs) {
+        for (const auto &[n, r] : test_pairs) {
             auto v = PermutationGenerator(n, r).all_perms();
 
             REQUIRE(v.size() == permSize(n, r));
diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
index 0f5e3aa9f7..235a581a34 100644
--- a/pennylane_lightning/src/util/Memory.hpp
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -10,13 +10,15 @@
 // limitations under the License.
 #pragma once
 
+#include <cassert>
+#include <cstddef>
 #include <cstdint>
 #include <cstdlib>
 #include <limits>
 #include <memory>
 #include <new>
 
-#include "ConstantUtil.hpp"
+#include "BitUtil.hpp"
 #include "TypeList.hpp"
 
 /* Apple clang does not support std::aligned_alloc in Mac 10.14 */
@@ -24,50 +26,58 @@
 namespace Pennylane {
 /**
  * @brief Custom aligned allocate function. As appleclang does not support
- * std::aligned_alloc in Mac OS 10.14, we use posix memalign
+ * std::aligned_alloc in Mac OS 10.14, we use posix_memalign function.
+ *
+ * Note that alignment must be larger than max_align_t.
  */
-inline auto alignedAlloc(uint32_t alignment, size_t bytes) -> void* {
+inline auto alignedAlloc(uint32_t alignment, size_t bytes) -> void * {
 #if defined(__clang__) // probably AppleClang
-    void* p;
+    void *p = nullptr; // stays null (not garbage) if posix_memalign fails
     posix_memalign(&p, alignment, bytes);
     return p;
-#elif  defined(_MSC_VER)
+#elif defined(_MSC_VER)
     return _aligned_malloc(bytes, alignment);
 #else
     return std::aligned_alloc(alignment, bytes);
 #endif
 }
 
-inline void alignedFree(void* p) {
+inline void alignedFree(void *p) {
 #if defined(__clang__)
-    return free(p);
-#elif  defined(_MSC_VER)
+    return ::free(p); // NOLINT(hicpp-no-malloc)
+#elif defined(_MSC_VER)
     return _aligned_free(p);
 #else
     return std::free(p);
 #endif
 }
 
-template <class T, uint32_t alignment> struct AlignedAllocator {
-    static_assert(Util::constIsPerfectPowerOf2(alignment),
-                  "Template parameter alignment must be power of 2.");
+template <class T> struct AlignedAllocator {
+    uint32_t alignment_;
     using value_type = T;
 
-    AlignedAllocator() = default;
+    constexpr explicit AlignedAllocator(uint32_t alignment)
+        : alignment_{alignment} {
+        // assert(Util::isPerfectPowerOf2(alignment));
+    }
 
-    template <class U> struct rebind {
-        using other = AlignedAllocator<U, alignment>;
-    };
+    template <class U> struct rebind { using other = AlignedAllocator<U>; };
 
     template <typename U>
     explicit constexpr AlignedAllocator(
-        [[maybe_unused]] const AlignedAllocator<U, alignment> &rhs) noexcept {}
+        [[maybe_unused]] const AlignedAllocator<U> &rhs) noexcept
+        : alignment_{rhs.alignment_} {}
 
     [[nodiscard]] T *allocate(std::size_t size) {
         if (size == 0) {
             return nullptr;
         }
-        void *p = alignedAlloc(alignment, sizeof(T) * size);
+        void *p;
+        if (alignment_ > alignof(std::max_align_t)) {
+            p = alignedAlloc(alignment_, sizeof(T) * size);
+        } else {
+            p = malloc(sizeof(T) * size);
+        }
         if (p == nullptr) {
             throw std::bad_alloc();
         }
@@ -86,15 +96,15 @@ template <class T, uint32_t alignment> struct AlignedAllocator {
     }
 };
 
-template <class T, class U, uint32_t alignment>
-bool operator==([[maybe_unused]] const AlignedAllocator<T, alignment> &lhs,
-                [[maybe_unused]] const AlignedAllocator<U, alignment> &rhs) {
+template <class T, class U>
+bool operator==([[maybe_unused]] const AlignedAllocator<T> &lhs,
+                [[maybe_unused]] const AlignedAllocator<U> &rhs) {
     return true;
 }
 
 template <class T, class U, uint32_t alignment>
-bool operator!=([[maybe_unused]] const AlignedAllocator<T, alignment> &lhs,
-                [[maybe_unused]] const AlignedAllocator<U, alignment> &rhs) {
+bool operator!=([[maybe_unused]] const AlignedAllocator<T> &lhs,
+                [[maybe_unused]] const AlignedAllocator<U> &rhs) {
     return false;
 }
 
@@ -117,9 +127,4 @@ template <> struct commonAlignmentHelper<void> {
 template <typename TypeList>
 [[maybe_unused]] constexpr static size_t common_alignment =
     commonAlignmentHelper<TypeList>::value;
-
-template <class T, uint32_t alignment>
-using PLAllocator = std::conditional_t<alignment == 4, std::allocator<T>,
-                                       AlignedAllocator<T, alignment>>;
-
 } // namespace Pennylane

From 30c7d42fb5325c9086429c21ebe1f48d23582f82 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 2 Mar 2022 13:57:13 -0500
Subject: [PATCH 17/94] Clean-up, fix ld warnings for appleclang

---
 CMakeLists.txt                                |   2 +-
 pennylane_lightning/src/bindings/Bindings.hpp | 121 ------------------
 pennylane_lightning/src/gates/Constant.hpp    |  65 ----------
 3 files changed, 1 insertion(+), 187 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 89f665e31e..5e143fd62d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,7 +42,6 @@ option(ENABLE_BLAS "Enable BLAS" OFF)
 option(BUILD_TESTS "Build cpp tests" OFF)
 option(BUILD_EXAMPLES "Build cpp examples" OFF)
 
-
 # Process compile options
 include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/process_options.cmake")
 
@@ -57,6 +56,7 @@ FetchContent_MakeAvailable(pybind11)
 
 # All CMakeLists.txt in subdirectories use pennylane_lightning_compile_options and pennylane_lightning_external_libs
 add_subdirectory(pennylane_lightning/src)
+set(CMAKE_CXX_VISIBILITY_PRESET hidden)
 
 #####################################################
 # Maintain for dependent external package development
diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index 9e2fe54114..9a3a4aed1b 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -163,127 +163,6 @@ void apply(pybind11::array_t<std::complex<PrecisionT>> &stateNumpyArray,
     state.applyOperations(ops, wires, inverse, params);
 }
 
-/**
- * @brief Return a specific lambda function for the given kernel and gate
- * operation
- *
- * We do not expect template parameters kernel and gate_op can be function
- * parameters as we want the lambda function to be a stateless.
- *
- * @tparam PrecisionT Floating point precision of underlying statevector data
- * @tparam ParamT Floating point type of gate parameters
- * @tparam kernel Kernel to register
- * @tparam gate_op Gate operation
- */
-/*
-template <class PrecisionT, class ParamT, Gates::KernelType kernel,
-          Gates::GateOperation gate_op>
-constexpr auto getLambdaForKernelGateOp() {
-    namespace py = pybind11;
-    using namespace Pennylane::Gates;
-    using GateImplementation = SelectKernel<kernel>;
-
-    static_assert(array_has_elt(GateImplementation::implemented_gates, gate_op),
-                  "The operator to register must be implemented.");
-
-    if constexpr (gate_op != GateOperation::Matrix) {
-        return
-            [](StateVectorRawCPU<PrecisionT> &st, const std::vector<size_t>
-&wires, bool inverse, const std::vector<ParamT> &params) { constexpr auto
-func_ptr = GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplementation,
-gate_op>::value; callGateOps(func_ptr, st.getData(), st.getNumQubits(), wires,
-                            inverse, params);
-            };
-    } else {
-        return [](StateVectorRawCPU<PrecisionT> &st,
-                  const py::array_t<std::complex<PrecisionT>,
-                                    py::array::c_style | py::array::forcecast>
-                      &matrix,
-                  const std::vector<size_t> &wires, bool inverse = false) {
-            st.template applyMatrix_<kernel>(
-                static_cast<std::complex<PrecisionT> *>(matrix.request().ptr),
-                wires, inverse);
-        };
-    }
-};
-*/
-/*
-/// @cond DEV
-template <class PrecisionT, class ParamT, Gates::KernelType kernel,
-          size_t gate_idx>
-constexpr auto getGateOpLambdaPairsIter() {
-    using Pennylane::Gates::SelectKernel;
-    if constexpr (gate_idx < SelectKernel<kernel>::implemented_gates.size()) {
-        constexpr auto gate_op =
-            SelectKernel<kernel>::implemented_gates[gate_idx];
-        return prepend_to_tuple(
-            std::pair{gate_op, getLambdaForKernelGateOp<PrecisionT, ParamT,
-                                                        kernel, gate_op>()},
-            getGateOpLambdaPairsIter<PrecisionT, ParamT, kernel,
-                                     gate_idx + 1>());
-    } else {
-        return std::tuple{};
-    }
-}
-/// @endcond
-*/
-/**
- * @brief Create a tuple of lambda functions to bind
- *
- * @tparam PrecisionT Floating point precision of underlying statevector data
- * @tparam ParamT Floating point type of gate parameters
- * @tparam kernel Kernel to register
- */
-/*
-template <class PrecisionT, class ParamT, Gates::KernelType kernel>
-constexpr auto getGateOpLambdaPairs() {
-    return getGateOpLambdaPairsIter<PrecisionT, ParamT, kernel, 0>();
-}
-*/
-
-/**
- * @brief For given kernel, register all implemented gate operations and apply
- * matrix.
- *
- * @tparam PrecisionT Floating point precision of underlying statevector data
- * @tparam ParamT Floating point type of gate parameters
- * @tparam Kernel Kernel to register
- * @tparam PyClass Pybind11 class type
- */
-/*
-template <class PrecisionT, class ParamT, class PyClass>
-void registerImplementedGatesForKernel(PyClass &pyclass) {
-    using namespace Pennylane::Gates;
-
-    auto registerToPyclass =
-        [&pyclass](auto &&gate_op_lambda_pair) -> GateOperation {
-        const auto &[gate_op, func] = gate_op_lambda_pair;
-        if (gate_op == GateOperation::Matrix) {
-            const std::string name = "applyMatrix_" + kernel_name;
-            const std::string doc = "Apply a given matrix to wires.";
-            pyclass.def(name.c_str(), func, doc.c_str());
-        } else {
-            const auto gate_name =
-                std::string(lookup(Constant::gate_names, gate_op));
-            const std::string doc = "Apply the " + gate_name + " gate.";
-            auto func = [&gate_name](StateVectorManagedCPU<PrecisionT>& sv,
-                                     const std::vector<size_t> &wires,
-                                     bool inverse,
-                                     const std::vector<ParamT> &params) {
-                sv.applyOperation(gate_name, wires, inverse, params);
-            }
-            pyclass.def(name.c_str(), , doc.c_str());
-        }
-        return gate_op;
-    };
-
-    [[maybe_unused]] const auto registerd_gate_ops = std::apply(
-        [&registerToPyclass](auto... elt) {
-            return std::make_tuple(registerToPyclass(elt)...);
-        },
-        gate_op_lambda_pairs);
-}
-*/
 /// @cond DEV
 template <class PrecisionT, class ParamT, class SVType, class PyClass>
 void registerGatesForStateVector(PyClass &pyclass) {
diff --git a/pennylane_lightning/src/gates/Constant.hpp b/pennylane_lightning/src/gates/Constant.hpp
index 62f0859829..00c061cc43 100644
--- a/pennylane_lightning/src/gates/Constant.hpp
+++ b/pennylane_lightning/src/gates/Constant.hpp
@@ -205,69 +205,4 @@ namespace Pennylane::Gates::Constant {
     std::pair<GateOperation, size_t>{GateOperation::CSWAP, 0},
     std::pair<GateOperation, size_t>{GateOperation::MultiRZ, 1},
 };
-
-/**
- *
- * @brief Define which kernel to use for each gate operation.
- *
- * @rst
- * Check
- * `this repository
- * <https://github.com/PennyLaneAI/pennylane-lightning-compare-kernels>`_ to see
- * the benchmark results for each gate
- * @endrst
- *
- * This value is used for:
- * 1. StateVector apply##GATE_NAME methods. The kernel function is statically
- * binded to the given kernel and cannot be modified.
- * 2. Default kernel functions for DynamicDispatcher. The kernel function is
- * dynamically binded and can be changed using DynamicDispatcher singleton
- * class.
- * 3. For the Python binding.
- */
-[[maybe_unused]] constexpr std::array default_kernel_for_gates = {
-    std::pair{GateOperation::PauliX, KernelType::LM},
-    std::pair{GateOperation::PauliY, KernelType::LM},
-    std::pair{GateOperation::PauliZ, KernelType::LM},
-    std::pair{GateOperation::Hadamard, KernelType::PI},
-    std::pair{GateOperation::S, KernelType::LM},
-    std::pair{GateOperation::T, KernelType::LM},
-    std::pair{GateOperation::RX, KernelType::PI},
-    std::pair{GateOperation::RY, KernelType::PI},
-    std::pair{GateOperation::RZ, KernelType::LM},
-    std::pair{GateOperation::PhaseShift, KernelType::LM},
-    std::pair{GateOperation::Rot, KernelType::LM},
-    std::pair{GateOperation::ControlledPhaseShift, KernelType::PI},
-    std::pair{GateOperation::CNOT, KernelType::LM},
-    std::pair{GateOperation::CY, KernelType::PI},
-    std::pair{GateOperation::CZ, KernelType::LM},
-    std::pair{GateOperation::SWAP, KernelType::LM},
-    std::pair{GateOperation::IsingXX, KernelType::LM},
-    std::pair{GateOperation::IsingYY, KernelType::LM},
-    std::pair{GateOperation::IsingZZ, KernelType::LM},
-    std::pair{GateOperation::CRX, KernelType::LM},
-    std::pair{GateOperation::CRY, KernelType::LM},
-    std::pair{GateOperation::CRZ, KernelType::LM},
-    std::pair{GateOperation::CRot, KernelType::PI},
-    std::pair{GateOperation::Toffoli, KernelType::PI},
-    std::pair{GateOperation::CSWAP, KernelType::PI},
-    std::pair{GateOperation::MultiRZ, KernelType::LM},
-};
-/**
- * @brief Define which kernel to use for each generator operation.
- */
-[[maybe_unused]] constexpr std::array default_kernel_for_generators = {
-    std::pair{GeneratorOperation::PhaseShift, KernelType::PI},
-    std::pair{GeneratorOperation::RX, KernelType::LM},
-    std::pair{GeneratorOperation::RY, KernelType::LM},
-    std::pair{GeneratorOperation::RZ, KernelType::LM},
-    std::pair{GeneratorOperation::IsingXX, KernelType::LM},
-    std::pair{GeneratorOperation::IsingYY, KernelType::LM},
-    std::pair{GeneratorOperation::IsingZZ, KernelType::LM},
-    std::pair{GeneratorOperation::CRX, KernelType::PI},
-    std::pair{GeneratorOperation::CRY, KernelType::PI},
-    std::pair{GeneratorOperation::CRZ, KernelType::PI},
-    std::pair{GeneratorOperation::ControlledPhaseShift, KernelType::PI},
-    std::pair{GeneratorOperation::MultiRZ, KernelType::LM},
-};
 } // namespace Pennylane::Gates::Constant

From f33876e96879dcf29cbb93f45b3f7cbe4b300e7e Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 2 Mar 2022 14:19:10 -0500
Subject: [PATCH 18/94] Fix

---
 .../src/simulator/StateVectorBase.hpp         | 123 ------------------
 .../src/tests/TestAvailableKernels.hpp        |  16 ---
 .../src/tests/TestConstant.hpp                |  21 ---
 3 files changed, 160 deletions(-)

diff --git a/pennylane_lightning/src/simulator/StateVectorBase.hpp b/pennylane_lightning/src/simulator/StateVectorBase.hpp
index 4853754815..1b48512164 100644
--- a/pennylane_lightning/src/simulator/StateVectorBase.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorBase.hpp
@@ -59,17 +59,6 @@
         Gates::SelectKernel<kernel>::apply##GATE_NAME(                         \
             arr, num_qubits_, wires, inverse, std::forward<Ts>(args)...);      \
     }
-
-#define PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(GATE_NAME)                   \
-    template <typename... Ts>                                                  \
-    inline void apply##GATE_NAME(const std::vector<size_t> &wires,             \
-                                 bool inverse, Ts &&...args) {                 \
-        constexpr auto kernel =                                                \
-            Gates::static_lookup<Gates::GateOperation::GATE_NAME>(             \
-                Gates::Constant::default_kernel_for_gates);                    \
-        apply##GATE_NAME##_<kernel>(wires, inverse,                            \
-                                    std::forward<Ts>(args)...);                \
-    }
 #define PENNYLANE_STATEVECTOR_DEFINE_GENERATOR(GENERATOR_NAME)                 \
     template <KernelType kernel, typename... Ts>                               \
     inline void applyGenerator##GENERATOR_NAME##_(                             \
@@ -464,12 +453,6 @@ template <class T, class Derived> class StateVectorBase {
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(PauliX)
 
-    /**
-     * @brief Apply PauliX gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(PauliX)
-
     /**
      * @brief Apply PauliY gate operation to given indices of statevector.
      *
@@ -478,12 +461,6 @@ template <class T, class Derived> class StateVectorBase {
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(PauliY)
 
-    /**
-     * @brief Apply PauliY gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(PauliY)
-
     /**
      * @brief Apply PauliZ gate operation to given indices of statevector.
      *
@@ -491,11 +468,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param inverse Take adjoint of given operation.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(PauliZ)
-    /**
-     * @brief Apply PauliZ gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(PauliZ)
 
     /**
      * @brief Apply Hadamard gate operation to given indices of statevector.
@@ -504,11 +476,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param inverse Take adjoint of given operation.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(Hadamard)
-    /**
-     * @brief Apply Hadamard gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(Hadamard)
 
     /**
      * @brief Apply S gate operation to given indices of statevector.
@@ -517,11 +484,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param inverse Take adjoint of given operation.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(S)
-    /**
-     * @brief Apply S gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(S)
 
     /**
      * @brief Apply T gate operation to given indices of statevector.
@@ -530,11 +492,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param inverse Take adjoint of given operation.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(T)
-    /**
-     * @brief Apply T gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(T)
 
     /**
      * @brief Apply RX gate operation to given indices of statevector.
@@ -544,11 +501,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param angle Rotation angle of gate.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(RX)
-    /**
-     * @brief Apply RX gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(RX)
 
     /**
      * @brief Apply RY gate operation to given indices of statevector.
@@ -558,11 +510,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param angle Rotation angle of gate.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(RY)
-    /**
-     * @brief Apply RY gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(RY)
 
     /**
      * @brief Apply RZ gate operation to given indices of statevector.
@@ -572,11 +519,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param angle Rotation angle of gate.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(RZ)
-    /**
-     * @brief Apply RZ gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(RZ)
 
     /**
      * @brief Apply phase shift gate operation to given indices of statevector.
@@ -586,11 +528,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param angle Phase shift angle.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(PhaseShift)
-    /**
-     * @brief Apply PhaseShift gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(PhaseShift)
 
     /*
      * @brief Apply Rot gate \f$RZ(\omega)RY(\theta)RZ(\phi)\f$ to given indices
@@ -603,11 +540,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param omega Gate rotation parameter \f$\omega\f$.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(Rot)
-    /**
-     * @brief Apply Rot gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(Rot)
 
     /**
      * @brief Apply controlled phase shift gate operation to given indices of
@@ -618,11 +550,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param angle Phase shift angle.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(ControlledPhaseShift)
-    /**
-     * @brief Apply controlled phase shift gate operation using a kernel given
-     * in default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(ControlledPhaseShift)
 
     /**
      * @brief Apply CNOT (CX) gate to given indices of statevector.
@@ -631,11 +558,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param inverse Take adjoint of given operation.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(CNOT)
-    /**
-     * @brief Apply CNOT gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(CNOT)
 
     /**
      * @brief Apply CY gate to given indices of statevector.
@@ -644,11 +566,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param inverse Take adjoint of given operation.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(CY)
-    /**
-     * @brief Apply CY gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(CY)
 
     /**
      * @brief Apply CZ gate to given indices of statevector.
@@ -657,11 +574,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param inverse Take adjoint of given operation.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(CZ)
-    /**
-     * @brief Apply CZ gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(CZ)
 
     /**
      * @brief Apply SWAP gate to given indices of statevector.
@@ -670,11 +582,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param inverse Take adjoint of given operation.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(SWAP)
-    /**
-     * @brief Apply SWAP gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(SWAP)
 
     /**
      * @brief Apply CRX gate to given indices of statevector.
@@ -684,11 +591,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param angle Rotation angle of gate.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(CRX)
-    /**
-     * @brief Apply CRX gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(CRX)
 
     /**
      * @brief Apply CRY gate to given indices of statevector.
@@ -698,11 +600,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param angle Rotation angle of gate.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(CRY)
-    /**
-     * @brief Apply CRY gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(CRY)
 
     /**
      * @brief Apply CRZ gate to given indices of statevector.
@@ -712,11 +609,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param angle Rotation angle of gate.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(CRZ)
-    /**
-     * @brief Apply CRZ gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(CRZ)
 
     /**
      * @brief Apply CRot gate (controlled \f$RZ(\omega)RY(\theta)RZ(\phi)\f$) to
@@ -729,11 +621,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param omega Gate rotation parameter \f$\omega\f$.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(CRot)
-    /**
-     * @brief Apply CRot gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(CRot)
 
     /**
      * @brief Apply Toffoli (CCX) gate to given indices of statevector.
@@ -742,11 +629,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param inverse Take adjoint of given operation.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(Toffoli)
-    /**
-     * @brief Apply Toffoli gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(Toffoli)
 
     /**
      * @brief Apply CSWAP gate to given indices of statevector.
@@ -755,11 +637,6 @@ template <class T, class Derived> class StateVectorBase {
      * @param inverse Take adjoint of given operation.
      */
     PENNYLANE_STATEVECTOR_DEFINE_GATE(CSWAP)
-    /**
-     * @brief Apply CSWAP gate operation using a kernel given in
-     * default_kernel_for_gates
-     */
-    PENNYLANE_STATEVECTOR_DEFINE_DEFAULT_GATE(CSWAP)
 };
 
 /**
diff --git a/pennylane_lightning/src/tests/TestAvailableKernels.hpp b/pennylane_lightning/src/tests/TestAvailableKernels.hpp
index 4170856b96..669d98ddc8 100644
--- a/pennylane_lightning/src/tests/TestAvailableKernels.hpp
+++ b/pennylane_lightning/src/tests/TestAvailableKernels.hpp
@@ -68,20 +68,4 @@ static_assert(Util::count_unique(Util::first_elts_of(kernel_id_name_pairs)) ==
 static_assert(Util::count_unique(Util::second_elts_of(kernel_id_name_pairs)) ==
                   Util::length<AvailableKernels>(),
               "Kernel names must be distinct.");
-
-/*******************************************************************************
- * Check all kernels in default_kernel_for_gates are available
- ******************************************************************************/
-
-static_assert(check_kernels_are_available(
-                  Util::second_elts_of(Constant::default_kernel_for_gates)),
-              "default_kernel_for_gates contains an unavailable kernel");
-
-/*******************************************************************************
- * Check all kernels in default_kernel_for_generators are available
- ******************************************************************************/
-
-static_assert(check_kernels_are_available(Util::second_elts_of(
-                  Constant::default_kernel_for_generators)),
-              "default_kernel_for_gates contains an unavailable kernel");
 } // namespace Pennylane::Gates
diff --git a/pennylane_lightning/src/tests/TestConstant.hpp b/pennylane_lightning/src/tests/TestConstant.hpp
index 4d20e25af2..8231fb1a97 100644
--- a/pennylane_lightning/src/tests/TestConstant.hpp
+++ b/pennylane_lightning/src/tests/TestConstant.hpp
@@ -97,25 +97,4 @@ static_assert(
     Util::count_unique(Util::first_elts_of(Constant::generator_wires)) ==
         Constant::generator_wires.size(),
     "First elements of generator_wires must be distinct.");
-
-/*******************************************************************************
- * Check default_kernel_for_gates are defined for all gates
- ******************************************************************************/
-
-static_assert(
-    Util::count_unique(
-        Util::first_elts_of(Constant::default_kernel_for_gates)) ==
-        static_cast<size_t>(GateOperation::END),
-    "Constant default_kernel_for_gates must be defined for all gates.");
-
-/*******************************************************************************
- * Check default_kernel_for_generators are defined for all generators
- ******************************************************************************/
-
-static_assert(Util::count_unique(Util::first_elts_of(
-                  Constant::default_kernel_for_generators)) ==
-                  static_cast<size_t>(GeneratorOperation::END),
-              "Constant default_kernel_for_generators must be defined for all "
-              "generators.");
-
 } // namespace Pennylane::Gates

From af6387b262c6cee53ad8b4d5312d75623e1cc08e Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae.yeun.park@gmail.com>
Date: Wed, 2 Mar 2022 22:53:05 +0000
Subject: [PATCH 19/94] Fix memory allocation mechanism; Fix some tests for MSVC

---
 pennylane_lightning/src/bindings/Bindings.hpp | 41 +++++++-----------
 .../cpu_kernels/GateImplementationsLM.hpp     |  4 +-
 .../src/tests/CreateAllWires.cpp              |  2 +-
 pennylane_lightning/src/tests/TestHelpers.hpp |  8 ++--
 .../src/tests/Test_AdjDiff.cpp                |  6 ++-
 .../src/tests/Test_DynamicDispatcher.cpp      | 43 ++++++++-----------
 ...est_GateImplementations_CompareKernels.cpp | 12 +++---
 .../Test_GateImplementations_Generator.cpp    |  6 +--
 .../Test_GateImplementations_Inverse.cpp      |  2 +-
 pennylane_lightning/src/util/Memory.hpp       |  6 ++-
 10 files changed, 60 insertions(+), 70 deletions(-)

diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index 9a3a4aed1b..039d516dde 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -97,7 +97,18 @@ auto getNumpyArrayAlignment(const pybind11::array &numpyArray)
     return getMemoryModel(numpyArray.request().ptr);
 }
 
-void deallocateArray(void *ptr) { alignedFree(ptr); }
+template <typename T>
+auto alignedNumpyArray(CPUMemoryModel memory_model, size_t size) -> pybind11::array {
+    if (getAlignment<T>(memory_model) > alignof(std::max_align_t)) {
+        void* ptr = alignedAlloc(getAlignment<T>(memory_model),
+            sizeof(T) * size);
+        auto capsule = pybind11::capsule(ptr, &alignedFree);
+        return pybind11::array{pybind11::dtype::of<T>(), {size}, {sizeof(T)}, ptr, capsule};
+    } // else
+    void* ptr = malloc(sizeof(T) * size);
+    auto capsule = pybind11::capsule(ptr, free);
+    return pybind11::array{ pybind11::dtype::of<T>(), {size}, {sizeof(T)}, ptr, capsule };
+}
 
 /**
  * @brief We return an numpy array whose underlying data is allocated by
@@ -110,33 +121,13 @@ auto allocateAlignedArray(size_t size, pybind11::dtype dt) -> pybind11::array {
     auto memory_model = bestCPUMemoryModel();
 
     if (dt.is(pybind11::dtype::of<float>())) {
-        void *ptr = alignedAlloc(getAlignment<float>(memory_model),
-                                 sizeof(float) * size);
-        auto capsule = pybind11::capsule(ptr, &deallocateArray);
-
-        return pybind11::array{dt, {size}, {sizeof(float)}, ptr, capsule};
+        return alignedNumpyArray<float>(memory_model, size);
     } else if (dt.is(pybind11::dtype::of<double>())) {
-        void *ptr = alignedAlloc(getAlignment<double>(memory_model),
-                                 sizeof(double) * size);
-        auto capsule = pybind11::capsule(ptr, &deallocateArray);
-
-        return pybind11::array{dt, {size}, {sizeof(double)}, ptr, capsule};
+        return alignedNumpyArray<double>(memory_model, size);
     } else if (dt.is(pybind11::dtype::of<std::complex<float>>())) {
-        void *ptr =
-            alignedAlloc(getAlignment<std::complex<float>>(memory_model),
-                         sizeof(std::complex<float>) * size);
-        auto capsule = pybind11::capsule(ptr, &deallocateArray);
-
-        return pybind11::array{
-            dt, {size}, {sizeof(std::complex<float>)}, ptr, capsule};
+        return alignedNumpyArray<std::complex<float>>(memory_model, size);
     } else if (dt.is(pybind11::dtype::of<std::complex<double>>())) {
-        void *ptr =
-            alignedAlloc(getAlignment<std::complex<double>>(memory_model),
-                         sizeof(std::complex<double>) * size);
-        auto capsule = pybind11::capsule(ptr, &deallocateArray);
-
-        return pybind11::array{
-            dt, {size}, {sizeof(std::complex<double>)}, ptr, capsule};
+        return alignedNumpyArray<std::complex<double>>(memory_model, size);
     } else {
         throw pybind11::type_error("Unsupported datatype.");
     }
diff --git a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
index 651731f8d2..04a0df9ab1 100644
--- a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
+++ b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
@@ -253,7 +253,7 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
                       const std::vector<size_t> &wires, bool inverse) {
         assert(num_qubits >= wires.size());
 
-        size_t dim = 1U << wires.size();
+        size_t dim = static_cast<size_t>(1U) << wires.size();
         std::vector<size_t> indices;
         indices.resize(dim);
         std::vector<std::complex<PrecisionT>> coeffs_in(dim, 0.0);
@@ -1324,7 +1324,7 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
         }
 
         for (size_t k = 0; k < Util::exp2(num_qubits); k++) {
-            arr[k] *= (2 * int(Util::popcount(k & wires_parity) % 2) - 1);
+            arr[k] *= static_cast<PrecisionT>(2 * int(Util::popcount(k & wires_parity) % 2) - 1);
         }
         // NOLINTNEXTLINE(readability-magic-numbers)
         return static_cast<PrecisionT>(0.5);
diff --git a/pennylane_lightning/src/tests/CreateAllWires.cpp b/pennylane_lightning/src/tests/CreateAllWires.cpp
index ecea28089c..dd0194a625 100644
--- a/pennylane_lightning/src/tests/CreateAllWires.cpp
+++ b/pennylane_lightning/src/tests/CreateAllWires.cpp
@@ -7,7 +7,7 @@ auto crateAllWires(size_t n_qubits, Gates::GateOperation gate_op, bool order)
         std::vector<std::vector<size_t>> res;
         res.reserve((1U << n_qubits) - 1);
         ;
-        for (size_t k = 1; k < (1U << n_qubits); k++) {
+        for (size_t k = 1; k < (static_cast<size_t>(1U) << n_qubits); k++) {
             std::vector<size_t> wires;
             wires.reserve(Util::popcount(k));
 
diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp
index facb6372c7..5bc840a09e 100644
--- a/pennylane_lightning/src/tests/TestHelpers.hpp
+++ b/pennylane_lightning/src/tests/TestHelpers.hpp
@@ -222,9 +222,9 @@ auto createRandomState(RandomEngine &re, size_t num_qubits)
     using Util::squaredNorm;
 
     TestVector<std::complex<PrecisionT>> res(
-        1U << num_qubits, {0.0, 0.0}, test_allocator<std::complex<PrecisionT>>);
+        static_cast<size_t>(1U) << num_qubits, {0.0, 0.0}, test_allocator<std::complex<PrecisionT>>);
     std::uniform_real_distribution<PrecisionT> dist;
-    for (size_t idx = 0; idx < (1U << num_qubits); idx++) {
+    for (size_t idx = 0; idx < (static_cast<size_t>(1U) << num_qubits); idx++) {
         res[idx] = {dist(re), dist(re)};
     }
 
@@ -308,9 +308,9 @@ auto createParams(Gates::GateOperation op) -> std::vector<PrecisionT> {
     case 0:
         return {};
     case 1:
-        return {0.312};
+        return {static_cast<PrecisionT>(0.312)};
     case 3:
-        return {0.128, -0.563, 1.414};
+        return {static_cast<PrecisionT>(0.128), static_cast<PrecisionT>(-0.563), static_cast<PrecisionT>(1.414)};
     default:
         PL_ABORT("The number of parameters for a given gate is unknown.");
     }
diff --git a/pennylane_lightning/src/tests/Test_AdjDiff.cpp b/pennylane_lightning/src/tests/Test_AdjDiff.cpp
index d1f9e94136..92ca6d0cc1 100644
--- a/pennylane_lightning/src/tests/Test_AdjDiff.cpp
+++ b/pennylane_lightning/src/tests/Test_AdjDiff.cpp
@@ -1,5 +1,3 @@
-#define _USE_MATH_DEFINES
-
 #include <algorithm>
 #include <cmath>
 #include <complex>
@@ -18,6 +16,10 @@
 
 #include "TestHelpers.hpp"
 
+#if !defined(_USE_MATH_DEFINES)
+#define _USE_MATH_DEFINES
+#endif
+
 using namespace Pennylane;
 using namespace Pennylane::Algorithms;
 
diff --git a/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp b/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
index 3511a12da9..ddca995be0 100644
--- a/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
+++ b/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
@@ -28,14 +28,23 @@ using Pennylane::Gates::callGateOps;
  * We just check DynamicDispacther calls the correct functuion by comparing
  * the result from it with that of the direct call.
  */
-template <typename PrecisionT, typename ParamT, class GateImplementation>
+
+template <typename PrecisionT, typename ParamT, class GateImplementation,
+    GateOperation gate_op, class RandomEngine, class Enable = void>
+    struct testDispatchForKernel {
+        static void test(RandomEngine& re, size_t num_qubits) {
+        // Keep source, but allow clang-tidy to pass for unused
+        static_cast<void>(re);
+        static_cast<void>(num_qubits);
+    } // Do nothing if not implemented;
+      // This could probably be replaced with an enable_if or SFINAE-like
+      // pattern.
+};
+template <typename PrecisionT, typename ParamT, class GateImplementation,
+    GateOperation gate_op, class RandomEngine,
+    std::enable_if_t<Util::array_has_elt(GateImplementation::implemented_gates, gate_op)>>
 struct testDispatchForKernel {
-    template <
-        GateOperation gate_op, class RandomEngine,
-        std::enable_if_t<
-            Util::array_has_elt(GateImplementation::implemented_gates, gate_op),
-            bool> = true>
-    static void test(RandomEngine &re, size_t num_qubits) {
+    static void test(RandomEngine& re, size_t num_qubits) {
         const auto ini_st = createRandomState<PrecisionT>(re, num_qubits);
         auto expected = ini_st;
 
@@ -46,9 +55,9 @@ struct testDispatchForKernel {
         // in the GateImplementation
         auto gate_func =
             GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplementation,
-                                  gate_op>::value;
+            gate_op>::value;
         callGateOps(gate_func, expected.data(), num_qubits, wires, false,
-                    params);
+            params);
 
         // and compare it to the dynamic dispatcher
         auto test_st = ini_st;
@@ -59,19 +68,6 @@ struct testDispatchForKernel {
             gate_name, wires, false, params);
         REQUIRE(test_st == expected);
     }
-
-    template <
-        GateOperation gate_op, class RandomEngine,
-        std::enable_if_t<!Util::array_has_elt(
-                             GateImplementation::implemented_gates, gate_op),
-                         bool> = true>
-    static void test(RandomEngine &re, size_t num_qubits) {
-        // Keep source, but allow clang-tidy to pass for unused
-        static_cast<void>(re);
-        static_cast<void>(num_qubits);
-    } // Do nothing if not implemented;
-      // This could probably be replaced with an enable_if or SFINAE-like
-      // pattern.
 };
 
 template <typename PrecisionT, typename ParamT, class GateImplementation,
@@ -83,8 +79,7 @@ constexpr void testAllGatesForKernelIter(RandomEngine &re,
 
         for (size_t num_qubits = 3; num_qubits <= max_num_qubits;
              num_qubits++) {
-            testDispatchForKernel<PrecisionT, ParamT, GateImplementation>::
-                template test<gate_op>(re, num_qubits);
+            testDispatchForKernel<PrecisionT, ParamT, GateImplementation, gate_op, RandomEngine>::test(re, num_qubits);
         }
 
         testAllGatesForKernelIter<PrecisionT, ParamT, GateImplementation,
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
index eb65520c7b..1960c8d9ae 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
@@ -29,12 +29,10 @@ using std::vector;
 
 template <typename TypeList> std::string kernelsToString() {
     if constexpr (!std::is_same_v<TypeList, void>) {
-        if constexpr (!std::is_same_v<typename TypeList::Next, void>) {
-            return std::string(TypeList::Type::name) + ", " +
-                   kernelsToString<typename TypeList::Next>();
-        }
-        return std::string(TypeList::Type::name);
+        return std::string(TypeList::Type::name) + ", " +
+                kernelsToString<typename TypeList::Next>();
     }
+    return std::string("");
 }
 
 /* Type transformation */
@@ -128,7 +126,7 @@ void testApplyGate(RandomEngine &re, size_t num_qubits) {
                     std::make_index_sequence<length<Kernels>()>()));
 
             for (size_t i = 0; i < results.size() - 1; i++) {
-                REQUIRE(results[i] == PLApprox(results[i + 1]).margin(1e-7));
+                REQUIRE(results[i] == PLApprox(results[i + 1]).margin(static_cast<PrecisionT>(1e-5)));
             }
         }
 
@@ -142,7 +140,7 @@ void testApplyGate(RandomEngine &re, size_t num_qubits) {
                     std::make_index_sequence<length<Kernels>()>()));
 
             for (size_t i = 0; i < results.size() - 1; i++) {
-                REQUIRE(results[i] == PLApprox(results[i + 1]).margin(1e-7));
+                REQUIRE(results[i] == PLApprox(results[i + 1]).margin(static_cast<PrecisionT>(1e-5)));
             }
         }
     }
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
index 4b80e5235b..b22221ec63 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
@@ -74,7 +74,7 @@ void testGeneratorForGate(RandomEngine &re, size_t num_qubits) {
     using ComplexPrecisionT = std::complex<PrecisionT>;
     constexpr auto I = Util::IMAG<PrecisionT>();
 
-    constexpr ParamT eps = 1e-4; // For finite difference
+    constexpr ParamT eps = static_cast<ParamT>(1e-4); // For finite difference
 
     constexpr auto gate_op = static_lookup<gntr_op>(generator_gate_pairs);
     constexpr auto gate_name = static_lookup<gate_op>(Constant::gate_names);
@@ -105,7 +105,7 @@ void testGeneratorForGate(RandomEngine &re, size_t num_qubits) {
         gate_func(diff_st_1.data(), num_qubits, wires, false, eps);
         gate_func(diff_st_2.data(), num_qubits, wires, false, -eps);
 
-        std::vector<ComplexPrecisionT> gate_der_st(1U << num_qubits);
+        std::vector<ComplexPrecisionT> gate_der_st(static_cast<size_t>(1U) << num_qubits);
 
         std::transform(
             diff_st_1.cbegin(), diff_st_1.cend(), diff_st_2.cbegin(),
@@ -114,7 +114,7 @@ void testGeneratorForGate(RandomEngine &re, size_t num_qubits) {
 
         scaleVector(gate_der_st, static_cast<PrecisionT>(0.5) / eps);
 
-        REQUIRE(gntr_st == PLApprox(gate_der_st).margin(1e-3));
+        REQUIRE(gntr_st == PLApprox(gate_der_st).margin(static_cast<PrecisionT>(1e-4)));
     }
 }
 template <typename PrecisionT, typename ParamT, class GateImplementation,
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
index 4869678201..80d3b4d42b 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
@@ -43,7 +43,7 @@ void testInverseKernelGate(RandomEngine &re, size_t num_qubits) {
         callGateOps(func_ptr, st.data(), num_qubits, wires, false, params);
         callGateOps(func_ptr, st.data(), num_qubits, wires, true, params);
 
-        REQUIRE(st == PLApprox(ini_st).margin(1e-7));
+        REQUIRE(st == PLApprox(ini_st).margin(static_cast<PrecisionT>(1e-7)));
     }
 }
 
diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
index 235a581a34..c41a6004e2 100644
--- a/pennylane_lightning/src/util/Memory.hpp
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -85,7 +85,11 @@ template <class T> struct AlignedAllocator {
     }
 
     void deallocate(T *p, [[maybe_unused]] std::size_t size) noexcept {
-        alignedFree(p);
+        if (alignment_ > alignof(std::max_align_t)) {
+            alignedFree(p);
+        } else {
+            free(p);
+        }
     }
 
     template <class U> void construct(U *ptr) { ::new ((void *)ptr) U(); }

From 5adac4122c0c0004732de38c1d753765cd9889f4 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 2 Mar 2022 18:57:20 -0500
Subject: [PATCH 20/94] Fix a UB

---
 .../src/algorithms/AdjointDiff.hpp            | 12 +++++-----
 pennylane_lightning/src/bindings/Bindings.hpp | 15 +++++++-----
 .../src/examples/run_benchmark.py             |  1 +
 .../cpu_kernels/GateImplementationsLM.hpp     |  3 ++-
 pennylane_lightning/src/tests/TestHelpers.hpp |  6 +++--
 .../src/tests/Test_DynamicDispatcher.cpp      | 23 +++++++++++--------
 ...est_GateImplementations_CompareKernels.cpp | 10 +++++---
 .../Test_GateImplementations_Generator.cpp    | 17 ++++++++++++--
 .../src/util/LinearAlgebra.hpp                | 12 +++++-----
 pennylane_lightning/src/util/Memory.hpp       |  4 ++--
 10 files changed, 65 insertions(+), 38 deletions(-)
 mode change 100644 => 100755 pennylane_lightning/src/examples/run_benchmark.py

diff --git a/pennylane_lightning/src/algorithms/AdjointDiff.hpp b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
index c717681b40..b27e85f28c 100644
--- a/pennylane_lightning/src/algorithms/AdjointDiff.hpp
+++ b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
@@ -344,8 +344,8 @@ template <class T = double> class AdjointJacobian {
             applyOperations(lambda, ops);
         }
 
-        const auto tp_begin = tp.begin();
-        auto tp_it = tp.end();
+        const auto tp_rend = tp.rend();
+        auto tp_it = tp.rbegin();
 
         StateVectorManagedCPU<T> sv{lambda.getNumQubits(),
                                     Threading::SingleThread};
@@ -368,9 +368,9 @@ template <class T = double> class AdjointJacobian {
                 applyOperationAdj(lambda, ops, op_idx);
 
                 if (ops.hasParams(op_idx)) {
-                    if ((current_param_idx == *(std::prev(tp_it))) ||
-                        std::find(tp_begin, tp_it, current_param_idx) !=
-                            tp_it) {
+                    if ((current_param_idx == *tp_it) ||
+                        std::find(tp_it, tp_rend, current_param_idx) !=
+                            tp_rend) {
                         const T scalingFactor =
                             applyGenerator(mu, ops_name[op_idx],
                                            ops.getOpsWires()[op_idx],
@@ -399,7 +399,7 @@ template <class T = double> class AdjointJacobian {
                                                mu.getData(), mu.getLength()));
                         }
                         trainableParamNumber--;
-                        std::advance(tp_it, -1);
+                        ++tp_it;
                     }
                     current_param_idx--;
                 }
diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index 039d516dde..c0e20f5552 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -98,16 +98,19 @@ auto getNumpyArrayAlignment(const pybind11::array &numpyArray)
 }
 
 template <typename T>
-auto alignedNumpyArray(CPUMemoryModel memory_model, size_t size) -> pybind11::array {
+auto alignedNumpyArray(CPUMemoryModel memory_model, size_t size)
+    -> pybind11::array {
     if (getAlignment<T>(memory_model) > alignof(std::max_align_t)) {
-        void* ptr = alignedAlloc(getAlignment<T>(memory_model),
-            sizeof(T) * size);
+        void *ptr =
+            alignedAlloc(getAlignment<T>(memory_model), sizeof(T) * size);
         auto capsule = pybind11::capsule(ptr, &alignedFree);
-        return pybind11::array{pybind11::dtype::of<T>(), {size}, {sizeof(T)}, ptr, capsule};
+        return pybind11::array{
+            pybind11::dtype::of<T>(), {size}, {sizeof(T)}, ptr, capsule};
     } // else
-    void* ptr = malloc(sizeof(T) * size);
+    void *ptr = malloc(sizeof(T) * size);
     auto capsule = pybind11::capsule(ptr, free);
-    return pybind11::array{ pybind11::dtype::of<T>(), {size}, {sizeof(T)}, ptr, capsule };
+    return pybind11::array{
+        pybind11::dtype::of<T>(), {size}, {sizeof(T)}, ptr, capsule};
 }
 
 /**
diff --git a/pennylane_lightning/src/examples/run_benchmark.py b/pennylane_lightning/src/examples/run_benchmark.py
old mode 100644
new mode 100755
index f2a770d2c8..ae20d520b0
--- a/pennylane_lightning/src/examples/run_benchmark.py
+++ b/pennylane_lightning/src/examples/run_benchmark.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import subprocess
 import argparse
 import json
diff --git a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
index 04a0df9ab1..87fb7469a8 100644
--- a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
+++ b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
@@ -1324,7 +1324,8 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
         }
 
         for (size_t k = 0; k < Util::exp2(num_qubits); k++) {
-            arr[k] *= static_cast<PrecisionT>(2 * int(Util::popcount(k & wires_parity) % 2) - 1);
+            arr[k] *= static_cast<PrecisionT>(
+                2 * int(Util::popcount(k & wires_parity) % 2) - 1);
         }
         // NOLINTNEXTLINE(readability-magic-numbers)
         return static_cast<PrecisionT>(0.5);
diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp
index 5bc840a09e..ab383909d4 100644
--- a/pennylane_lightning/src/tests/TestHelpers.hpp
+++ b/pennylane_lightning/src/tests/TestHelpers.hpp
@@ -222,7 +222,8 @@ auto createRandomState(RandomEngine &re, size_t num_qubits)
     using Util::squaredNorm;
 
     TestVector<std::complex<PrecisionT>> res(
-        static_cast<size_t>(1U) << num_qubits, {0.0, 0.0}, test_allocator<std::complex<PrecisionT>>);
+        static_cast<size_t>(1U) << num_qubits, {0.0, 0.0},
+        test_allocator<std::complex<PrecisionT>>);
     std::uniform_real_distribution<PrecisionT> dist;
     for (size_t idx = 0; idx < (static_cast<size_t>(1U) << num_qubits); idx++) {
         res[idx] = {dist(re), dist(re)};
@@ -310,7 +311,8 @@ auto createParams(Gates::GateOperation op) -> std::vector<PrecisionT> {
     case 1:
         return {static_cast<PrecisionT>(0.312)};
     case 3:
-        return {static_cast<PrecisionT>(0.128), static_cast<PrecisionT>(-0.563), static_cast<PrecisionT>(1.414)};
+        return {static_cast<PrecisionT>(0.128), static_cast<PrecisionT>(-0.563),
+                static_cast<PrecisionT>(1.414)};
     default:
         PL_ABORT("The number of parameters for a given gate is unknown.");
     }
diff --git a/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp b/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
index ddca995be0..49e839ea25 100644
--- a/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
+++ b/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
@@ -30,9 +30,9 @@ using Pennylane::Gates::callGateOps;
  */
 
 template <typename PrecisionT, typename ParamT, class GateImplementation,
-    GateOperation gate_op, class RandomEngine, class Enable = void>
-    struct testDispatchForKernel {
-        static void test(RandomEngine& re, size_t num_qubits) {
+          GateOperation gate_op, class RandomEngine, class Enable = void>
+struct testDispatchForKernel {
+    static void test(RandomEngine &re, size_t num_qubits) {
         // Keep source, but allow clang-tidy to pass for unused
         static_cast<void>(re);
         static_cast<void>(num_qubits);
@@ -41,10 +41,12 @@ template <typename PrecisionT, typename ParamT, class GateImplementation,
       // pattern.
 };
 template <typename PrecisionT, typename ParamT, class GateImplementation,
-    GateOperation gate_op, class RandomEngine,
-    std::enable_if_t<Util::array_has_elt(GateImplementation::implemented_gates, gate_op)>>
-struct testDispatchForKernel {
-    static void test(RandomEngine& re, size_t num_qubits) {
+          GateOperation gate_op, class RandomEngine>
+struct testDispatchForKernel<
+    PrecisionT, ParamT, GateImplementation, gate_op, RandomEngine,
+    std::enable_if_t<Util::array_has_elt(GateImplementation::implemented_gates,
+                                         gate_op)>> {
+    static void test(RandomEngine &re, size_t num_qubits) {
         const auto ini_st = createRandomState<PrecisionT>(re, num_qubits);
         auto expected = ini_st;
 
@@ -55,9 +57,9 @@ struct testDispatchForKernel {
         // in the GateImplementation
         auto gate_func =
             GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplementation,
-            gate_op>::value;
+                                  gate_op>::value;
         callGateOps(gate_func, expected.data(), num_qubits, wires, false,
-            params);
+                    params);
 
         // and compare it to the dynamic dispatcher
         auto test_st = ini_st;
@@ -79,7 +81,8 @@ constexpr void testAllGatesForKernelIter(RandomEngine &re,
 
         for (size_t num_qubits = 3; num_qubits <= max_num_qubits;
              num_qubits++) {
-            testDispatchForKernel<PrecisionT, ParamT, GateImplementation, gate_op, RandomEngine>::test(re, num_qubits);
+            testDispatchForKernel<PrecisionT, ParamT, GateImplementation,
+                                  gate_op, RandomEngine>::test(re, num_qubits);
         }
 
         testAllGatesForKernelIter<PrecisionT, ParamT, GateImplementation,
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
index 1960c8d9ae..04ff09cc17 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
@@ -30,7 +30,7 @@ using std::vector;
 template <typename TypeList> std::string kernelsToString() {
     if constexpr (!std::is_same_v<TypeList, void>) {
         return std::string(TypeList::Type::name) + ", " +
-                kernelsToString<typename TypeList::Next>();
+               kernelsToString<typename TypeList::Next>();
     }
     return std::string("");
 }
@@ -126,7 +126,9 @@ void testApplyGate(RandomEngine &re, size_t num_qubits) {
                     std::make_index_sequence<length<Kernels>()>()));
 
             for (size_t i = 0; i < results.size() - 1; i++) {
-                REQUIRE(results[i] == PLApprox(results[i + 1]).margin(static_cast<PrecisionT>(1e-5)));
+                REQUIRE(results[i] ==
+                        PLApprox(results[i + 1])
+                            .margin(static_cast<PrecisionT>(1e-5)));
             }
         }
 
@@ -140,7 +142,9 @@ void testApplyGate(RandomEngine &re, size_t num_qubits) {
                     std::make_index_sequence<length<Kernels>()>()));
 
             for (size_t i = 0; i < results.size() - 1; i++) {
-                REQUIRE(results[i] == PLApprox(results[i + 1]).margin(static_cast<PrecisionT>(1e-5)));
+                REQUIRE(results[i] ==
+                        PLApprox(results[i + 1])
+                            .margin(static_cast<PrecisionT>(1e-5)));
             }
         }
     }
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
index b22221ec63..0dfc1eceb8 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
@@ -35,6 +35,17 @@ constexpr std::string_view remove_prefix(const std::string_view &str,
     return {str.data() + len, str.length() - len};
 }
 
+template <typename T> constexpr auto testMargin() -> T {
+    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>);
+    if constexpr (std::is_same_v<T, float>) {
+        return 1e-3F;
+    } else {
+        return 1e-5L;
+    }
+}
+
+template <typename T> constexpr static auto test_margin = testMargin<T>();
+
 template <GeneratorOperation gntr_op>
 constexpr auto findGateOpForGenerator() -> GateOperation {
     constexpr auto gntr_name =
@@ -105,7 +116,8 @@ void testGeneratorForGate(RandomEngine &re, size_t num_qubits) {
         gate_func(diff_st_1.data(), num_qubits, wires, false, eps);
         gate_func(diff_st_2.data(), num_qubits, wires, false, -eps);
 
-        std::vector<ComplexPrecisionT> gate_der_st(static_cast<size_t>(1U) << num_qubits);
+        std::vector<ComplexPrecisionT> gate_der_st(static_cast<size_t>(1U)
+                                                   << num_qubits);
 
         std::transform(
             diff_st_1.cbegin(), diff_st_1.cend(), diff_st_2.cbegin(),
@@ -114,7 +126,8 @@ void testGeneratorForGate(RandomEngine &re, size_t num_qubits) {
 
         scaleVector(gate_der_st, static_cast<PrecisionT>(0.5) / eps);
 
-        REQUIRE(gntr_st == PLApprox(gate_der_st).margin(static_cast<PrecisionT>(1e-4)));
+        REQUIRE(gntr_st ==
+                PLApprox(gate_der_st).margin(test_margin<PrecisionT>));
     }
 }
 template <typename PrecisionT, typename ParamT, class GateImplementation,
diff --git a/pennylane_lightning/src/util/LinearAlgebra.hpp b/pennylane_lightning/src/util/LinearAlgebra.hpp
index 9f6941f891..f6cca5f397 100644
--- a/pennylane_lightning/src/util/LinearAlgebra.hpp
+++ b/pennylane_lightning/src/util/LinearAlgebra.hpp
@@ -463,15 +463,15 @@ inline static void CFTranspose(const std::complex<T> *mat,
  * @param n Number of columns of `mat`.
  * @return mat transpose of shape n * m.
  */
-template <class T>
-inline auto Transpose(const std::vector<std::complex<T>> &mat, size_t m,
-                      size_t n) -> std::vector<std::complex<T>> {
+template <class T, class Alloc>
+inline auto Transpose(const std::vector<std::complex<T>, Alloc> &mat, size_t m,
+                      size_t n) -> std::vector<std::complex<T>, Alloc> {
     if (mat.size() != m * n) {
         throw std::invalid_argument(
             "Invalid number of rows and columns for the input matrix");
     }
 
-    std::vector<std::complex<T>> mat_t(n * m);
+    std::vector<std::complex<T>, Alloc> mat_t(n * m, mat.get_allocator());
     CFTranspose(mat.data(), mat_t.data(), m, n, 0, m, 0, n);
     return mat_t;
 }
@@ -494,7 +494,7 @@ inline auto Transpose(const std::vector<T, Alloc> &mat, size_t m, size_t n)
             "Invalid number of rows and columns for the input matrix");
     }
 
-    std::vector<T, Alloc> mat_t(n * m);
+    std::vector<T, Alloc> mat_t(n * m, mat.get_allocator());
     CFTranspose(mat.data(), mat_t.data(), m, n, 0, m, 0, n);
     return mat_t;
 }
@@ -562,7 +562,7 @@ inline auto vecMatrixProd(const std::vector<T, Alloc> &v_in,
             "Invalid number of rows and columns for the input matrix");
     }
 
-    std::vector<T, Alloc> v_out(n);
+    std::vector<T, Alloc> v_out(n, mat.get_allocator());
     vecMatrixProd(v_in.data(), mat.data(), v_out.data(), m, n);
 
     return v_out;
diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
index c41a6004e2..ea2c20ec9c 100644
--- a/pennylane_lightning/src/util/Memory.hpp
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -103,13 +103,13 @@ template <class T> struct AlignedAllocator {
 template <class T, class U>
 bool operator==([[maybe_unused]] const AlignedAllocator<T> &lhs,
                 [[maybe_unused]] const AlignedAllocator<U> &rhs) {
-    return true;
+    return lhs.alignment_ == rhs.alignment_;
 }
 
 template <class T, class U, uint32_t alignment>
 bool operator!=([[maybe_unused]] const AlignedAllocator<T> &lhs,
                 [[maybe_unused]] const AlignedAllocator<U> &rhs) {
-    return false;
+    return lhs.alignment_ != rhs.alignment_;
 }
 
 /**

From 345c2261adbfd356c50038fbce6dfd408b508245 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae.yeun.park@gmail.com>
Date: Thu, 3 Mar 2022 00:13:26 +0000
Subject: [PATCH 21/94] Fix some bugs

---
 .../src/algorithms/AdjointDiff.hpp               |  3 +++
 tests/test_measures.py                           | 16 +++++++++++++---
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/pennylane_lightning/src/algorithms/AdjointDiff.hpp b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
index b27e85f28c..fc02f3c50e 100644
--- a/pennylane_lightning/src/algorithms/AdjointDiff.hpp
+++ b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
@@ -368,6 +368,9 @@ template <class T = double> class AdjointJacobian {
                 applyOperationAdj(lambda, ops, op_idx);
 
                 if (ops.hasParams(op_idx)) {
+                    if (tp_it == tp.rend()) {
+                        break;
+                    }
                     if ((current_param_idx == *tp_it) ||
                         std::find(tp_it, tp_rend, current_param_idx) !=
                             tp_rend) {
diff --git a/tests/test_measures.py b/tests/test_measures.py
index a843253faa..0b8bac88e5 100644
--- a/tests/test_measures.py
+++ b/tests/test_measures.py
@@ -16,6 +16,7 @@
 """
 import numpy as np
 import pennylane as qml
+import math
 from pennylane.measurements import (
     Variance,
     Expectation,
@@ -55,15 +56,18 @@ def dev(self):
 
     def test_probs_dtype64(self, dev):
         """Test if probs changes the state dtype"""
-        dev._state = np.array([1, 0]).astype(np.complex64)
+        dev._state = dev._asarray(np.array([1/math.sqrt(2), 1/math.sqrt(2), 0, 0]).astype(np.complex64))
         p = dev.probability(wires=[0, 1])
 
         assert dev._state.dtype == np.complex64
-        assert np.allclose(p, [1, 1, 0, 0])
+        assert np.allclose(p, [0.5, 0.5, 0, 0])
 
+    @pytest.mark.skipif(
+        not hasattr(np, "complex256"), reason="Numpy only defines complex256 in Linux-like system"
+    )
     def test_probs_dtype_error(self, dev):
         """Test if probs raise error with complex256"""
-        dev._state = np.array([1, 0]).astype(np.complex256)
+        dev._state = dev._asarray(np.array([1, 0]).astype(np.complex256))
 
         with pytest.raises(TypeError, match="Unsupported complex Type:"):
             dev.probability(wires=[0, 1])
@@ -179,6 +183,9 @@ def test_expval_dtype64(self, dev):
         assert dev._state.dtype == np.complex64
         assert np.allclose(e, 0.0)
 
+    @pytest.mark.skipif(
+        not hasattr(np, "complex256"), reason="Numpy only defines complex256 in Linux-like system"
+    )
     def test_expval_dtype_error(self, dev):
         """Test if expval raise error with complex256"""
         dev._state = np.array([1, 0]).astype(np.complex256)
@@ -296,6 +303,9 @@ def test_var_dtype64(self, dev):
         assert dev._state.dtype == np.complex64
         assert np.allclose(v, 1.0)
 
+    @pytest.mark.skipif(
+        not hasattr(np, "complex256"), reason="Numpy only defines complex256 in Linux-like system"
+    )
     def test_expval_dtype_error(self, dev):
         """Test if var raise error with complex256"""
         dev._state = np.array([1, 0]).astype(np.complex256)

From 7881d6fcf1c360374ffe5bc43b8c931897153c82 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 2 Mar 2022 19:34:24 -0500
Subject: [PATCH 22/94] black

---
 tests/test_measures.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_measures.py b/tests/test_measures.py
index 0b8bac88e5..3c3c356ab7 100644
--- a/tests/test_measures.py
+++ b/tests/test_measures.py
@@ -56,7 +56,9 @@ def dev(self):
 
     def test_probs_dtype64(self, dev):
         """Test if probs changes the state dtype"""
-        dev._state = dev._asarray(np.array([1/math.sqrt(2), 1/math.sqrt(2), 0, 0]).astype(np.complex64))
+        dev._state = dev._asarray(
+            np.array([1 / math.sqrt(2), 1 / math.sqrt(2), 0, 0]).astype(np.complex64)
+        )
         p = dev.probability(wires=[0, 1])
 
         assert dev._state.dtype == np.complex64

From f6866bbeaf0665941213e83f88c54066c6d08e13 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 2 Mar 2022 20:22:55 -0500
Subject: [PATCH 23/94] Fix for coverage

---
 tests/test_adjoint_jacobian.py | 14 ++++----
 tests/test_measures.py         |  2 +-
 tests/test_vjp.py              | 60 ++++++++++++++++------------------
 3 files changed, 37 insertions(+), 39 deletions(-)

diff --git a/tests/test_adjoint_jacobian.py b/tests/test_adjoint_jacobian.py
index 4872212506..e394b39575 100644
--- a/tests/test_adjoint_jacobian.py
+++ b/tests/test_adjoint_jacobian.py
@@ -168,16 +168,16 @@ def test_unsupported_hermitian_expectation(self, dev):
     )
     @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
     def test_unsupported_complex_type(self, dev):
-        with pytest.raises(TypeError, match="Unsupported .*"):
-            dev._state = dev._asarray(dev._state, np.complex256)
+        dev._state = np.zeros(8, np.complex256)  # Directly put unaligned numpy array to device
 
-            with qml.tape.JacobianTape() as tape:
-                qml.QubitStateVector(np.array([1.0, -1.0]) / np.sqrt(2), wires=0)
-                qml.RX(0.3, wires=[0])
-                qml.expval(qml.PauliZ(0))
+        with qml.tape.JacobianTape() as tape:
+            qml.QubitStateVector(np.array([1.0, -1.0]) / np.sqrt(2), wires=0)
+            qml.RX(0.3, wires=[0])
+            qml.expval(qml.PauliZ(0))
 
-            tape.trainable_params = {1}
+        tape.trainable_params = {1}
 
+        with pytest.raises(TypeError, match="Unsupported .*"):
             dev.adjoint_jacobian(tape)
 
     @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7))
diff --git a/tests/test_measures.py b/tests/test_measures.py
index 3c3c356ab7..10c48c5313 100644
--- a/tests/test_measures.py
+++ b/tests/test_measures.py
@@ -69,7 +69,7 @@ def test_probs_dtype64(self, dev):
     )
     def test_probs_dtype_error(self, dev):
         """Test if probs raise error with complex256"""
-        dev._state = dev._asarray(np.array([1, 0]).astype(np.complex256))
+        dev._state = np.array([1, 0, 0, 0]).astype(np.complex256)
 
         with pytest.raises(TypeError, match="Unsupported complex Type:"):
             dev.probability(wires=[0, 1])
diff --git a/tests/test_vjp.py b/tests/test_vjp.py
index 2aa97c6faa..eb73f61a4b 100644
--- a/tests/test_vjp.py
+++ b/tests/test_vjp.py
@@ -39,14 +39,12 @@ def dev(self):
         not hasattr(np, "complex256"), reason="Numpy only defines complex256 in Linux-like system"
     )
     def test_unsupported_complex_type(self, dev):
-        with pytest.raises(TypeError, match="Unsupported .*"):
-            dev._state = dev._asarray(dev._state, np.complex256)
+        dev._state = np.array([1, 0, 0, 0], dtype=np.complex256)
 
-            dy = np.array([[1.0, 2.0], [3.0, 4.0]])
-            jac = np.array(
-                [[[1.0, 0.1, 0.2], [0.2, 0.6, 0.1]], [[0.4, -0.7, 1.2], [-0.5, -0.6, 0.7]]]
-            )
+        dy = np.array([[1.0, 2.0], [3.0, 4.0]])
+        jac = np.array([[[1.0, 0.1, 0.2], [0.2, 0.6, 0.1]], [[0.4, -0.7, 1.2], [-0.5, -0.6, 0.7]]])
 
+        with pytest.raises(TypeError, match="Unsupported .*"):
             dev.compute_vjp(dy, jac)
 
     @pytest.mark.parametrize("C", [np.complex64, np.complex128])
@@ -122,21 +120,21 @@ def dev(self):
         not hasattr(np, "complex256"), reason="Numpy only defines complex256 in Linux-like system"
     )
     def test_unsupported_complex_type(self, dev):
-        with pytest.raises(TypeError, match="Unsupported .*"):
-            dev._state = dev._asarray(dev._state, np.complex256)
+        dev._state = np.array([1, 0, 0, 0], dtype=np.complex256)
 
-            x, y, z = [0.5, 0.3, -0.7]
+        x, y, z = [0.5, 0.3, -0.7]
 
-            with qml.tape.JacobianTape() as tape:
-                qml.RX(0.4, wires=[0])
-                qml.Rot(x, y, z, wires=[0])
-                qml.RY(-0.2, wires=[0])
-                qml.expval(qml.PauliZ(0))
+        with qml.tape.JacobianTape() as tape:
+            qml.RX(0.4, wires=[0])
+            qml.Rot(x, y, z, wires=[0])
+            qml.RY(-0.2, wires=[0])
+            qml.expval(qml.PauliZ(0))
 
-            tape.trainable_params = {1, 2, 3}
+        tape.trainable_params = {1, 2, 3}
 
-            dy = np.array([1.0])
+        dy = np.array([1.0])
 
+        with pytest.raises(TypeError, match="Unsupported .*"):
             dev.vjp(tape, dy)(tape)
 
     @pytest.mark.parametrize("C", [np.complex64, np.complex128])
@@ -470,26 +468,26 @@ def dev(self):
         not hasattr(np, "complex256"), reason="Numpy only defines complex256 in Linux-like system"
     )
     def test_unsupported_complex_type(self, dev):
-        with pytest.raises(TypeError, match="Unsupported .*"):
-            dev._state = dev._asarray(dev._state, np.complex256)
+        dev._state = np.array([1, 0, 0, 0], dtype=np.complex256)
 
-            with qml.tape.QuantumTape() as tape1:
-                qml.RX(0.4, wires=0)
-                qml.CNOT(wires=[0, 1])
-                qml.expval(qml.PauliZ(0))
+        with qml.tape.QuantumTape() as tape1:
+            qml.RX(0.4, wires=0)
+            qml.CNOT(wires=[0, 1])
+            qml.expval(qml.PauliZ(0))
 
-            with qml.tape.JacobianTape() as tape2:
-                qml.RX(0.4, wires=0)
-                qml.RX(0.6, wires=0)
-                qml.CNOT(wires=[0, 1])
-                qml.expval(qml.PauliZ(0))
+        with qml.tape.JacobianTape() as tape2:
+            qml.RX(0.4, wires=0)
+            qml.RX(0.6, wires=0)
+            qml.CNOT(wires=[0, 1])
+            qml.expval(qml.PauliZ(0))
 
-            tape1.trainable_params = {0}
-            tape2.trainable_params = {0, 1}
+        tape1.trainable_params = {0}
+        tape2.trainable_params = {0, 1}
 
-            tapes = [tape1, tape2]
-            dys = [np.array([1.0]), np.array([1.0])]
+        tapes = [tape1, tape2]
+        dys = [np.array([1.0]), np.array([1.0])]
 
+        with pytest.raises(TypeError, match="Unsupported .*"):
             dev.batch_vjp(tapes, dys)
 
     @pytest.mark.parametrize("C", [np.complex64, np.complex128])

From 31d658569245c0058802a4fd2207bb3d1d4369f9 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 2 Mar 2022 20:59:42 -0500
Subject: [PATCH 24/94] Update for -Wpedantic

---
 CMakeLists.txt                                           | 1 -
 pennylane_lightning/src/simulator/Measures.hpp           | 4 ++--
 pennylane_lightning/src/tests/Test_Measures.cpp          | 2 --
 pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp | 4 ++--
 pennylane_lightning/src/util/ConstantUtil.hpp            | 4 ++--
 5 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e143fd62d..58b4faab21 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,7 +56,6 @@ FetchContent_MakeAvailable(pybind11)
 
 # All CMakeLists.txt in subdirectories use pennylane_lightning_compile_options and pennylane_lightning_external_libs
 add_subdirectory(pennylane_lightning/src)
-set(CMAKE_CXX_VISIBILITY_PRESET hidden)
 
 #####################################################
 # Maintain for dependent external package development
diff --git a/pennylane_lightning/src/simulator/Measures.hpp b/pennylane_lightning/src/simulator/Measures.hpp
index d03031ee36..c158f7ed37 100644
--- a/pennylane_lightning/src/simulator/Measures.hpp
+++ b/pennylane_lightning/src/simulator/Measures.hpp
@@ -178,7 +178,7 @@ class Measures {
         }
 
         return expected_value_list;
-    };
+    }
 
     /**
      * @brief Variance of an observable.
@@ -255,6 +255,6 @@ class Measures {
         }
 
         return expected_value_list;
-    };
+    }
 }; // class Measures
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/tests/Test_Measures.cpp b/pennylane_lightning/src/tests/Test_Measures.cpp
index 6f18a458df..78ecddfd90 100644
--- a/pennylane_lightning/src/tests/Test_Measures.cpp
+++ b/pennylane_lightning/src/tests/Test_Measures.cpp
@@ -10,12 +10,10 @@
 
 using namespace Pennylane;
 
-namespace {
 using std::complex;
 using std::size_t;
 using std::string;
 using std::vector;
-}; // namespace
 
 StateVectorManagedCPU<double> Initializing_StateVector() {
     // Defining a StateVector in a non-trivial configuration:
diff --git a/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp b/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
index 99f5494128..816af428cb 100644
--- a/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
+++ b/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
@@ -17,7 +17,7 @@ template <typename EnumClass> constexpr auto allGateOps() {
     return Util::tuple_to_array(allGateOpsHelper<EnumClass>(
         std::make_integer_sequence<uint32_t,
                                    static_cast<uint32_t>(EnumClass::END)>{}));
-};
+}
 template <class PrecisionT, class ParamT, class GateImplemenation,
           uint32_t gate_idx>
 constexpr bool testAllGatesImplementedIter() {
@@ -174,7 +174,7 @@ constexpr auto opFuncPtrPairsIter() {
     } else {
         return std::tuple{};
     }
-};
+}
 
 /**
  * @brief Pairs of all implemented gate operations and the corresponding
diff --git a/pennylane_lightning/src/util/ConstantUtil.hpp b/pennylane_lightning/src/util/ConstantUtil.hpp
index aec36602e1..d3995e7642 100644
--- a/pennylane_lightning/src/util/ConstantUtil.hpp
+++ b/pennylane_lightning/src/util/ConstantUtil.hpp
@@ -42,7 +42,7 @@ constexpr auto lookup(const std::array<std::pair<Key, Value>, size> &arr,
         }
     }
     throw std::range_error("The given key does not exist.");
-};
+}
 
 /**
  * @brief Check an array has an element.
@@ -61,7 +61,7 @@ constexpr auto array_has_elt(const std::array<U, size> &arr, const U &elt)
         }
     }
     return false;
-};
+}
 
 /**
  * @brief Extract first elements from the array of pairs.

From 857a8313540dc45442f155711dec670034a2dd55 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 2 Mar 2022 23:52:14 -0500
Subject: [PATCH 25/94] Some fix for tidy

---
 pennylane_lightning/src/.clang-tidy           |  2 +-
 .../DefaultKernelsForStateVector.hpp          | 32 +++++++++++++------
 .../src/simulator/DynamicDispatcher.hpp       | 15 +++++----
 .../src/simulator/StateVectorCPU.hpp          | 12 ++++---
 .../Test_GateImplementations_Generator.cpp    |  2 +-
 pennylane_lightning/src/util/Memory.hpp       |  2 ++
 6 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/pennylane_lightning/src/.clang-tidy b/pennylane_lightning/src/.clang-tidy
index e1fce11707..50b924d24b 100644
--- a/pennylane_lightning/src/.clang-tidy
+++ b/pennylane_lightning/src/.clang-tidy
@@ -1,5 +1,5 @@
 ---
-Checks:          'clang-diagnostic-*,clang-analyzer-*,-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,hicpp-*,-hicpp-avoid-c-arrays,-hicpp-no-array-decay,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-avoid-non-const-global-variables,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions'
+Checks:          '-*,clang-diagnostic-*,clang-analyzer-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,hicpp-*,-hicpp-avoid-c-arrays,-hicpp-no-array-decay,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-avoid-non-const-global-variables,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions'
 WarningsAsErrors: '*'
 HeaderFilterRegex: '.*'
 AnalyzeTemporaryDtors: false
diff --git a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
index f3a33b91bf..abedc469f0 100644
--- a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
+++ b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
@@ -509,8 +509,8 @@ class DefaultKernelsForStateVector {
      * @param threading Threading context
      * @param memory_model Memory model of the underlying data
      */
-    auto getGateKernelMap(size_t num_qubits, Threading threading,
-                          CPUMemoryModel memory_model) const
+    [[nodiscard]] auto getGateKernelMap(size_t num_qubits, Threading threading,
+                                        CPUMemoryModel memory_model) const
         -> std::unordered_map<Gates::GateOperation, Gates::KernelType> {
         uint32_t dispatch_key = toDispatchKey(threading, memory_model);
 
@@ -532,8 +532,9 @@ class DefaultKernelsForStateVector {
      * @param threading Threading context
      * @param memory_model Memory model of the underlying data
      */
-    auto getGeneratorKernelMap(size_t num_qubits, Threading threading,
-                               CPUMemoryModel memory_model) const
+    [[nodiscard]] auto getGeneratorKernelMap(size_t num_qubits,
+                                             Threading threading,
+                                             CPUMemoryModel memory_model) const
         -> std::unordered_map<Gates::GeneratorOperation, Gates::KernelType> {
         uint32_t dispatch_key = toDispatchKey(threading, memory_model);
 
@@ -556,8 +557,9 @@ class DefaultKernelsForStateVector {
      * @param threading Threading context
      * @param memory_model Memory model of the underlying data
      */
-    auto getMatrixKernelMap(size_t num_qubits, Threading threading,
-                            CPUMemoryModel memory_model) const
+    [[nodiscard]] auto getMatrixKernelMap(size_t num_qubits,
+                                          Threading threading,
+                                          CPUMemoryModel memory_model) const
         -> std::unordered_map<Gates::MatrixOperation, Gates::KernelType> {
         uint32_t dispatch_key = toDispatchKey(threading, memory_model);
 
@@ -578,16 +580,26 @@ class DefaultKernelsForStateVector {
                                   CPUMemoryModel memory_model,
                                   uint32_t priority) {
         uint32_t dispatch_key = toDispatchKey(threading, memory_model);
-        gate_kernel_map_[std::make_pair(gate_op, dispatch_key)].clearPriority(
-            priority);
+        const auto key = std::make_pair(gate_op, dispatch_key);
+
+        const auto iter = generator_kernel_map_.find(key);
+        if (iter == gate_kernel_map_.end()) {
+            return;
+        }
+        iter->clearPriority(priority);
     }
 
     void removeKernelForMatrix(Gates::MatrixOperation mat_op,
                                Threading threading, CPUMemoryModel memory_model,
                                uint32_t priority) {
         uint32_t dispatch_key = toDispatchKey(threading, memory_model);
-        matrix_kernel_map_[std::make_pair(mat_op, dispatch_key)].clearPriority(
-            priority);
+        const auto key = std::make_pair(mat_op, dispatch_key);
+
+        const auto iter = matrix_kernel_map_.find(key);
+        if (iter == matrix_kernel_map_.end()) {
+            return;
+        }
+        iter->clearPriority(priority);
     }
 };
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index 71c92bf772..00d3e7c98c 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -85,20 +85,20 @@ template <typename PrecisionT> class DynamicDispatcher {
                                           const std::vector<size_t> &, bool)>;
 
   private:
-    std::unordered_map<std::string, Gates::GateOperation> str_to_gates_;
-    std::unordered_map<std::string, Gates::GeneratorOperation> str_to_gntrs_;
+    std::unordered_map<std::string, Gates::GateOperation> str_to_gates_{};
+    std::unordered_map<std::string, Gates::GeneratorOperation> str_to_gntrs_{};
 
     std::unordered_map<std::pair<Gates::GateOperation, Gates::KernelType>,
                        GateFunc, Util::PairHash>
-        gates_;
+        gates_{};
 
     std::unordered_map<std::pair<Gates::GeneratorOperation, Gates::KernelType>,
                        GeneratorFunc, Util::PairHash>
-        generators_;
+        generators_{};
 
     std::unordered_map<std::pair<Gates::MatrixOperation, Gates::KernelType>,
                        MatrixFunc, Util::PairHash>
-        matrices_;
+        matrices_{};
 
     constexpr static auto removeGeneratorPrefix(std::string_view op_name)
         -> std::string_view {
@@ -128,11 +128,12 @@ template <typename PrecisionT> class DynamicDispatcher {
         return singleton;
     }
 
-    auto strToGateOp(const std::string &gate_name) const
+    [[nodiscard]] auto strToGateOp(const std::string &gate_name) const
         -> Gates::GateOperation {
         return str_to_gates_.at(gate_name);
     }
-    auto strToGeneratorOp(const std::string &gntr_name) const
+
+    [[nodiscard]] auto strToGeneratorOp(const std::string &gntr_name) const
         -> Gates::GeneratorOperation {
         return str_to_gntrs_.at(gntr_name);
     }
diff --git a/pennylane_lightning/src/simulator/StateVectorCPU.hpp b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
index bb08bf30e1..392957b115 100644
--- a/pennylane_lightning/src/simulator/StateVectorCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
@@ -40,11 +40,11 @@ class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
     using BaseType = StateVectorBase<PrecisionT, Derived>;
 
     std::unordered_map<Gates::GateOperation, Gates::KernelType>
-        kernel_for_gates_;
+        kernel_for_gates_{};
     std::unordered_map<Gates::GeneratorOperation, Gates::KernelType>
-        kernel_for_generators_;
+        kernel_for_generators_{};
     std::unordered_map<Gates::MatrixOperation, Gates::KernelType>
-        kernel_for_matrices_;
+        kernel_for_matrices_{};
 
     void setKernels(size_t num_qubits, Threading threading,
                     CPUMemoryModel memory_model) {
@@ -83,7 +83,9 @@ class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
         return kernel_for_matrices_.at(mat_op);
     }
 
-    inline CPUMemoryModel memoryModel() const { return memory_model_; }
-    inline Threading threading() const { return threading_; }
+    [[nodiscard]] inline CPUMemoryModel memoryModel() const {
+        return memory_model_;
+    }
+    [[nodiscard]] inline Threading threading() const { return threading_; }
 };
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
index 0dfc1eceb8..ecd52275e2 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
@@ -85,7 +85,7 @@ void testGeneratorForGate(RandomEngine &re, size_t num_qubits) {
     using ComplexPrecisionT = std::complex<PrecisionT>;
     constexpr auto I = Util::IMAG<PrecisionT>();
 
-    constexpr ParamT eps = static_cast<ParamT>(1e-4); // For finite difference
+    constexpr auto eps = static_cast<ParamT>(1e-4); // For finite difference
 
     constexpr auto gate_op = static_lookup<gntr_op>(generator_gate_pairs);
     constexpr auto gate_name = static_lookup<gate_op>(Constant::gate_names);
diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
index ea2c20ec9c..e78923b64d 100644
--- a/pennylane_lightning/src/util/Memory.hpp
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -76,6 +76,7 @@ template <class T> struct AlignedAllocator {
         if (alignment_ > alignof(std::max_align_t)) {
             p = alignedAlloc(alignment_, sizeof(T) * size);
         } else {
+            // NOLINTNEXTLINE(hicpp-no-malloc)
             p = malloc(sizeof(T) * size);
         }
         if (p == nullptr) {
@@ -88,6 +89,7 @@ template <class T> struct AlignedAllocator {
         if (alignment_ > alignof(std::max_align_t)) {
             alignedFree(p);
         } else {
+            // NOLINTNEXTLINE(hicpp-no-malloc)
             free(p);
         }
     }

From 838b691cc901ab67f11aaed47fa5f24c0f441cb3 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Thu, 3 Mar 2022 00:01:48 -0500
Subject: [PATCH 26/94] Fix

---
 .../DefaultKernelsForStateVector.hpp          | 22 ++++++++++++++-----
 pennylane_lightning/src/tests/.clang-tidy     |  2 +-
 .../Test_DefaultKernelsForStateVector.cpp     |  6 ++---
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
index abedc469f0..020b0288f8 100644
--- a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
+++ b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
@@ -575,18 +575,30 @@ class DefaultKernelsForStateVector {
         return kernel_for_matrices;
     }
 
-    void removeKernelForGenerator(Gates::GateOperation gate_op,
+    void removeKernelForGate(Gates::GateOperation gate_op, Threading threading,
+                             CPUMemoryModel memory_model, uint32_t priority) {
+        uint32_t dispatch_key = toDispatchKey(threading, memory_model);
+        const auto key = std::make_pair(gate_op, dispatch_key);
+
+        const auto iter = gate_kernel_map_.find(key);
+        if (iter == gate_kernel_map_.end()) {
+            return;
+        }
+        (iter->second).clearPriority(priority);
+    }
+
+    void removeKernelForGenerator(Gates::GeneratorOperation gntr_op,
                                   Threading threading,
                                   CPUMemoryModel memory_model,
                                   uint32_t priority) {
         uint32_t dispatch_key = toDispatchKey(threading, memory_model);
-        const auto key = std::make_pair(gate_op, dispatch_key);
+        const auto key = std::make_pair(gntr_op, dispatch_key);
 
         const auto iter = generator_kernel_map_.find(key);
-        if (iter == gate_kernel_map_.end()) {
+        if (iter == generator_kernel_map_.end()) {
             return;
         }
-        iter->clearPriority(priority);
+        (iter->second).clearPriority(priority);
     }
 
     void removeKernelForMatrix(Gates::MatrixOperation mat_op,
@@ -599,7 +611,7 @@ class DefaultKernelsForStateVector {
         if (iter == matrix_kernel_map_.end()) {
             return;
         }
-        iter->clearPriority(priority);
+        (iter->second).clearPriority(priority);
     }
 };
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/tests/.clang-tidy b/pennylane_lightning/src/tests/.clang-tidy
index 3b5744a4b0..75afabace1 100644
--- a/pennylane_lightning/src/tests/.clang-tidy
+++ b/pennylane_lightning/src/tests/.clang-tidy
@@ -1,5 +1,5 @@
 ---
-Checks:          'clang-diagnostic-*,clang-analyzer-*,-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,-modernize-avoid-c-arrays,-readability-magic-numbers,hicpp-*,-hicpp-no-array-decay,-hicpp-avoid-c-arrays,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions'
+Checks:          '-*,clang-diagnostic-*,clang-analyzer-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,-modernize-avoid-c-arrays,-readability-magic-numbers,hicpp-*,-hicpp-no-array-decay,-hicpp-avoid-c-arrays,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions'
 WarningsAsErrors: '*'
 HeaderFilterRegex: '.*'
 AnalyzeTemporaryDtors: false
diff --git a/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp b/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp
index 663c76e51c..32b9d0a8f4 100644
--- a/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp
+++ b/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp
@@ -92,9 +92,9 @@ TEST_CASE("Test priority works", "[Test_DefaultKernelsForStateVector]") {
                     CPUMemoryModel::Unaligned)[GateOperation::PauliX] ==
                 KernelType::PI);
 
-        instance.removeKernelForGenerator(GateOperation::PauliX,
-                                          Threading::SingleThread,
-                                          CPUMemoryModel::Unaligned, 100);
+        instance.removeKernelForGate(GateOperation::PauliX,
+                                     Threading::SingleThread,
+                                     CPUMemoryModel::Unaligned, 100);
         REQUIRE(instance.getGateKernelMap(
                     24, Threading::SingleThread,
                     CPUMemoryModel::Unaligned)[GateOperation::PauliX] ==

From c09625840a4fc21602ae9566b61afe51bdfeda67 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Thu, 3 Mar 2022 00:26:05 -0500
Subject: [PATCH 27/94] Add omp for clang

---
 .github/workflows/format.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
index 319471b3ca..a4f04e6334 100644
--- a/.github/workflows/format.yml
+++ b/.github/workflows/format.yml
@@ -45,7 +45,7 @@ jobs:
           python-version: 3.8
 
       - name: Install dependencies
-        run: sudo apt update && sudo apt -y install clang-tidy-12 cmake g++
+        run: sudo apt update && sudo apt -y install clang-tidy-12 cmake g++ libomp-dev
         env:
           DEBIAN_FRONTEND: noninteractive
 

From 73a812b96bed9ac882223bbcf318fc012d745ef3 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Thu, 3 Mar 2022 00:32:17 -0500
Subject: [PATCH 28/94] Specify version

---
 .github/workflows/format.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
index a4f04e6334..c7688703ca 100644
--- a/.github/workflows/format.yml
+++ b/.github/workflows/format.yml
@@ -45,7 +45,7 @@ jobs:
           python-version: 3.8
 
       - name: Install dependencies
-        run: sudo apt update && sudo apt -y install clang-tidy-12 cmake g++ libomp-dev
+        run: sudo apt update && sudo apt -y install clang-tidy-12 cmake g++ libomp-12-dev
         env:
           DEBIAN_FRONTEND: noninteractive
 

From 41adfe064a442b454ec3b4e2d14d975570954896 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Thu, 3 Mar 2022 09:09:53 -0500
Subject: [PATCH 29/94] Fix

---
 .../src/simulator/DynamicDispatcher.hpp              | 10 +++++-----
 pennylane_lightning/src/simulator/StateVectorCPU.hpp |  6 +++---
 tests/test_vjp.py                                    | 12 ++++++++++++
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index 00d3e7c98c..47f73e5e5e 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -85,20 +85,20 @@ template <typename PrecisionT> class DynamicDispatcher {
                                           const std::vector<size_t> &, bool)>;
 
   private:
-    std::unordered_map<std::string, Gates::GateOperation> str_to_gates_{};
-    std::unordered_map<std::string, Gates::GeneratorOperation> str_to_gntrs_{};
+    std::unordered_map<std::string, Gates::GateOperation> str_to_gates_;
+    std::unordered_map<std::string, Gates::GeneratorOperation> str_to_gntrs_;
 
     std::unordered_map<std::pair<Gates::GateOperation, Gates::KernelType>,
                        GateFunc, Util::PairHash>
-        gates_{};
+        gates_;
 
     std::unordered_map<std::pair<Gates::GeneratorOperation, Gates::KernelType>,
                        GeneratorFunc, Util::PairHash>
-        generators_{};
+        generators_;
 
     std::unordered_map<std::pair<Gates::MatrixOperation, Gates::KernelType>,
                        MatrixFunc, Util::PairHash>
-        matrices_{};
+        matrices_;
 
     constexpr static auto removeGeneratorPrefix(std::string_view op_name)
         -> std::string_view {
diff --git a/pennylane_lightning/src/simulator/StateVectorCPU.hpp b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
index 392957b115..e0f944ad25 100644
--- a/pennylane_lightning/src/simulator/StateVectorCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
@@ -40,11 +40,11 @@ class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
     using BaseType = StateVectorBase<PrecisionT, Derived>;
 
     std::unordered_map<Gates::GateOperation, Gates::KernelType>
-        kernel_for_gates_{};
+        kernel_for_gates_;
     std::unordered_map<Gates::GeneratorOperation, Gates::KernelType>
-        kernel_for_generators_{};
+        kernel_for_generators_;
     std::unordered_map<Gates::MatrixOperation, Gates::KernelType>
-        kernel_for_matrices_{};
+        kernel_for_matrices_;
 
     void setKernels(size_t num_qubits, Threading threading,
                     CPUMemoryModel memory_model) {
diff --git a/tests/test_vjp.py b/tests/test_vjp.py
index eb73f61a4b..6bc1bfc884 100644
--- a/tests/test_vjp.py
+++ b/tests/test_vjp.py
@@ -108,6 +108,18 @@ def test_zero_dy(self, dev, C):
         vjp = dev.compute_vjp(dy, jac)
         assert np.all(vjp == np.zeros([3]))
 
+    @pytest.mark.parametrize("C", [np.complex64, np.complex128])
+    def test_non_numpy_dy(self, dev, C):
+        "Test when dy is torch.tensor"
+        torch = pytest.importorskip("torch")
+        dev._state = dev._asarray(dev._state, C)
+
+        dy = torch.zeros(2, 2)
+        jac = np.array([[[1.0, 0.1, 0.2], [0.2, 0.6, 0.1]], [[0.4, -0.7, 1.2], [-0.5, -0.6, 0.7]]])
+
+        vjp = dev.compute_vjp(dy, jac)
+        assert torch.equal(vjp, torch.zeros([3], dtype=torch.double))
+
 
 class TestVectorJacobianProduct:
     """Tests for the `vjp` function"""

From 5dd0919a333328865430105bc132054bf0239e92 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Thu, 3 Mar 2022 10:13:28 -0500
Subject: [PATCH 30/94] Trigger


From 63b52c7d2c25852f14fc6fc71b6b7b0ba8adb4cb Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Thu, 3 Mar 2022 14:35:20 -0500
Subject: [PATCH 31/94] Fix

---
 pennylane_lightning/_serialize.py                              | 1 -
 .../src/simulator/DefaultKernelsForStateVector.hpp             | 3 +--
 tests/test_vjp.py                                              | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/pennylane_lightning/_serialize.py b/pennylane_lightning/_serialize.py
index db7d78cb8e..7c4f483c2b 100644
--- a/pennylane_lightning/_serialize.py
+++ b/pennylane_lightning/_serialize.py
@@ -34,7 +34,6 @@
         ObsStructC64,
         StateVectorC128,
         ObsStructC128,
-        DEFAULT_KERNEL_FOR_OPS,
     )
 except ImportError:
     pass
diff --git a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
index 020b0288f8..0dbe545044 100644
--- a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
+++ b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
@@ -108,8 +108,7 @@ class PriorityDispatchSet {
 
 class DefaultKernelsForStateVector {
   private:
-    const static inline std::unordered_map<CPUMemoryModel,
-                                           std::vector<Gates::KernelType>>
+    const std::unordered_map<CPUMemoryModel, std::vector<Gates::KernelType>>
         allowed_kernels{
             {CPUMemoryModel::Unaligned,
              {Gates::KernelType::LM, Gates::KernelType::PI}},
diff --git a/tests/test_vjp.py b/tests/test_vjp.py
index 6bc1bfc884..d944f7bbde 100644
--- a/tests/test_vjp.py
+++ b/tests/test_vjp.py
@@ -110,7 +110,7 @@ def test_zero_dy(self, dev, C):
 
     @pytest.mark.parametrize("C", [np.complex64, np.complex128])
     def test_non_numpy_dy(self, dev, C):
-        "Test when dy is torch.tensor"
+        "Test compute_vjp works when dy is torch.tensor"
         torch = pytest.importorskip("torch")
         dev._state = dev._asarray(dev._state, C)
 

From 1102ad07abb544a497a44591416ef5f2d71e941e Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Thu, 3 Mar 2022 17:44:01 -0500
Subject: [PATCH 32/94] KernelMap refactor

---
 .../src/simulator/CMakeLists.txt              |   2 +-
 .../DefaultKernelsForStateVector.hpp          | 616 ------------------
 .../src/simulator/DynamicDispatcher.hpp       |   6 +-
 .../src/simulator/KernelMap.cpp               | 188 ++++++
 .../src/simulator/KernelMap.hpp               | 282 ++++++++
 .../src/simulator/StateVectorCPU.hpp          |  19 +-
 pennylane_lightning/src/tests/CMakeLists.txt  |   2 +-
 ...sForStateVector.cpp => Test_KernelMap.cpp} |  77 ++-
 .../src/util/IntegerInterval.hpp              |   2 +-
 9 files changed, 531 insertions(+), 663 deletions(-)
 delete mode 100644 pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
 create mode 100644 pennylane_lightning/src/simulator/KernelMap.cpp
 create mode 100644 pennylane_lightning/src/simulator/KernelMap.hpp
 rename pennylane_lightning/src/tests/{Test_DefaultKernelsForStateVector.cpp => Test_KernelMap.cpp} (50%)

diff --git a/pennylane_lightning/src/simulator/CMakeLists.txt b/pennylane_lightning/src/simulator/CMakeLists.txt
index ff07211f3a..452d5353e3 100644
--- a/pennylane_lightning/src/simulator/CMakeLists.txt
+++ b/pennylane_lightning/src/simulator/CMakeLists.txt
@@ -1,7 +1,7 @@
 project(lightning_simulator)
 set(CMAKE_CXX_STANDARD 17)
 
-set(SIMULATOR_FILES DynamicDispatcher.cpp CACHE INTERNAL "" FORCE)
+set(SIMULATOR_FILES DynamicDispatcher.cpp KernelMap.cpp CACHE INTERNAL "" FORCE)
 
 add_library(lightning_simulator STATIC ${SIMULATOR_FILES})
 
diff --git a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp b/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
deleted file mode 100644
index 0dbe545044..0000000000
--- a/pennylane_lightning/src/simulator/DefaultKernelsForStateVector.hpp
+++ /dev/null
@@ -1,616 +0,0 @@
-// Copyright 2022 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-/**
- * @file
- * Set/get Default kernels for statevector
- */
-#include "DispatchKeys.hpp"
-#include "GateOperation.hpp"
-#include "IntegerInterval.hpp"
-#include "KernelType.hpp"
-#include "Util.hpp"
-
-#include <functional>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-
-namespace Pennylane {
-
-///@cond DEV
-struct DispatchElement {
-    uint32_t priority;
-    Util::IntegerInterval<size_t> interval;
-    Gates::KernelType kernel;
-};
-
-inline bool lower_priority(const DispatchElement &lhs,
-                           const DispatchElement &rhs) {
-    return lhs.priority < rhs.priority;
-}
-
-inline bool higher_priority(const DispatchElement &lhs,
-                            const DispatchElement &rhs) {
-    return lhs.priority > rhs.priority;
-}
-
-/**
- * @brief Maintain dispatch element using a vector decreasingly-ordered by
- * priority.
- */
-class PriorityDispatchSet {
-  private:
-    std::vector<DispatchElement> ordered_vec_;
-
-  public:
-    [[nodiscard]] bool
-    conflict(uint32_t test_priority,
-             const Util::IntegerInterval<size_t> &test_interval) const {
-        const auto test_elt = DispatchElement{test_priority, test_interval,
-                                              Gates::KernelType::None};
-        const auto [b, e] =
-            std::equal_range(ordered_vec_.begin(), ordered_vec_.end(), test_elt,
-                             higher_priority);
-        for (auto iter = b; iter != e; ++iter) {
-            if (!is_disjoint(iter->interval, test_interval)) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    void insert(const DispatchElement &elt) {
-        const auto iter_to_insert = std::upper_bound(
-            ordered_vec_.begin(), ordered_vec_.end(), elt, &higher_priority);
-        ordered_vec_.insert(iter_to_insert, elt);
-    }
-
-    template <typename... Ts> void emplace(Ts &&...args) {
-        const auto elt = DispatchElement{std::forward<Ts>(args)...};
-        const auto iter_to_insert = std::upper_bound(
-            ordered_vec_.begin(), ordered_vec_.end(), elt, &higher_priority);
-        ordered_vec_.insert(iter_to_insert, elt);
-    }
-
-    [[nodiscard]] Gates::KernelType getKernel(size_t num_qubits) const {
-        for (const auto &elt : ordered_vec_) {
-            if (elt.interval(num_qubits)) {
-                return elt.kernel;
-            }
-        }
-        throw std::range_error(
-            "Cannot find a kernel for the given number of qubits.");
-    }
-
-    void clearPriority(uint32_t remove_priority) {
-        const auto begin = std::lower_bound(
-            ordered_vec_.begin(), ordered_vec_.end(), remove_priority,
-            [](const auto &elt, uint32_t p) { return elt.priority > p; });
-        const auto end = std::upper_bound(
-            ordered_vec_.begin(), ordered_vec_.end(), remove_priority,
-            [](uint32_t p, const auto &elt) { return p > elt.priority; });
-        ordered_vec_.erase(begin, end);
-    }
-};
-
-///@endcond
-
-class DefaultKernelsForStateVector {
-  private:
-    const std::unordered_map<CPUMemoryModel, std::vector<Gates::KernelType>>
-        allowed_kernels{
-            {CPUMemoryModel::Unaligned,
-             {Gates::KernelType::LM, Gates::KernelType::PI}},
-            {CPUMemoryModel::Aligned256,
-             {Gates::KernelType::LM, Gates::KernelType::PI}},
-            {CPUMemoryModel::Aligned512,
-             {Gates::KernelType::LM, Gates::KernelType::PI}},
-        };
-
-    std::unordered_map<
-        std::pair<Gates::GateOperation, uint32_t /* dispatch_key */>,
-        PriorityDispatchSet, Util::PairHash>
-        gate_kernel_map_;
-
-    std::unordered_map<
-        std::pair<Gates::GeneratorOperation, uint32_t /* dispatch_key */>,
-        PriorityDispatchSet, Util::PairHash>
-        generator_kernel_map_;
-
-    std::unordered_map<
-        std::pair<Gates::MatrixOperation, uint32_t /* dispatch_key */>,
-        PriorityDispatchSet, Util::PairHash>
-        matrix_kernel_map_;
-
-    void registerDefaultGates() {
-        using Gates::GateOperation;
-        using Util::full_domain;
-        using Util::in_between_closed;
-        using Util::larger_than;
-        using Util::larger_than_equal_to;
-        using Util::less_than;
-        using Util::less_than_equal_to;
-
-        auto &instance = *this;
-        auto all_qubit_numbers = full_domain<size_t>();
-        /* Single-qubit gates */
-        instance.assignKernelForGate(GateOperation::PauliX, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::PauliY, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::PauliZ, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::Hadamard, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::S, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::T, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::PhaseShift, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::RX, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::RY, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::RZ, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::Rot, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        /* Two-qubit gates */
-        instance.assignKernelForGate(GateOperation::CNOT, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::CY, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::CZ, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::ControlledPhaseShift,
-                                     all_threading, all_memory_model,
-                                     all_qubit_numbers, Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::SWAP, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-
-        instance.assignKernelForGate(
-            GateOperation::IsingXX, all_threading,
-            // NOLINTNEXTLINE(readability-magic-numbers)
-            all_memory_model, less_than<size_t>(12), Gates::KernelType::LM);
-        instance.assignKernelForGate(
-            GateOperation::IsingXX, all_threading, all_memory_model,
-            // NOLINTNEXTLINE(readability-magic-numbers)
-            in_between_closed<size_t>(12, 20), Gates::KernelType::PI);
-        instance.assignKernelForGate(
-            GateOperation::IsingXX, all_threading,
-            // NOLINTNEXTLINE(readability-magic-numbers)
-            all_memory_model, larger_than<size_t>(20), Gates::KernelType::LM);
-
-        instance.assignKernelForGate(GateOperation::IsingYY, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::IsingZZ, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::CRX, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::CRY, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::CRZ, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::CRot, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-        instance.assignKernelForGate(GateOperation::Toffoli, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::PI);
-        instance.assignKernelForGate(GateOperation::CSWAP, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::PI);
-        instance.assignKernelForGate(GateOperation::MultiRZ, all_threading,
-                                     all_memory_model, all_qubit_numbers,
-                                     Gates::KernelType::LM);
-    }
-
-    void registerDefaultGenerators() {
-        using Gates::GateOperation;
-        using Gates::GeneratorOperation;
-        using Gates::KernelType;
-        using Util::full_domain;
-        using Util::in_between_closed;
-        using Util::larger_than;
-        using Util::larger_than_equal_to;
-        using Util::less_than;
-        using Util::less_than_equal_to;
-
-        auto &instance = *this;
-        auto all_qubit_numbers = full_domain<size_t>();
-
-        instance.assignKernelForGenerator(GeneratorOperation::PhaseShift,
-                                          all_threading, all_memory_model,
-                                          all_qubit_numbers, KernelType::LM);
-        instance.assignKernelForGenerator(GeneratorOperation::RX, all_threading,
-                                          all_memory_model, all_qubit_numbers,
-                                          KernelType::LM);
-        instance.assignKernelForGenerator(GeneratorOperation::RY, all_threading,
-                                          all_memory_model, all_qubit_numbers,
-                                          KernelType::LM);
-        instance.assignKernelForGenerator(GeneratorOperation::RZ, all_threading,
-                                          all_memory_model, all_qubit_numbers,
-                                          KernelType::LM);
-        instance.assignKernelForGenerator(GeneratorOperation::IsingXX,
-                                          all_threading, all_memory_model,
-                                          all_qubit_numbers, KernelType::LM);
-        instance.assignKernelForGenerator(GeneratorOperation::IsingYY,
-                                          all_threading, all_memory_model,
-                                          all_qubit_numbers, KernelType::LM);
-        instance.assignKernelForGenerator(GeneratorOperation::IsingZZ,
-                                          all_threading, all_memory_model,
-                                          all_qubit_numbers, KernelType::LM);
-        instance.assignKernelForGenerator(GeneratorOperation::CRX,
-                                          all_threading, all_memory_model,
-                                          all_qubit_numbers, KernelType::LM);
-        instance.assignKernelForGenerator(GeneratorOperation::CRY,
-                                          all_threading, all_memory_model,
-                                          all_qubit_numbers, KernelType::LM);
-        instance.assignKernelForGenerator(GeneratorOperation::CRZ,
-                                          all_threading, all_memory_model,
-                                          all_qubit_numbers, KernelType::LM);
-        instance.assignKernelForGenerator(
-            GeneratorOperation::ControlledPhaseShift, all_threading,
-            all_memory_model, all_qubit_numbers, KernelType::LM);
-        instance.assignKernelForGenerator(GeneratorOperation::MultiRZ,
-                                          all_threading, all_memory_model,
-                                          all_qubit_numbers, KernelType::LM);
-    }
-
-    void registerDefaultMatrices() {
-        using Gates::GateOperation;
-        using Gates::KernelType;
-        using Gates::MatrixOperation;
-        using Util::full_domain;
-        using Util::in_between_closed;
-        using Util::larger_than;
-        using Util::larger_than_equal_to;
-        using Util::less_than;
-        using Util::less_than_equal_to;
-
-        auto &instance = *this;
-        auto all_qubit_numbers = full_domain<size_t>();
-
-        instance.assignKernelForMatrix(MatrixOperation::SingleQubitOp,
-                                       all_threading, all_memory_model,
-                                       all_qubit_numbers, KernelType::LM);
-        instance.assignKernelForMatrix(MatrixOperation::TwoQubitOp,
-                                       all_threading, all_memory_model,
-                                       all_qubit_numbers, KernelType::LM);
-        instance.assignKernelForMatrix(MatrixOperation::MultiQubitOp,
-                                       all_threading, all_memory_model,
-                                       all_qubit_numbers, KernelType::PI);
-    }
-
-    DefaultKernelsForStateVector() {
-        registerDefaultGates();
-        registerDefaultGenerators();
-        registerDefaultMatrices();
-    }
-
-  public:
-    struct AllThreading {};
-
-    struct AllMemoryModel {};
-
-    constexpr static AllThreading all_threading{};
-    constexpr static AllMemoryModel all_memory_model{};
-
-    static auto getInstance() -> DefaultKernelsForStateVector & {
-        static DefaultKernelsForStateVector instance;
-
-        return instance;
-    }
-
-    void assignKernelForGate(Gates::GateOperation gate_op, Threading threading,
-                             CPUMemoryModel memory_model, uint32_t priority,
-                             const Util::IntegerInterval<size_t> &interval,
-                             Gates::KernelType kernel) {
-        if (std::find(allowed_kernels.at(memory_model).cbegin(),
-                      allowed_kernels.at(memory_model).cend(),
-                      kernel) == allowed_kernels.at(memory_model).cend()) {
-            throw std::invalid_argument("The given kernel is now allowed for "
-                                        "the given memory model.");
-        }
-        const auto dispatch_key = toDispatchKey(threading, memory_model);
-        auto &set = gate_kernel_map_[std::make_pair(gate_op, dispatch_key)];
-
-        if (set.conflict(priority, interval)) {
-            throw std::invalid_argument("The given interval conflicts with "
-                                        "existing intervals.");
-        }
-        set.emplace(priority, interval, kernel);
-    }
-
-    void assignKernelForGate(Gates::GateOperation gate_op,
-                             [[maybe_unused]] AllThreading dummy,
-                             CPUMemoryModel memory_model,
-                             const Util::IntegerInterval<size_t> &interval,
-                             Gates::KernelType kernel) {
-        /* Priority for all threading is 1 */
-        Util::for_each_enum<Threading>([=](Threading threading) {
-            assignKernelForGate(gate_op, threading, memory_model, 1, interval,
-                                kernel);
-        });
-    }
-
-    void assignKernelForGate(Gates::GateOperation gate_op, Threading threading,
-                             [[maybe_unused]] AllMemoryModel dummy,
-                             const Util::IntegerInterval<size_t> &interval,
-                             Gates::KernelType kernel) {
-        /* Priority for all memory model is 2 */
-        Util::for_each_enum<CPUMemoryModel>([=](CPUMemoryModel memory_model) {
-            assignKernelForGate(gate_op, threading, memory_model, 2, interval,
-                                kernel);
-        });
-    }
-
-    void assignKernelForGate(Gates::GateOperation gate_op,
-                             [[maybe_unused]] AllThreading dummy1,
-                             [[maybe_unused]] AllMemoryModel dummy2,
-                             const Util::IntegerInterval<size_t> &interval,
-                             Gates::KernelType kernel) {
-        /* Priority is 0 */
-        Util::for_each_enum<Threading, CPUMemoryModel>(
-            [=](Threading threading, CPUMemoryModel memory_model) {
-                assignKernelForGate(gate_op, threading, memory_model, 0,
-                                    interval, kernel);
-            });
-    }
-
-    void assignKernelForGenerator(Gates::GeneratorOperation gntr_op,
-                                  Threading threading,
-                                  CPUMemoryModel memory_model,
-                                  uint32_t priority,
-                                  const Util::IntegerInterval<size_t> &interval,
-                                  Gates::KernelType kernel) {
-        if (std::find(allowed_kernels.at(memory_model).cbegin(),
-                      allowed_kernels.at(memory_model).cend(),
-                      kernel) == allowed_kernels.at(memory_model).cend()) {
-            throw std::invalid_argument("The given kernel is now allowed for "
-                                        "the given memory model.");
-        }
-        const auto dispatch_key = toDispatchKey(threading, memory_model);
-        auto &set =
-            generator_kernel_map_[std::make_pair(gntr_op, dispatch_key)];
-
-        if (set.conflict(priority, interval)) {
-            throw std::invalid_argument("The given interval conflicts with "
-                                        "existing intervals.");
-        }
-        set.emplace(priority, interval, kernel);
-    }
-
-    void assignKernelForGenerator(Gates::GeneratorOperation gntr_op,
-                                  [[maybe_unused]] AllThreading dummy,
-                                  CPUMemoryModel memory_model,
-                                  const Util::IntegerInterval<size_t> &interval,
-                                  Gates::KernelType kernel) {
-        Util::for_each_enum<Threading>([=](Threading threading) {
-            assignKernelForGenerator(gntr_op, threading, memory_model, 1,
-                                     interval, kernel);
-        });
-    }
-
-    void assignKernelForGenerator(Gates::GeneratorOperation gntr_op,
-                                  Threading threading,
-                                  [[maybe_unused]] AllMemoryModel dummy,
-                                  const Util::IntegerInterval<size_t> &interval,
-                                  Gates::KernelType kernel) {
-        Util::for_each_enum<CPUMemoryModel>([=](CPUMemoryModel memory_model) {
-            assignKernelForGenerator(gntr_op, threading, memory_model, 2,
-                                     interval, kernel);
-        });
-    }
-
-    void assignKernelForGenerator(Gates::GeneratorOperation gntr_op,
-                                  [[maybe_unused]] AllThreading dummy1,
-                                  [[maybe_unused]] AllMemoryModel dummy2,
-                                  const Util::IntegerInterval<size_t> &interval,
-                                  Gates::KernelType kernel) {
-        Util::for_each_enum<Threading, CPUMemoryModel>(
-            [=](Threading threading, CPUMemoryModel memory_model) {
-                assignKernelForGenerator(gntr_op, threading, memory_model, 0,
-                                         interval, kernel);
-            });
-    }
-
-    void assignKernelForMatrix(Gates::MatrixOperation mat_op,
-                               Threading threading, CPUMemoryModel memory_model,
-                               uint32_t priority,
-                               const Util::IntegerInterval<size_t> &interval,
-                               Gates::KernelType kernel) {
-        if (std::find(allowed_kernels.at(memory_model).cbegin(),
-                      allowed_kernels.at(memory_model).cend(),
-                      kernel) == allowed_kernels.at(memory_model).cend()) {
-            throw std::invalid_argument("The given kernel is now allowed for "
-                                        "the given memory model.");
-        }
-        const auto dispatch_key = toDispatchKey(threading, memory_model);
-        auto &set = matrix_kernel_map_[std::make_pair(mat_op, dispatch_key)];
-
-        if (set.conflict(priority, interval)) {
-            throw std::invalid_argument("The given interval conflicts with "
-                                        "existing intervals.");
-        }
-        set.emplace(priority, interval, kernel);
-    }
-
-    void assignKernelForMatrix(Gates::MatrixOperation mat_op,
-                               [[maybe_unused]] AllThreading dummy,
-                               CPUMemoryModel memory_model,
-                               const Util::IntegerInterval<size_t> &interval,
-                               Gates::KernelType kernel) {
-        Util::for_each_enum<Threading>([=](Threading threading) {
-            assignKernelForMatrix(mat_op, threading, memory_model, 1, interval,
-                                  kernel);
-        });
-    }
-
-    void assignKernelForMatrix(Gates::MatrixOperation mat_op,
-                               Threading threading,
-                               [[maybe_unused]] AllMemoryModel dummy,
-                               const Util::IntegerInterval<size_t> &interval,
-                               Gates::KernelType kernel) {
-        Util::for_each_enum<CPUMemoryModel>([=](CPUMemoryModel memory_model) {
-            assignKernelForMatrix(mat_op, threading, memory_model, 2, interval,
-                                  kernel);
-        });
-    }
-
-    void assignKernelForMatrix(Gates::MatrixOperation mat_op,
-                               [[maybe_unused]] AllThreading dummy1,
-                               [[maybe_unused]] AllMemoryModel dummy2,
-                               const Util::IntegerInterval<size_t> &interval,
-                               Gates::KernelType kernel) {
-        Util::for_each_enum<Threading, CPUMemoryModel>(
-            [=](Threading threading, CPUMemoryModel memory_model) {
-                assignKernelForMatrix(mat_op, threading, memory_model, 0,
-                                      interval, kernel);
-            });
-    }
-
-    /**
-     * @brief Create default kernels for all gates
-     * @param num_qubits Number of qubits
-     * @param threading Threading context
-     * @param memory_model Memory model of the underlying data
-     */
-    [[nodiscard]] auto getGateKernelMap(size_t num_qubits, Threading threading,
-                                        CPUMemoryModel memory_model) const
-        -> std::unordered_map<Gates::GateOperation, Gates::KernelType> {
-        uint32_t dispatch_key = toDispatchKey(threading, memory_model);
-
-        std::unordered_map<Gates::GateOperation, Gates::KernelType>
-            kernel_for_gates;
-
-        Util::for_each_enum<Gates::GateOperation>(
-            [&](Gates::GateOperation gate_op) {
-                const auto key = std::make_pair(gate_op, dispatch_key);
-                const auto &set = gate_kernel_map_.at(key);
-                kernel_for_gates.emplace(gate_op, set.getKernel(num_qubits));
-            });
-        return kernel_for_gates;
-    }
-
-    /**
-     * @brief Create default kernels for all generators
-     * @param num_qubits Number of qubits
-     * @param threading Threading context
-     * @param memory_model Memory model of the underlying data
-     */
-    [[nodiscard]] auto getGeneratorKernelMap(size_t num_qubits,
-                                             Threading threading,
-                                             CPUMemoryModel memory_model) const
-        -> std::unordered_map<Gates::GeneratorOperation, Gates::KernelType> {
-        uint32_t dispatch_key = toDispatchKey(threading, memory_model);
-
-        std::unordered_map<Gates::GeneratorOperation, Gates::KernelType>
-            kernel_for_generators;
-
-        Util::for_each_enum<Gates::GeneratorOperation>(
-            [&](Gates::GeneratorOperation gntr_op) {
-                const auto key = std::make_pair(gntr_op, dispatch_key);
-                const auto &set = generator_kernel_map_.at(key);
-                kernel_for_generators.emplace(gntr_op,
-                                              set.getKernel(num_qubits));
-            });
-        return kernel_for_generators;
-    }
-
-    /**
-     * @brief Create default kernels for all matrix operations
-     * @param num_qubits Number of qubits
-     * @param threading Threading context
-     * @param memory_model Memory model of the underlying data
-     */
-    [[nodiscard]] auto getMatrixKernelMap(size_t num_qubits,
-                                          Threading threading,
-                                          CPUMemoryModel memory_model) const
-        -> std::unordered_map<Gates::MatrixOperation, Gates::KernelType> {
-        uint32_t dispatch_key = toDispatchKey(threading, memory_model);
-
-        std::unordered_map<Gates::MatrixOperation, Gates::KernelType>
-            kernel_for_matrices;
-
-        Util::for_each_enum<Gates::MatrixOperation>(
-            [&](Gates::MatrixOperation mat_op) {
-                const auto key = std::make_pair(mat_op, dispatch_key);
-                const auto &set = matrix_kernel_map_.at(key);
-                kernel_for_matrices.emplace(mat_op, set.getKernel(num_qubits));
-            });
-        return kernel_for_matrices;
-    }
-
-    void removeKernelForGate(Gates::GateOperation gate_op, Threading threading,
-                             CPUMemoryModel memory_model, uint32_t priority) {
-        uint32_t dispatch_key = toDispatchKey(threading, memory_model);
-        const auto key = std::make_pair(gate_op, dispatch_key);
-
-        const auto iter = gate_kernel_map_.find(key);
-        if (iter == gate_kernel_map_.end()) {
-            return;
-        }
-        (iter->second).clearPriority(priority);
-    }
-
-    void removeKernelForGenerator(Gates::GeneratorOperation gntr_op,
-                                  Threading threading,
-                                  CPUMemoryModel memory_model,
-                                  uint32_t priority) {
-        uint32_t dispatch_key = toDispatchKey(threading, memory_model);
-        const auto key = std::make_pair(gntr_op, dispatch_key);
-
-        const auto iter = generator_kernel_map_.find(key);
-        if (iter == generator_kernel_map_.end()) {
-            return;
-        }
-        (iter->second).clearPriority(priority);
-    }
-
-    void removeKernelForMatrix(Gates::MatrixOperation mat_op,
-                               Threading threading, CPUMemoryModel memory_model,
-                               uint32_t priority) {
-        uint32_t dispatch_key = toDispatchKey(threading, memory_model);
-        const auto key = std::make_pair(mat_op, dispatch_key);
-
-        const auto iter = matrix_kernel_map_.find(key);
-        if (iter == matrix_kernel_map_.end()) {
-            return;
-        }
-        (iter->second).clearPriority(priority);
-    }
-};
-} // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index 47f73e5e5e..d4cd500bd7 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -53,14 +53,14 @@ namespace Pennylane {
  * @brief These functions are only used to register kernels to the dynamic
  * dispatcher.
  */
-template <class PrecisionT, class ParamT> struct registerBeforeMain;
+template <class PrecisionT, class ParamT> struct RegisterBeforeMain;
 
-template <> struct registerBeforeMain<float, float> {
+template <> struct RegisterBeforeMain<float, float> {
     static inline const int dummy =
         Internal::registerAllAvailableKernels<float, float>();
 };
 
-template <> struct registerBeforeMain<double, double> {
+template <> struct RegisterBeforeMain<double, double> {
     static inline const int dummy =
         Internal::registerAllAvailableKernels<double, double>();
 };
diff --git a/pennylane_lightning/src/simulator/KernelMap.cpp b/pennylane_lightning/src/simulator/KernelMap.cpp
new file mode 100644
index 0000000000..73f5a75d88
--- /dev/null
+++ b/pennylane_lightning/src/simulator/KernelMap.cpp
@@ -0,0 +1,188 @@
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "KernelMap.hpp"
+
+#include "GateOperation.hpp"
+#include "KernelType.hpp"
+
+using namespace Pennylane;
+using namespace Pennylane::KernelMap;
+
+using Gates::GateOperation;
+using Gates::GeneratorOperation;
+using Gates::KernelType;
+using Gates::MatrixOperation;
+using Util::full_domain;
+using Util::in_between_closed;
+using Util::larger_than;
+using Util::larger_than_equal_to;
+using Util::less_than;
+using Util::less_than_equal_to;
+
+namespace Pennylane::KernelMap::Internal {
+
+constexpr static auto all_qubit_numbers = Util::full_domain<size_t>();
+
+int assignDefaultKernelsForGateOp() {
+    auto &instance = OperationKernelMap<GateOperation>::getInstance();
+    // Register the default kernel of every gate for all dispatch keys.
+    instance.assignKernelForOp(GateOperation::PauliX, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::PauliY, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::PauliZ, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::Hadamard, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::S, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::T, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::PhaseShift, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::RX, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::RY, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::RZ, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::Rot, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    /* Two-qubit gates */
+    instance.assignKernelForOp(GateOperation::CNOT, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::CY, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::CZ, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::ControlledPhaseShift,
+                               all_threading, all_memory_model,
+                               all_qubit_numbers, Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::SWAP, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    // IsingXX: LM below 12 qubits, PI for 12 to 20 (closed), LM above 20.
+    instance.assignKernelForOp(GateOperation::IsingXX, all_threading,
+                               // NOLINTNEXTLINE(readability-magic-numbers)
+                               all_memory_model, less_than<size_t>(12),
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(
+        GateOperation::IsingXX, all_threading, all_memory_model,
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        in_between_closed<size_t>(12, 20), Gates::KernelType::PI);
+    instance.assignKernelForOp(GateOperation::IsingXX, all_threading,
+                               // NOLINTNEXTLINE(readability-magic-numbers)
+                               all_memory_model, larger_than<size_t>(20),
+                               Gates::KernelType::LM);
+    // Remaining gates use LM, except Toffoli and CSWAP which use PI.
+    instance.assignKernelForOp(GateOperation::IsingYY, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::IsingZZ, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::CRX, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::CRY, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::CRZ, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::CRot, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    instance.assignKernelForOp(GateOperation::Toffoli, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::PI);
+    instance.assignKernelForOp(GateOperation::CSWAP, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::PI);
+    instance.assignKernelForOp(GateOperation::MultiRZ, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               Gates::KernelType::LM);
+    return 1; // only used to initialize AssignKernelForOp<...>::dummy
+}
+
+int assignDefaultKernelsForGeneratorOp() {
+    auto &instance = OperationKernelMap<GeneratorOperation>::getInstance();
+    // Every generator below defaults to LM for all dispatch keys and sizes.
+    instance.assignKernelForOp(GeneratorOperation::PhaseShift, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    instance.assignKernelForOp(GeneratorOperation::RX, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    instance.assignKernelForOp(GeneratorOperation::RY, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    instance.assignKernelForOp(GeneratorOperation::RZ, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    instance.assignKernelForOp(GeneratorOperation::IsingXX, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    instance.assignKernelForOp(GeneratorOperation::IsingYY, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    instance.assignKernelForOp(GeneratorOperation::IsingZZ, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    instance.assignKernelForOp(GeneratorOperation::CRX, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    instance.assignKernelForOp(GeneratorOperation::CRY, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    instance.assignKernelForOp(GeneratorOperation::CRZ, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    instance.assignKernelForOp(GeneratorOperation::ControlledPhaseShift,
+                               all_threading, all_memory_model,
+                               all_qubit_numbers, KernelType::LM);
+    instance.assignKernelForOp(GeneratorOperation::MultiRZ, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    return 1; // only used to initialize AssignKernelForOp<...>::dummy
+}
+int assignDefaultKernelsForMatrixOp() {
+    auto &instance = OperationKernelMap<MatrixOperation>::getInstance();
+    // Single- and two-qubit matrices default to LM; multi-qubit ones to PI.
+    instance.assignKernelForOp(MatrixOperation::SingleQubitOp, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    instance.assignKernelForOp(MatrixOperation::TwoQubitOp, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::LM);
+    instance.assignKernelForOp(MatrixOperation::MultiQubitOp, all_threading,
+                               all_memory_model, all_qubit_numbers,
+                               KernelType::PI);
+    return 1; // only used to initialize AssignKernelForOp<...>::dummy
+}
+} // namespace Pennylane::KernelMap::Internal
diff --git a/pennylane_lightning/src/simulator/KernelMap.hpp b/pennylane_lightning/src/simulator/KernelMap.hpp
new file mode 100644
index 0000000000..51532b9c8c
--- /dev/null
+++ b/pennylane_lightning/src/simulator/KernelMap.hpp
@@ -0,0 +1,290 @@
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/**
+ * @file
+ * Set/get Default kernels for statevector
+ */
+#pragma once
+
+#include "DispatchKeys.hpp"
+#include "GateOperation.hpp"
+#include "IntegerInterval.hpp"
+#include "KernelType.hpp"
+#include "Util.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <functional>
+#include <stdexcept>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+namespace Pennylane::KernelMap {
+///@cond DEV
+namespace Internal {
+// Register the default kernels for each operation type (see KernelMap.cpp).
+int assignDefaultKernelsForGateOp();
+int assignDefaultKernelsForGeneratorOp();
+int assignDefaultKernelsForMatrixOp();
+// Each specialization's `dummy` initializer calls the matching function above.
+template <class Operation> struct AssignKernelForOp;
+
+template <> struct AssignKernelForOp<Gates::GateOperation> {
+    static inline const int dummy = assignDefaultKernelsForGateOp();
+};
+template <> struct AssignKernelForOp<Gates::GeneratorOperation> {
+    static inline const int dummy = assignDefaultKernelsForGeneratorOp();
+};
+template <> struct AssignKernelForOp<Gates::MatrixOperation> {
+    static inline const int dummy = assignDefaultKernelsForMatrixOp();
+};
+} // namespace Internal
+///@endcond
+
+///@cond DEV
+struct DispatchElement { // one kernel rule stored in PriorityDispatchSet
+    uint32_t priority; // larger values are consulted first
+    Util::IntegerInterval<size_t> interval; // qubit counts this rule covers
+    Gates::KernelType kernel; // kernel returned when the rule matches
+};
+// True if lhs has strictly lower priority than rhs.
+inline bool lower_priority(const DispatchElement &lhs,
+                           const DispatchElement &rhs) {
+    return lhs.priority < rhs.priority;
+}
+// True if lhs has strictly higher priority than rhs (descending sort order).
+inline bool higher_priority(const DispatchElement &lhs,
+                            const DispatchElement &rhs) {
+    return lhs.priority > rhs.priority;
+}
+
+/**
+ * @brief Dispatch elements kept in a vector ordered by decreasing priority.
+ * conflict() reports an equal-priority element overlapping a given interval.
+ */
+class PriorityDispatchSet {
+  private:
+    std::vector<DispatchElement> ordered_vec_;
+
+  public:
+    [[nodiscard]] bool
+    conflict(uint32_t test_priority,
+             const Util::IntegerInterval<size_t> &test_interval) const {
+        const auto test_elt = DispatchElement{test_priority, test_interval,
+                                              Gates::KernelType::None};
+        const auto [b, e] =
+            std::equal_range(ordered_vec_.begin(), ordered_vec_.end(), test_elt,
+                             higher_priority);
+        for (auto iter = b; iter != e; ++iter) {
+            if (!is_disjoint(iter->interval, test_interval)) {
+                return true;
+            }
+        }
+        return false;
+    }
+    // Insert elt after any existing elements of the same priority.
+    void insert(const DispatchElement &elt) {
+        const auto iter_to_insert = std::upper_bound(
+            ordered_vec_.begin(), ordered_vec_.end(), elt, &higher_priority);
+        ordered_vec_.insert(iter_to_insert, elt);
+    }
+    // Build a DispatchElement from args and place it the same way as insert().
+    template <typename... Ts> void emplace(Ts &&...args) {
+        const auto elt = DispatchElement{std::forward<Ts>(args)...};
+        const auto iter_to_insert = std::upper_bound(
+            ordered_vec_.begin(), ordered_vec_.end(), elt, &higher_priority);
+        ordered_vec_.insert(iter_to_insert, elt);
+    }
+    // Kernel of the first element whose interval accepts num_qubits, else throw.
+    [[nodiscard]] Gates::KernelType getKernel(size_t num_qubits) const {
+        for (const auto &elt : ordered_vec_) {
+            if (elt.interval(num_qubits)) {
+                return elt.kernel;
+            }
+        }
+        throw std::range_error(
+            "Cannot find a kernel for the given number of qubits.");
+    }
+    // Erase every element whose priority equals remove_priority.
+    void clearPriority(uint32_t remove_priority) {
+        const auto begin = std::lower_bound(
+            ordered_vec_.begin(), ordered_vec_.end(), remove_priority,
+            [](const auto &elt, uint32_t p) { return elt.priority > p; });
+        const auto end = std::upper_bound(
+            ordered_vec_.begin(), ordered_vec_.end(), remove_priority,
+            [](uint32_t p, const auto &elt) { return p > elt.priority; });
+        ordered_vec_.erase(begin, end);
+    }
+};
+
+///@endcond
+
+struct AllThreading {};   // tag type: selects every Threading value
+struct AllMemoryModel {}; // tag type: selects every CPUMemoryModel value
+// Tag instances to pass to the OperationKernelMap::assignKernelForOp overloads.
+constexpr static AllThreading all_threading{};
+constexpr static AllMemoryModel all_memory_model{};
+
+/**
+ * @brief Singleton holding the kernel assignments used by the statevector.
+ *
+ * For a given number of qubits, threading, and memory model, this class
+ * returns the registered kernel for each operation of type Operation.
+ */
+template <class Operation, size_t cache_size = 16> class OperationKernelMap {
+  public:
+    using EnumDispatchKernalMap =
+        std::unordered_map<std::pair<Operation, uint32_t /* dispatch_key */>,
+                           PriorityDispatchSet, Util::PairHash>;
+    using EnumKernelMap = std::unordered_map<Operation, Gates::KernelType>;
+
+  private:
+    EnumDispatchKernalMap kernel_map_;
+    mutable std::deque<std::tuple<size_t, uint32_t, EnumKernelMap>> cache_;
+
+    /**
+     * @brief Allowed kernels for a given memory model
+     */
+    const std::unordered_map<CPUMemoryModel, std::vector<Gates::KernelType>>
+        allowed_kernels_;
+
+    OperationKernelMap()
+        : allowed_kernels_{
+              {CPUMemoryModel::Unaligned,
+               {Gates::KernelType::LM, Gates::KernelType::PI}},
+              {CPUMemoryModel::Aligned256,
+               {Gates::KernelType::LM, Gates::KernelType::PI}},
+              {CPUMemoryModel::Aligned512,
+               {Gates::KernelType::LM, Gates::KernelType::PI}},
+          } {}
+
+  public:
+    static auto getInstance() -> OperationKernelMap & {
+        static OperationKernelMap instance;
+
+        return instance;
+    }
+    // Register a kernel rule; throws std::invalid_argument if it is rejected.
+    void assignKernelForOp(Operation op, Threading threading,
+                           CPUMemoryModel memory_model, uint32_t priority,
+                           const Util::IntegerInterval<size_t> &interval,
+                           Gates::KernelType kernel) {
+        const auto &allowed = allowed_kernels_.at(memory_model);
+        if (std::find(allowed.cbegin(), allowed.cend(), kernel) ==
+            allowed.cend()) {
+            throw std::invalid_argument("The given kernel is not allowed for "
+                                        "the given memory model.");
+        }
+        const auto dispatch_key = toDispatchKey(threading, memory_model);
+        auto &set = kernel_map_[std::make_pair(op, dispatch_key)];
+
+        if (set.conflict(priority, interval)) {
+            throw std::invalid_argument("The given interval conflicts with "
+                                        "existing intervals.");
+        }
+
+        // Cached maps may no longer reflect kernel_map_, so drop them.
+        cache_.clear();
+
+        set.emplace(priority, interval, kernel);
+    }
+    // Overload applying the rule to every Threading value.
+    void assignKernelForOp(Operation op, [[maybe_unused]] AllThreading dummy,
+                           CPUMemoryModel memory_model,
+                           const Util::IntegerInterval<size_t> &interval,
+                           Gates::KernelType kernel) {
+        /* Priority for all threading is 1 */
+        Util::for_each_enum<Threading>([=](Threading threading) {
+            assignKernelForOp(op, threading, memory_model, 1, interval, kernel);
+        });
+    }
+    // Overload applying the rule to every CPUMemoryModel value.
+    void assignKernelForOp(Operation op, Threading threading,
+                           [[maybe_unused]] AllMemoryModel dummy,
+                           const Util::IntegerInterval<size_t> &interval,
+                           Gates::KernelType kernel) {
+        /* Priority for all memory model is 2 */
+        Util::for_each_enum<CPUMemoryModel>([=](CPUMemoryModel memory_model) {
+            assignKernelForOp(op, threading, memory_model, 2, interval, kernel);
+        });
+    }
+    // Overload applying the rule to every (Threading, CPUMemoryModel) pair.
+    void assignKernelForOp(Operation op, [[maybe_unused]] AllThreading dummy1,
+                           [[maybe_unused]] AllMemoryModel dummy2,
+                           const Util::IntegerInterval<size_t> &interval,
+                           Gates::KernelType kernel) {
+        /* Priority for all threading and all memory models is 0 */
+        Util::for_each_enum<Threading, CPUMemoryModel>(
+            [=](Threading threading, CPUMemoryModel memory_model) {
+                assignKernelForOp(op, threading, memory_model, 0, interval,
+                                  kernel);
+            });
+    }
+    // Remove the rules of one priority; does nothing if the key is absent.
+    void removeKernelForOp(Operation op, Threading threading,
+                           CPUMemoryModel memory_model, uint32_t priority) {
+        uint32_t dispatch_key = toDispatchKey(threading, memory_model);
+        const auto key = std::make_pair(op, dispatch_key);
+
+        const auto iter = kernel_map_.find(key);
+        if (iter == kernel_map_.end()) {
+            return;
+        }
+        (iter->second).clearPriority(priority);
+
+        // Cached maps may no longer reflect kernel_map_, so drop them.
+        cache_.clear();
+    }
+
+    /**
+     * @brief Return the kernel selected for every operation (cached).
+     * @param num_qubits Number of qubits
+     * @param threading Threading context
+     * @param memory_model Memory model of the underlying data
+     * @return Map from each Operation value to its kernel.
+     */
+    [[nodiscard]] auto getKernelMap(size_t num_qubits, Threading threading,
+                                    CPUMemoryModel memory_model) const
+        -> EnumKernelMap {
+        // cache_ is not synchronized: add a mutex before multithreaded use.
+        const uint32_t dispatch_key = toDispatchKey(threading, memory_model);
+        // Linear search of the FIFO cache (holds at most cache_size entries).
+        const auto cache_iter =
+            std::find_if(cache_.begin(), cache_.end(), [=](const auto &elt) {
+                return (std::get<0>(elt) == num_qubits) &&
+                       (std::get<1>(elt) == dispatch_key);
+            });
+        if (cache_iter == cache_.end()) {
+            std::unordered_map<Operation, Gates::KernelType> kernel_for_op;
+            // at()/getKernel() throw if an operation has no matching rule.
+            Util::for_each_enum<Operation>([&](Operation op) {
+                const auto key = std::make_pair(op, dispatch_key);
+                const auto &set = kernel_map_.at(key);
+                kernel_for_op.emplace(op, set.getKernel(num_qubits));
+            });
+            if (cache_.size() == cache_size) {
+                cache_.pop_front();
+            }
+            cache_.emplace_back(num_qubits, dispatch_key, kernel_for_op);
+            return kernel_for_op;
+        }
+        return std::get<2>(*cache_iter);
+    }
+};
+} // namespace Pennylane::KernelMap
diff --git a/pennylane_lightning/src/simulator/StateVectorCPU.hpp b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
index e0f944ad25..ab5d55a800 100644
--- a/pennylane_lightning/src/simulator/StateVectorCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
@@ -11,9 +11,9 @@
 #pragma once
 
 #include "BitUtil.hpp"
-#include "DefaultKernelsForStateVector.hpp"
 #include "DispatchKeys.hpp"
 #include "Gates.hpp"
+#include "KernelMap.hpp"
 #include "KernelType.hpp"
 #include "Memory.hpp"
 #include "StateVectorBase.hpp"
@@ -48,13 +48,16 @@ class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
 
     void setKernels(size_t num_qubits, Threading threading,
                     CPUMemoryModel memory_model) {
-        auto &default_kernels = DefaultKernelsForStateVector::getInstance();
-        kernel_for_gates_ = default_kernels.getGateKernelMap(
-            num_qubits, threading, memory_model);
-        kernel_for_generators_ = default_kernels.getGeneratorKernelMap(
-            num_qubits, threading, memory_model);
-        kernel_for_matrices_ = default_kernels.getMatrixKernelMap(
-            num_qubits, threading, memory_model);
+        using KernelMap::OperationKernelMap;
+        kernel_for_gates_ =
+            OperationKernelMap<Gates::GateOperation>::getInstance()
+                .getKernelMap(num_qubits, threading, memory_model);
+        kernel_for_generators_ =
+            OperationKernelMap<Gates::GeneratorOperation>::getInstance()
+                .getKernelMap(num_qubits, threading, memory_model);
+        kernel_for_matrices_ =
+            OperationKernelMap<Gates::MatrixOperation>::getInstance()
+                .getKernelMap(num_qubits, threading, memory_model);
     }
 
   protected:
diff --git a/pennylane_lightning/src/tests/CMakeLists.txt b/pennylane_lightning/src/tests/CMakeLists.txt
index 64ebd3a39d..b5cceb02be 100644
--- a/pennylane_lightning/src/tests/CMakeLists.txt
+++ b/pennylane_lightning/src/tests/CMakeLists.txt
@@ -71,7 +71,6 @@ target_link_libraries(compile_time_tests lightning_gates lightning_utils)
 set(TEST_SOURCES CreateAllWires.cpp
                  Test_AdjDiff.cpp
                  Test_DynamicDispatcher.cpp
-                 Test_DefaultKernelsForStateVector.cpp
                  Test_GateImplementations_CompareKernels.cpp
                  Test_GateImplementations_Generator.cpp
                  Test_GateImplementations_Inverse.cpp
@@ -80,6 +79,7 @@ set(TEST_SOURCES CreateAllWires.cpp
                  Test_GateImplementations_Param.cpp
                  Test_GateUtil.cpp
                  Test_Internal.cpp
+                 Test_KernelMap.cpp
                  Test_Measures.cpp
                  Test_OpToMemberFuncPtr.cpp
                  Test_StateVectorCPU.cpp
diff --git a/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp b/pennylane_lightning/src/tests/Test_KernelMap.cpp
similarity index 50%
rename from pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp
rename to pennylane_lightning/src/tests/Test_KernelMap.cpp
index 32b9d0a8f4..39c5daa024 100644
--- a/pennylane_lightning/src/tests/Test_DefaultKernelsForStateVector.cpp
+++ b/pennylane_lightning/src/tests/Test_KernelMap.cpp
@@ -1,55 +1,67 @@
 #include "Constant.hpp"
 #include "ConstantUtil.hpp"
-#include "DefaultKernelsForStateVector.hpp"
+#include "KernelMap.hpp"
 #include "Util.hpp"
 
 #include <catch2/catch.hpp>
 
 using namespace Pennylane;
+using namespace Pennylane::KernelMap;
 
 TEST_CASE("Test default kernels for gates are well defined",
-          "[Test_DefaultKernelsForStateVector]") {
-    auto &instance = DefaultKernelsForStateVector::getInstance();
+          "[Test_KernelMap]") {
+    auto &instance = OperationKernelMap<Gates::GateOperation>::getInstance();
     Util::for_each_enum<Threading, CPUMemoryModel>(
         [&instance](Threading threading, CPUMemoryModel memory_model) {
             for (size_t num_qubits = 1; num_qubits < 27; num_qubits++) {
-                REQUIRE_NOTHROW(instance.getGateKernelMap(num_qubits, threading,
-                                                          memory_model));
+                REQUIRE_NOTHROW(
+                    instance.getKernelMap(num_qubits, threading, memory_model));
             }
         });
 }
 
 TEST_CASE("Test default kernels for generators are well defined",
-          "[Test_DefaultKernelsForStateVector]") {
-    auto &instance = DefaultKernelsForStateVector::getInstance();
+          "[Test_KernelMap]") {
+    auto &instance =
+        OperationKernelMap<Gates::GeneratorOperation>::getInstance();
     Util::for_each_enum<Threading, CPUMemoryModel>(
         [&instance](Threading threading, CPUMemoryModel memory_model) {
             for (size_t num_qubits = 1; num_qubits < 27; num_qubits++) {
-                REQUIRE_NOTHROW(instance.getGeneratorKernelMap(
-                    num_qubits, threading, memory_model));
+                REQUIRE_NOTHROW(
+                    instance.getKernelMap(num_qubits, threading, memory_model));
             }
         });
 }
 
-TEST_CASE("Test unallowed kernel", "[Test_DefaultKernelsForStateVector]") {
+TEST_CASE("Test default kernels for matrix operation are well defined",
+          "[Test_KernelMap]") {
+    auto &instance = OperationKernelMap<Gates::MatrixOperation>::getInstance();
+    Util::for_each_enum<Threading, CPUMemoryModel>(
+        [&instance](Threading threading, CPUMemoryModel memory_model) {
+            for (size_t num_qubits = 1; num_qubits < 27; num_qubits++) {
+                REQUIRE_NOTHROW(
+                    instance.getKernelMap(num_qubits, threading, memory_model));
+            }
+        });
+}
+
+TEST_CASE("Test unallowed kernel", "[Test_KernelMap]") {
     using Gates::GateOperation;
-    using Gates::GeneratorOperation;
     using Gates::KernelType;
-    auto &instance = DefaultKernelsForStateVector::getInstance();
-    REQUIRE_THROWS(instance.assignKernelForGate(
+    auto &instance = OperationKernelMap<Gates::GateOperation>::getInstance();
+    REQUIRE_THROWS(instance.assignKernelForOp(
         GateOperation::PauliX, Threading::SingleThread,
         CPUMemoryModel::Unaligned, 0, Util::full_domain<size_t>(),
         KernelType::None));
 }
 
-TEST_CASE("Test few limiting cases of default kernels",
-          "[Test_DefaultKernelsForStateVector]") {
-    auto &instance = DefaultKernelsForStateVector::getInstance();
+TEST_CASE("Test few limiting cases of default kernels", "[Test_KernelMap]") {
+    auto &instance = OperationKernelMap<Gates::GateOperation>::getInstance();
     SECTION("Single thread, large number of qubits") {
         // For large N, single thread calls "LM" for all single- and two-qubit
         // gates. For three-qubit gates, we use PI.
-        auto gate_map = instance.getGateKernelMap(24, Threading::SingleThread,
-                                                  CPUMemoryModel::Unaligned);
+        auto gate_map = instance.getKernelMap(24, Threading::SingleThread,
+                                              CPUMemoryModel::Unaligned);
         Util::for_each_enum<Gates::GateOperation>(
             [&gate_map](Gates::GateOperation gate_op) {
                 INFO(Util::lookup(Gates::Constant::gate_names, gate_op));
@@ -65,37 +77,36 @@ TEST_CASE("Test few limiting cases of default kernels",
     }
     SECTION("Single thread, N = 14") {
         // For large N = 14, IsingXX with "PI" is slightly faster
-        auto gate_map = instance.getGateKernelMap(14, Threading::SingleThread,
-                                                  CPUMemoryModel::Unaligned);
+        auto gate_map = instance.getKernelMap(14, Threading::SingleThread,
+                                              CPUMemoryModel::Unaligned);
         REQUIRE(gate_map[Gates::GateOperation::IsingXX] ==
                 Gates::KernelType::PI);
     }
 }
 
-TEST_CASE("Test priority works", "[Test_DefaultKernelsForStateVector]") {
+TEST_CASE("Test priority works", "[Test_KernelMap]") {
     using Gates::GateOperation;
-    using Gates::GeneratorOperation;
     using Gates::KernelType;
-    auto &instance = DefaultKernelsForStateVector::getInstance();
+    auto &instance = OperationKernelMap<Gates::GateOperation>::getInstance();
     SECTION("Test assignKernelForGate") {
-        auto original_kernel = instance.getGateKernelMap(
+        auto original_kernel = instance.getKernelMap(
             24, Threading::SingleThread,
             CPUMemoryModel::Unaligned)[GateOperation::PauliX];
 
-        instance.assignKernelForGate(
-            GateOperation::PauliX, Threading::SingleThread,
-            CPUMemoryModel::Unaligned, 100, Util::full_domain<size_t>(),
-            KernelType::PI);
+        instance.assignKernelForOp(GateOperation::PauliX,
+                                   Threading::SingleThread,
+                                   CPUMemoryModel::Unaligned, 100,
+                                   Util::full_domain<size_t>(), KernelType::PI);
 
-        REQUIRE(instance.getGateKernelMap(
+        REQUIRE(instance.getKernelMap(
                     24, Threading::SingleThread,
                     CPUMemoryModel::Unaligned)[GateOperation::PauliX] ==
                 KernelType::PI);
 
-        instance.removeKernelForGate(GateOperation::PauliX,
-                                     Threading::SingleThread,
-                                     CPUMemoryModel::Unaligned, 100);
-        REQUIRE(instance.getGateKernelMap(
+        instance.removeKernelForOp(GateOperation::PauliX,
+                                   Threading::SingleThread,
+                                   CPUMemoryModel::Unaligned, 100);
+        REQUIRE(instance.getKernelMap(
                     24, Threading::SingleThread,
                     CPUMemoryModel::Unaligned)[GateOperation::PauliX] ==
                 original_kernel);
diff --git a/pennylane_lightning/src/util/IntegerInterval.hpp b/pennylane_lightning/src/util/IntegerInterval.hpp
index 24f14959b9..06002ca3b0 100644
--- a/pennylane_lightning/src/util/IntegerInterval.hpp
+++ b/pennylane_lightning/src/util/IntegerInterval.hpp
@@ -70,7 +70,7 @@ auto in_between_closed(IntegerType from, IntegerType to)
     return IntegerInterval<IntegerType>{from, to + 1};
 }
 template <typename IntegerType>
-auto full_domain() -> IntegerInterval<IntegerType> {
+constexpr auto full_domain() -> IntegerInterval<IntegerType> {
     return IntegerInterval<IntegerType>{
         0, std::numeric_limits<IntegerType>::max()};
 }

From e51ffb59da1bcb717db6d266981bf40840a4f231 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Thu, 3 Mar 2022 17:56:12 -0500
Subject: [PATCH 33/94] Fix for mac

---
 pennylane_lightning/src/util/Memory.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
index e78923b64d..cb271da036 100644
--- a/pennylane_lightning/src/util/Memory.hpp
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -10,6 +10,7 @@
 // limitations under the License.
 #pragma once
 
+#include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>

From 0c991d4be92e86d7eee4c958b1e4ce706fb67ff2 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Fri, 4 Mar 2022 09:21:43 -0500
Subject: [PATCH 34/94] Simplified a bit

---
 .github/workflows/format.yml                                | 2 +-
 pennylane_lightning/src/algorithms/JacobianTape.hpp         | 6 +++---
 pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp | 2 --
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
index c7688703ca..b4a91f94a5 100644
--- a/.github/workflows/format.yml
+++ b/.github/workflows/format.yml
@@ -42,7 +42,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: 3.8
+          python-version: 3.7
 
       - name: Install dependencies
         run: sudo apt update && sudo apt -y install clang-tidy-12 cmake g++ libomp-12-dev
diff --git a/pennylane_lightning/src/algorithms/JacobianTape.hpp b/pennylane_lightning/src/algorithms/JacobianTape.hpp
index 8a33e89f02..ca7d0ac6f7 100644
--- a/pennylane_lightning/src/algorithms/JacobianTape.hpp
+++ b/pennylane_lightning/src/algorithms/JacobianTape.hpp
@@ -86,9 +86,9 @@ template <class T = double> class ObsDatum {
     }
 
   private:
-    const std::vector<std::string> obs_name_{};
-    const std::vector<param_var_t> obs_params_{};
-    const std::vector<std::vector<size_t>> obs_wires_{};
+    const std::vector<std::string> obs_name_;
+    const std::vector<param_var_t> obs_params_;
+    const std::vector<std::vector<size_t>> obs_wires_;
 };
 
 /**
diff --git a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
index ad21a48134..9b1e5d1630 100644
--- a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
@@ -37,7 +37,6 @@ class StateVectorManagedCPU
   private:
     using BaseType = StateVectorCPU<PrecisionT, StateVectorManagedCPU>;
 
-    // NOLINTNEXTLINE(modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
     std::vector<ComplexPrecisionT, AlignedAllocator<ComplexPrecisionT>> data_;
 
   public:
@@ -71,7 +70,6 @@ class StateVectorManagedCPU
 
     // Clang-tidy gives false positive for delegating constructor
     template <class Alloc>
-    // NOLINTNEXTLINE(hicpp-member-init)
     explicit StateVectorManagedCPU(
         const std::vector<std::complex<PrecisionT>, Alloc> &rhs,
         Threading threading = bestThreading(),

From 4b6f82d5bfb831661207a90fe68c7cec1845afdc Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sat, 5 Mar 2022 21:03:49 -0500
Subject: [PATCH 35/94] Add runtime/compiletime info

---
 bin/cpp-files                                 |   7 +-
 bin/utils.py                                  |  15 ++-
 cmake/process_options.cmake                   |  14 +++
 doc/_ext/edit_on_github.py                    |  27 ++---
 doc/conf.py                                   |  94 +++++++--------
 doc/directives.py                             |  28 ++---
 .../src/algorithms/AdjointDiff.hpp            |   9 +-
 pennylane_lightning/src/bindings/Bindings.cpp |   6 +
 pennylane_lightning/src/bindings/Bindings.hpp | 109 +++++++++++++++++-
 pennylane_lightning/src/gates/KernelType.hpp  |   8 --
 .../src/simulator/CPUMemoryModel.hpp          |  21 +++-
 .../src/simulator/DynamicDispatcher.hpp       |   2 +
 .../src/simulator/KernelMap.hpp               |  49 +++++++-
 .../src/simulator/StateVectorCPU.hpp          |  38 ++++++
 .../src/simulator/StateVectorManagedCPU.hpp   |  11 ++
 pennylane_lightning/src/tests/CMakeLists.txt  |   1 +
 .../src/tests/TestAvailableKernels.hpp        |  17 ---
 ...est_GateImplementations_CompareKernels.cpp |  19 +--
 .../src/tests/Test_RuntimeInfo.cpp            |  13 +++
 .../src/tests/Test_StateVectorCPU.cpp         |   1 +
 pennylane_lightning/src/util/BitUtil.hpp      |   3 +-
 pennylane_lightning/src/util/CMakeLists.txt   |   8 +-
 pennylane_lightning/src/util/Macros.hpp       |  77 ++++++++++++-
 pennylane_lightning/src/util/Memory.hpp       |  71 ++++++++++--
 pennylane_lightning/src/util/RuntimeInfo.cpp  |  68 +++++++++++
 pennylane_lightning/src/util/RuntimeInfo.hpp  |  52 +++++++++
 pennylane_lightning/src/util/TypeList.hpp     |  22 +++-
 27 files changed, 640 insertions(+), 150 deletions(-)
 create mode 100644 pennylane_lightning/src/tests/Test_RuntimeInfo.cpp
 create mode 100644 pennylane_lightning/src/util/RuntimeInfo.cpp
 create mode 100644 pennylane_lightning/src/util/RuntimeInfo.hpp

diff --git a/bin/cpp-files b/bin/cpp-files
index b09cc88cf1..7ccd202783 100755
--- a/bin/cpp-files
+++ b/bin/cpp-files
@@ -14,6 +14,9 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(
         description="Output C/C++ files in json list"
     )
+    parser.add_argument(
+        "--header-only", action='store_true', dest='header_only', help="whether only include header files"
+    )
     parser.add_argument(
         "paths", nargs="+", metavar="DIR", help="paths to the root source directories"
     )
@@ -23,9 +26,9 @@ if __name__ == '__main__':
 
     args = parser.parse_args()
 
-    files = set(get_cpp_files(args.paths))
+    files = set(get_cpp_files(args.paths, header_only = args.header_only))
     if args.exclude_dirs:
-        files_excludes = set(get_cpp_files(args.exclude_dirs))
+        files_excludes = set(get_cpp_files(args.exclude_dirs, header_only = args.header_only))
         files -= files_excludes
 
     json.dump(list(files), sys.stdout)
diff --git a/bin/utils.py b/bin/utils.py
index 90d1693031..6d9dab9420 100644
--- a/bin/utils.py
+++ b/bin/utils.py
@@ -2,13 +2,14 @@
 import re
 import fnmatch
 
-SRCFILE_EXT = ("c", "cc", "cpp", "cxx", "h", "hh", "hpp", "hxx", "cu", "cuh")
+SRCFILE_EXT = ["c", "cc", "cpp", "cxx", "cu"]
+HEADERFILE_EXT = ["h", "hh", "hpp", "hxx", "cuh"]
 
 LIGHTNING_SOURCE_DIR = Path(__file__).resolve().parent.parent
 
 rgx_gitignore_comment = re.compile("#.*$")
 
-def get_cpp_files_from_path(path, ignore_patterns = None, use_gitignore = True):
+def get_cpp_files_from_path(path, ignore_patterns = None, use_gitignore = True, header_only = False):
     """return set of C++ source files from a path
 
     Args:
@@ -18,7 +19,11 @@ def get_cpp_files_from_path(path, ignore_patterns = None, use_gitignore = True):
     """
     path = Path(path)
     files_rel = set() # file paths relative to path
-    for ext in SRCFILE_EXT:
+
+    exts = list(HEADERFILE_EXT)  # copy: the `+=` below must not mutate the module-level list
+    if not header_only:
+        exts += SRCFILE_EXT
+    for ext in exts:
         for file_path in path.rglob(f"*.{ext}"):
             files_rel.add(file_path.relative_to(path))
 
@@ -46,7 +51,7 @@ def get_cpp_files_from_path(path, ignore_patterns = None, use_gitignore = True):
 
     return set(str(path.joinpath(f)) for f in files_rel)
     
-def get_cpp_files(paths, ignore_patterns = None, use_gitignore = True):
+def get_cpp_files(paths, ignore_patterns = None, use_gitignore = True, header_only = False):
     """return list of C++ source files from paths.
 
     Args:
@@ -56,5 +61,5 @@ def get_cpp_files(paths, ignore_patterns = None, use_gitignore = True):
     """
     files = set()
     for path in paths:
-        files |= get_cpp_files_from_path(path, ignore_patterns, use_gitignore)
+        files |= get_cpp_files_from_path(path, ignore_patterns, use_gitignore, header_only)
     return list(files)
diff --git a/cmake/process_options.cmake b/cmake/process_options.cmake
index d3ecccd3f9..815a04e43d 100644
--- a/cmake/process_options.cmake
+++ b/cmake/process_options.cmake
@@ -62,6 +62,20 @@ else()
     message(STATUS "ENABLE_AVX is OFF")
 endif()
 
+if(ENABLE_AVX2)
+    message(STATUS "ENABLE_AVX2 is ON.")
+    target_compile_options(lightning_compile_options INTERFACE -mavx2)
+else()
+    message(STATUS "ENABLE_AVX2 is OFF")
+endif()
+
+if(ENABLE_AVX512)
+    message(STATUS "ENABLE_AVX512 is ON.")
+    target_compile_options(lightning_compile_options INTERFACE -mavx512f) # Now we only use avx512f
+else()
+    message(STATUS "ENABLE_AVX512 is OFF")
+endif()
+
 if(ENABLE_OPENMP)
     message(STATUS "ENABLE_OPENMP is ON.")
     find_package(OpenMP)
diff --git a/doc/_ext/edit_on_github.py b/doc/_ext/edit_on_github.py
index b69348d97d..954ed00ab4 100644
--- a/doc/_ext/edit_on_github.py
+++ b/doc/_ext/edit_on_github.py
@@ -8,19 +8,20 @@
 import warnings
 
 
-__licence__ = 'BSD (3 clause)'
+__licence__ = "BSD (3 clause)"
 
 
 def get_github_url(app, view, path):
-    return 'https://github.com/{project}/{view}/{branch}/{path}'.format(
+    return "https://github.com/{project}/{view}/{branch}/{path}".format(
         project=app.config.edit_on_github_project,
         view=view,
         branch=app.config.edit_on_github_branch,
-        path=path)
+        path=path,
+    )
 
 
 def html_page_context(app, pagename, templatename, context, doctree):
-    if templatename != 'page.html':
+    if templatename != "page.html":
         return
 
     if not app.config.edit_on_github_project:
@@ -29,16 +30,16 @@ def html_page_context(app, pagename, templatename, context, doctree):
 
     if not doctree:
         return
-    
-    path = os.path.relpath(doctree.get('source'), app.builder.srcdir)
-    show_url = get_github_url(app, 'blob', path)
-    edit_url = get_github_url(app, 'edit', path)
 
-    context['show_on_github_url'] = show_url
-    context['edit_on_github_url'] = edit_url
+    path = os.path.relpath(doctree.get("source"), app.builder.srcdir)
+    show_url = get_github_url(app, "blob", path)
+    edit_url = get_github_url(app, "edit", path)
+
+    context["show_on_github_url"] = show_url
+    context["edit_on_github_url"] = edit_url
 
 
 def setup(app):
-    app.add_config_value('edit_on_github_project', '', True)
-    app.add_config_value('edit_on_github_branch', 'master', True)
-    app.connect('html-page-context', html_page_context)
\ No newline at end of file
+    app.add_config_value("edit_on_github_project", "", True)
+    app.add_config_value("edit_on_github_branch", "master", True)
+    app.connect("html-page-context", html_page_context)
diff --git a/doc/conf.py b/doc/conf.py
index 770f5434dc..37a6be4452 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -20,46 +20,50 @@
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-sys.path.insert(0, os.path.abspath(''))
-sys.path.insert(0, os.path.abspath('_ext'))
-sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath('doc')), 'doc'))
+sys.path.insert(0, os.path.abspath(""))
+sys.path.insert(0, os.path.abspath("_ext"))
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath("doc")), "doc"))
 
 
 # For obtaining all relevant C++ source files
-currdir = Path(__file__).resolve().parent # PROJECT_SOURCE_DIR/docs
+currdir = Path(__file__).resolve().parent  # PROJECT_SOURCE_DIR/docs
 PROJECT_SOURCE_DIR = currdir.parent
-CPP_SOURCE_DIR = PROJECT_SOURCE_DIR.joinpath('pennylane_lightning/src')
-CPP_EXCLUDE_DIRS = ['examples', 'tests'] # relative to CPP_SOURCE_DIR
+CPP_SOURCE_DIR = PROJECT_SOURCE_DIR.joinpath("pennylane_lightning/src")
+CPP_EXCLUDE_DIRS = ["examples", "tests"]  # relative to CPP_SOURCE_DIR
+
 
 def obtain_cpp_files():
-    script_path = PROJECT_SOURCE_DIR.joinpath('bin/cpp-files')
+    script_path = PROJECT_SOURCE_DIR.joinpath("bin/cpp-files")
 
     if not script_path.exists():
-        print('The project directory structure is corrupted.')
+        print("The project directory structure is corrupted.")
         sys.exit(1)
 
     exclude_dirs = [CPP_SOURCE_DIR.joinpath(exclude_dir) for exclude_dir in CPP_EXCLUDE_DIRS]
 
-    p = subprocess.run([str(script_path), CPP_SOURCE_DIR, '--exclude-dirs', *exclude_dirs], capture_output = True)
+    p = subprocess.run(
+        [str(script_path), "--header-only", CPP_SOURCE_DIR, "--exclude-dirs", *exclude_dirs],
+        capture_output=True,
+    )
     file_list = json.loads(p.stdout)
 
-    file_list = ['../' + str(Path(f).relative_to(PROJECT_SOURCE_DIR)) for f in file_list]
+    file_list = ["../" + str(Path(f).relative_to(PROJECT_SOURCE_DIR)) for f in file_list]
     return file_list
 
+
 CPP_FILES = obtain_cpp_files()
 print(CPP_FILES)
 
 
-
 class Mock(MagicMock):
-    __name__ = 'foo'
+    __name__ = "foo"
 
     @classmethod
     def __getattr__(cls, name):
         return MagicMock()
 
 
-MOCK_MODULES = ['pennylane_lightning.lightning_qubit_ops']
+MOCK_MODULES = ["pennylane_lightning.lightning_qubit_ops"]
 
 mock = Mock()
 for mod_name in MOCK_MODULES:
@@ -68,7 +72,7 @@ def __getattr__(cls, name):
 # -- General configuration ------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
-needs_sphinx = '1.6'
+needs_sphinx = "1.6"
 
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
@@ -85,9 +89,9 @@ def __getattr__(cls, name):
     "sphinx.ext.mathjax",
     "sphinx.ext.napoleon",
     "sphinx.ext.todo",
-    'sphinx.ext.viewcode',
+    "sphinx.ext.viewcode",
     "sphinx_automodapi.automodapi",
-    'sphinx_automodapi.smart_resolver'
+    "sphinx_automodapi.smart_resolver",
 ]
 
 intersphinx_mapping = {"https://pennylane.readthedocs.io/en/stable/": None}
@@ -114,10 +118,7 @@ def __getattr__(cls, name):
     # TIP: if using the sphinx-bootstrap-theme, you need
     # "treeViewIsBootstrap": True,
     "exhaleExecutesDoxygen": True,
-    "exhaleDoxygenStdin": (
-        "INPUT = " + ' '.join(CPP_FILES) + ' '
-        "EXCLUDE_SYMBOLS = std::* "
-    ),
+    "exhaleDoxygenStdin": ("INPUT = " + " ".join(CPP_FILES) + " " "EXCLUDE_SYMBOLS = std::* "),
     "afterTitleDescription": inspect.cleandoc(
         """
         The Pennylane Lightning C++ API is intended to be called from Python through Pybind11. Direct use of the C++ API is currently unsupported and is provided for reference only.
@@ -126,21 +127,21 @@ def __getattr__(cls, name):
 }
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates', 'xanadu_theme']
+templates_path = ["_templates", "xanadu_theme"]
 
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
 #
 # source_suffix = ['.rst', '.md']
-source_suffix = '.rst'
+source_suffix = ".rst"
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # General information about the project.
-project = 'PennyLane-Lightning'
+project = "PennyLane-Lightning"
 copyright = "Copyright 2021"
-author = 'Xanadu Inc.'
+author = "Xanadu Inc."
 
 add_module_names = False
 
@@ -149,11 +150,12 @@ def __getattr__(cls, name):
 # built documents.
 
 import pennylane_lightning
+
 # The full version, including alpha/beta/rc tags.
 release = pennylane_lightning.__version__
 
 # The short X.Y version.
-version = re.match(r'^(\d+\.\d+)', release).expand(r'\1')
+version = re.match(r"^(\d+\.\d+)", release).expand(r"\1")
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -163,19 +165,19 @@ def __getattr__(cls, name):
 language = None
 
 # today_fmt is used as the format for a strftime call.
-today_fmt = '%Y-%m-%d'
+today_fmt = "%Y-%m-%d"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This patterns also effect to html_static_path and html_extra_path
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
 # If true, sectionauthor and moduleauthor directives will be shown in the
 # output. They are ignored by default.
 show_authors = True
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = True
@@ -186,12 +188,12 @@ def __getattr__(cls, name):
 # The name of an image file (relative to this directory) to use as a favicon of
 # the docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
 # pixels large.
-html_favicon = '_static/favicon.ico'
+html_favicon = "_static/favicon.ico"
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # Custom sidebar templates, must be a dictionary that maps document names
 # to template names.
@@ -199,26 +201,24 @@ def __getattr__(cls, name):
 # This is required for the alabaster theme
 # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
 html_sidebars = {
-    '**' : [
-        'logo-text.html',
-        'searchbox.html',
-        'globaltoc.html',
+    "**": [
+        "logo-text.html",
+        "searchbox.html",
+        "globaltoc.html",
     ]
 }
 
 
 # -- Xanadu theme ---------------------------------------------------------
-html_theme = 'xanadu_theme'
-html_theme_path = ['.']
+html_theme = "xanadu_theme"
+html_theme_path = ["."]
 
 # xanadu theme options (see theme.conf for more information)
 html_theme_options = {
     # Set the name of the project to appear in the left sidebar.
     "project_nav_name": "PennyLane-Lightning",
-
     # Path to a touch icon
     "touch_icon": "logo_new.png",
-
     "large_toc": True,
     "navigation_button": "#19b37b",
     "navigation_button_hover": "#0e714d",
@@ -229,22 +229,22 @@ def __getattr__(cls, name):
     "download_button": "#19b37b",
 }
 
-edit_on_github_project = 'XanaduAI/pennylane-lightning'
-edit_on_github_branch = 'master/doc'
+edit_on_github_project = "XanaduAI/pennylane-lightning"
+edit_on_github_branch = "master/doc"
 
-#============================================================
+# ============================================================
 
 # the order in which autodoc lists the documented members
-autodoc_member_order = 'bysource'
+autodoc_member_order = "bysource"
 
 # inheritance_diagram graphviz attributes
-inheritance_node_attrs = dict(color='lightskyblue1', style='filled')
+inheritance_node_attrs = dict(color="lightskyblue1", style="filled")
 
-#autodoc_default_flags = ['members']
+# autodoc_default_flags = ['members']
 autosummary_generate = True
 
 from directives import CustomDeviceGalleryItemDirective
 
-def setup(app):
-    app.add_directive('devicegalleryitem', CustomDeviceGalleryItemDirective)
 
+def setup(app):
+    app.add_directive("devicegalleryitem", CustomDeviceGalleryItemDirective)
diff --git a/doc/directives.py b/doc/directives.py
index 953c5d38ba..3dfe1cc5d7 100644
--- a/doc/directives.py
+++ b/doc/directives.py
@@ -49,25 +49,27 @@ class CustomDeviceGalleryItemDirective(Directive):
     required_arguments = 0
     optional_arguments = 4
     final_argument_whitespace = True
-    option_spec = {'name': directives.unchanged,
-                   'description': directives.unchanged,
-                   'link': directives.unchanged}
+    option_spec = {
+        "name": directives.unchanged,
+        "description": directives.unchanged,
+        "link": directives.unchanged,
+    }
 
     has_content = False
     add_index = False
 
     def run(self):
         try:
-            if 'name' in self.options:
-                name = self.options['name']
+            if "name" in self.options:
+                name = self.options["name"]
 
-            if 'description' in self.options:
-                description = self.options['description']
+            if "description" in self.options:
+                description = self.options["description"]
             else:
-                raise ValueError('description not found')
+                raise ValueError("description not found")
 
-            if 'link' in self.options:
-                link = self.options['link']
+            if "link" in self.options:
+                link = self.options["link"]
             else:
                 link = "code/qml_templates"
 
@@ -79,10 +81,8 @@ def run(self):
             raise
             return []
 
-        thumbnail_rst = GALLERY_TEMPLATE.format(name=name,
-                                                description=description,
-                                                link=link)
-        thumbnail = StringList(thumbnail_rst.split('\n'))
+        thumbnail_rst = GALLERY_TEMPLATE.format(name=name, description=description, link=link)
+        thumbnail = StringList(thumbnail_rst.split("\n"))
         thumb = nodes.paragraph()
         self.state.nested_parse(thumbnail, self.content_offset, thumb)
         return [thumb]
diff --git a/pennylane_lightning/src/algorithms/AdjointDiff.hpp b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
index fc02f3c50e..d58dad3e7b 100644
--- a/pennylane_lightning/src/algorithms/AdjointDiff.hpp
+++ b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
@@ -71,8 +71,7 @@ template <class T = double> class AdjointJacobian {
                                size_t param_index) {
         jac[obs_index][param_index] =
             -2 * scaling_coeff *
-            std::imag(
-                innerProdC(sv1.getData(), sv2.getData(), sv1.getLength()));
+            std::imag(innerProdC(sv1.getDataVector(), sv2.getDataVector()));
     }
 
     /**
@@ -397,9 +396,9 @@ template <class T = double> class AdjointJacobian {
                              obs_idx++) {
                             jac[mat_row_idx + obs_idx] =
                                 -2 * scalingFactor *
-                                std::imag(
-                                    innerProdC(H_lambda[obs_idx].getData(),
-                                               mu.getData(), mu.getLength()));
+                                std::imag(innerProdC(
+                                    H_lambda[obs_idx].getDataVector(),
+                                    mu.getDataVector()));
                         }
                         trainableParamNumber--;
                         ++tp_it;
diff --git a/pennylane_lightning/src/bindings/Bindings.cpp b/pennylane_lightning/src/bindings/Bindings.cpp
index b935734087..0acaa45d83 100644
--- a/pennylane_lightning/src/bindings/Bindings.cpp
+++ b/pennylane_lightning/src/bindings/Bindings.cpp
@@ -378,6 +378,12 @@ PYBIND11_MODULE(lightning_qubit_ops, // NOLINT: No control over Pybind internals
     m.def("best_alignment", &bestCPUMemoryModel,
           "Best memory alignment. for the simulator.");
 
+    /* Add compile info */
+    m.def("compile_info", &getCompileInfo, "Compiled binary information.");
+
+    /* Add runtime info */
+    m.def("runtime_info", &getRuntimeInfo, "Runtime information.");
+
     lightning_class_bindings<float, float>(m);
     lightning_class_bindings<double, double>(m);
 }
diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index c0e20f5552..fe1bdfa4b3 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -20,9 +20,11 @@
 #include "AdjointDiff.hpp"
 #include "CPUMemoryModel.hpp"
 #include "JacobianProd.hpp"
+#include "Macros.hpp"
 #include "Measures.hpp"
 #include "Memory.hpp"
 #include "OpToMemberFuncPtr.hpp"
+#include "RuntimeInfo.hpp"
 #include "StateVectorManagedCPU.hpp"
 
 #include "pybind11/complex.h"
@@ -40,11 +42,12 @@
 
 namespace Pennylane {
 /**
- * @brief Create a `%StateVector` object from a 1D numpy complex data array.
+ * @brief Create a @ref Pennylane::StateVectorRawCPU object from a 1D numpy
+ * complex data array.
  *
  * @tparam PrecisionT Precision data type
  * @param numpyArray Numpy data array.
- * @return StateVector<PrecisionT> `%StateVector` object.
+ * @return StateVectorRawCPU object.
  */
 template <class PrecisionT = double>
 auto createRaw(const pybind11::array_t<std::complex<PrecisionT>> &numpyArray)
@@ -65,6 +68,14 @@ auto createRaw(const pybind11::array_t<std::complex<PrecisionT>> &numpyArray)
         {data_ptr, static_cast<size_t>(numpyArrayInfo.shape[0])});
 }
 
+/**
+ * @brief Create a StateVectorManagedCPU object from a 1D numpy array
+ * by copying the internal data.
+ *
+ * @tparam PrecisionT Floating point precision type
+ * @param numpyArray Numpy array data-type
+ * @return StateVectorManagedCPU object.
+ */
 template <class PrecisionT = double>
 auto createManaged(
     const pybind11::array_t<std::complex<PrecisionT>> &numpyArray)
@@ -85,6 +96,14 @@ auto createManaged(
         {data_ptr, static_cast<size_t>(numpyArrayInfo.shape[0])});
 }
 
+/**
+ * @brief Create numpy array view for the underlying data of
+ * `%StateVectorManagedCPU` object.
+ *
+ * @tparam PrecisionT Floating point data type
+ * @param sv `%StateVectorManagedCPU` object
+ * @return A numpy array
+ */
 template <class PrecisionT = double>
 auto toNumpyArray(const StateVectorManagedCPU<PrecisionT> &sv)
     -> pybind11::array_t<std::complex<PrecisionT>> {
@@ -92,11 +111,26 @@ auto toNumpyArray(const StateVectorManagedCPU<PrecisionT> &sv)
         {sv.getLength()}, {2 * sizeof(PrecisionT)}, sv.getData());
 }
 
+/**
+ * @brief Get memory alignment of a given numpy array.
+ *
+ * @param numpyArray Pybind11's numpy array type.
+ * @return Memory model describing alignment
+ */
 auto getNumpyArrayAlignment(const pybind11::array &numpyArray)
     -> CPUMemoryModel {
     return getMemoryModel(numpyArray.request().ptr);
 }
 
+/**
+ * @brief Create an aligned numpy array for a given type, memory model and array
+ * size.
+ *
+ * @tparam T Datatype of numpy array to create
+ * @param memory_model Memory model to use
+ * @param size Size of the array to create
+ * @return Numpy array
+ */
 template <typename T>
 auto alignedNumpyArray(CPUMemoryModel memory_model, size_t size)
     -> pybind11::array {
@@ -114,11 +148,14 @@ auto alignedNumpyArray(CPUMemoryModel memory_model, size_t size)
 }
 
 /**
- * @brief We return an numpy array whose underlying data is allocated by
+ * @brief Create a numpy array whose underlying data is allocated by
  * lightning.
  *
  * See https://github.com/pybind/pybind11/issues/1042#issuecomment-325941022
  * for capsule usage.
+ *
+ * @param size Size of the array to create
+ * @param dt Pybind11's datatype object
  */
 auto allocateAlignedArray(size_t size, pybind11::dtype dt) -> pybind11::array {
     auto memory_model = bestCPUMemoryModel();
@@ -138,7 +175,7 @@ auto allocateAlignedArray(size_t size, pybind11::dtype dt) -> pybind11::array {
 
 /**
  * @brief Apply given list of operations to Numpy data array using C++
- * `%StateVector` class.
+ * StateVectorRawCPU class.
  *
  * @tparam PrecisionT Precision data type
  * @param stateNumpyArray Complex numpy data array representing statevector.
@@ -157,7 +194,16 @@ void apply(pybind11::array_t<std::complex<PrecisionT>> &stateNumpyArray,
     state.applyOperations(ops, wires, inverse, params);
 }
 
-/// @cond DEV
+/**
+ * @brief Register StateVector class to pybind.
+ *
+ * @tparam PrecisionT Floating point type for statevector
+ * @tparam ParamT Parameter type of gate operations for statevector
+ * @tparam SVType Statevector type to register
+ * @tparam PyClass Pybind11's class object type
+ *
+ * @param pyclass Pybind11's class object to bind statevector
+ */
 template <class PrecisionT, class ParamT, class SVType, class PyClass>
 void registerGatesForStateVector(PyClass &pyclass) {
     using Gates::GateOperation;
@@ -192,4 +238,57 @@ void registerGatesForStateVector(PyClass &pyclass) {
         pyclass.def(gate_name.c_str(), func, doc.c_str());
     });
 }
+
+/**
+ * @brief Return basic information of the compiled binary.
+ */
+auto getCompileInfo() -> pybind11::dict {
+    using namespace Util::Constant;
+    using namespace pybind11::literals;
+
+    const std::string_view cpu_arch_str = [] {
+        switch (cpu_arch) {
+        case CPUArch::AMD64:
+            return "AMD64";
+        case CPUArch::PPC64:
+            return "PPC64";
+        case CPUArch::ARM:
+            return "ARM";
+        default:
+            return "Unknown";
+        }
+    }();
+
+    const std::string_view compiler_name_str = [] {
+        switch (compiler) {
+        case Compiler::GCC:
+            return "GCC";
+        case Compiler::Clang:
+            return "Clang";
+        case Compiler::MSVC:
+            return "MSVC";
+        case Compiler::Unknown:
+            return "Unknown";
+        }
+    }();
+
+    const auto compiler_version_str = getCompilerVersion<compiler>();
+
+    return pybind11::dict("cpu.arch"_a = cpu_arch_str,
+                          "compiler.name"_a = compiler_name_str,
+                          "compiler.version"_a = compiler_version_str,
+                          "AVX2"_a = use_avx2, "AVX512F"_a = use_avx512f);
+}
+
+/**
+ * @brief Return basic information of runtime environment
+ */
+auto getRuntimeInfo() -> pybind11::dict {
+    using namespace Util::Constant;
+    using namespace pybind11::literals;
+
+    return pybind11::dict("AVX"_a = RuntimeInfo::AVX(),
+                          "AVX2"_a = RuntimeInfo::AVX2(),
+                          "AVX512F"_a = RuntimeInfo::AVX512F());
+}
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/gates/KernelType.hpp b/pennylane_lightning/src/gates/KernelType.hpp
index f517cc1f61..d65d65235e 100644
--- a/pennylane_lightning/src/gates/KernelType.hpp
+++ b/pennylane_lightning/src/gates/KernelType.hpp
@@ -27,11 +27,3 @@ namespace Pennylane::Gates {
  */
 enum class KernelType { PI, LM, None };
 } // namespace Pennylane::Gates
-
-namespace Pennylane {
-/**
- * @brief List of kernels binds to Python.
- */
-[[maybe_unused]] constexpr std::array kernels_to_pyexport = {
-    Gates::KernelType::PI, Gates::KernelType::LM};
-} // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/CPUMemoryModel.hpp b/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
index b6228401a0..282a80390f 100644
--- a/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
+++ b/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
@@ -1,4 +1,3 @@
-
 // Copyright 2022 Xanadu Quantum Technologies Inc.
 
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,6 +18,7 @@
 #pragma once
 #include "Macros.hpp"
 #include "Memory.hpp"
+#include "RuntimeInfo.hpp"
 
 #include <cstdint>
 #include <memory>
@@ -44,11 +44,22 @@ inline auto getMemoryModel(const void *ptr) -> CPUMemoryModel {
     return CPUMemoryModel::Unaligned;
 }
 
-constexpr inline auto bestCPUMemoryModel() -> CPUMemoryModel {
+/**
+ * @brief Choose the best memory model to use using runtime/compile-time
+ * information.
+ */
+inline auto bestCPUMemoryModel() -> CPUMemoryModel {
     if constexpr (use_avx512f) {
-        return CPUMemoryModel::Aligned512;
-    } else if (use_avx2) {
-        return CPUMemoryModel::Aligned256;
+        // If the binary is compiled with AVX512F support
+        if (Util::RuntimeInfo::AVX512F()) {
+            // and the CPU support it as well
+            return CPUMemoryModel::Aligned512;
+        }
+    }
+    if constexpr (use_avx2) {
+        if (Util::RuntimeInfo::AVX2()) {
+            return CPUMemoryModel::Aligned256;
+        }
     }
     return CPUMemoryModel::Unaligned;
 }
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index d4cd500bd7..d232c32ce6 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -55,6 +55,7 @@ namespace Pennylane {
  */
 template <class PrecisionT, class ParamT> struct RegisterBeforeMain;
 
+/// @cond DEV
 template <> struct RegisterBeforeMain<float, float> {
     static inline const int dummy =
         Internal::registerAllAvailableKernels<float, float>();
@@ -64,6 +65,7 @@ template <> struct RegisterBeforeMain<double, double> {
     static inline const int dummy =
         Internal::registerAllAvailableKernels<double, double>();
 };
+/// @endcond
 
 /**
  * @brief DynamicDispatcher class
diff --git a/pennylane_lightning/src/simulator/KernelMap.hpp b/pennylane_lightning/src/simulator/KernelMap.hpp
index 51532b9c8c..f6c3d6632a 100644
--- a/pennylane_lightning/src/simulator/KernelMap.hpp
+++ b/pennylane_lightning/src/simulator/KernelMap.hpp
@@ -124,7 +124,6 @@ class PriorityDispatchSet {
         ordered_vec_.erase(begin, end);
     }
 };
-
 ///@endcond
 
 struct AllThreading {};
@@ -167,12 +166,32 @@ template <class Operation, size_t cache_size = 16> class OperationKernelMap {
           } {}
 
   public:
+    /**
+     * @brief Get a singleton instance.
+     *
+     * @return A singleton instance.
+     */
     static auto getInstance() -> OperationKernelMap & {
         static OperationKernelMap instance;
 
         return instance;
     }
 
+    /**
+     * @brief Assign a kernel for a given operation, threading, and memory
+     * model.
+     *
+     * Variable `%priority` sets the priority of the given kernel when multiple
+     * choices are available. The given `%interval` must be disjoint
+     * from all existing intervals with the given priority.
+     *
+     * @param op Operation to use as a dispatch key
+     * @param threading Threading option to use as a dispatch key
+     * @param memory_model Memory model to use as a dispatch key
+     * @param priority Priority of this assignment
+     * @param interval Range of the number of qubits to use this kernel
+     * @param kernel Kernel to assign
+     */
     void assignKernelForOp(Operation op, Threading threading,
                            CPUMemoryModel memory_model, uint32_t priority,
                            const Util::IntegerInterval<size_t> &interval,
@@ -197,6 +216,10 @@ template <class Operation, size_t cache_size = 16> class OperationKernelMap {
         set.emplace(priority, interval, kernel);
     }
 
+    /**
+     * @brief Assign kernel for given operation and memory model for all
+     * threading options. The priority of this assignment is 1.
+     */
     void assignKernelForOp(Operation op, [[maybe_unused]] AllThreading dummy,
                            CPUMemoryModel memory_model,
                            const Util::IntegerInterval<size_t> &interval,
@@ -207,6 +230,10 @@ template <class Operation, size_t cache_size = 16> class OperationKernelMap {
         });
     }
 
+    /**
+     * @brief Assign kernel for given operation and threading option for all
+     * memory models. The priority of this assignment is 2.
+     */
     void assignKernelForOp(Operation op, Threading threading,
                            [[maybe_unused]] AllMemoryModel dummy,
                            const Util::IntegerInterval<size_t> &interval,
@@ -217,6 +244,10 @@ template <class Operation, size_t cache_size = 16> class OperationKernelMap {
         });
     }
 
+    /**
+     * @brief Assign kernel for a given operation for all memory model and all
+     * threading options. The priority of this assignment is 0.
+     */
     void assignKernelForOp(Operation op, [[maybe_unused]] AllThreading dummy1,
                            [[maybe_unused]] AllMemoryModel dummy2,
                            const Util::IntegerInterval<size_t> &interval,
@@ -229,6 +260,15 @@ template <class Operation, size_t cache_size = 16> class OperationKernelMap {
             });
     }
 
+    /**
+     * @brief Remove an assigned kernel for the given operation, threading,
+     * and memory model.
+     *
+     * @param op Operation
+     * @param threading Threading option
+     * @param memory_model Memory model
+     * @param priority Priority to remove
+     */
     void removeKernelForOp(Operation op, Threading threading,
                            CPUMemoryModel memory_model, uint32_t priority) {
         uint32_t dispatch_key = toDispatchKey(threading, memory_model);
@@ -250,11 +290,12 @@ template <class Operation, size_t cache_size = 16> class OperationKernelMap {
      * @param num_qubits Number of qubits
      * @param threading Threading context
      * @param memory_model Memory model of the underlying data
+     * @return A kernel map for given keys
      */
     [[nodiscard]] auto getKernelMap(size_t num_qubits, Threading threading,
                                     CPUMemoryModel memory_model) const
         -> EnumKernelMap {
-        // Add mutex for cache_ when we goto multithread.
+        // TODO: Add mutex for cache_ when we goto multithread.
         const uint32_t dispatch_key = toDispatchKey(threading, memory_model);
 
         const auto cache_iter =
@@ -271,9 +312,9 @@ template <class Operation, size_t cache_size = 16> class OperationKernelMap {
                 kernel_for_op.emplace(op, set.getKernel(num_qubits));
             });
             if (cache_.size() == cache_size) {
-                cache_.pop_front();
+                cache_.pop_back();
             }
-            cache_.emplace_back(num_qubits, dispatch_key, kernel_for_op);
+            cache_.emplace_front(num_qubits, dispatch_key, kernel_for_op);
             return kernel_for_op;
         }
         return std::get<2>(*cache_iter);
diff --git a/pennylane_lightning/src/simulator/StateVectorCPU.hpp b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
index ab5d55a800..b113ece944 100644
--- a/pennylane_lightning/src/simulator/StateVectorCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
@@ -46,6 +46,14 @@ class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
     std::unordered_map<Gates::MatrixOperation, Gates::KernelType>
         kernel_for_matrices_;
 
+    /**
+     * @brief Internal function that sets kernels for all operations depending
+     * on the provided dispatch options.
+     *
+     * @param num_qubits Number of qubits of the statevector
+     * @param threading Threading option
+     * @param memory_model Memory model
+     */
     void setKernels(size_t num_qubits, Threading threading,
                     CPUMemoryModel memory_model) {
         using KernelMap::OperationKernelMap;
@@ -90,5 +98,35 @@ class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
         return memory_model_;
     }
     [[nodiscard]] inline Threading threading() const { return threading_; }
+
+    [[nodiscard]] inline auto getGateKernelMap() const & -> const
+        std::unordered_map<Gates::GateOperation, Gates::KernelType> & {
+        return kernel_for_gates_;
+    }
+
+    [[nodiscard]] inline auto getGateKernelMap()
+        && -> std::unordered_map<Gates::GateOperation, Gates::KernelType> {
+        return kernel_for_gates_;
+    }
+
+    [[nodiscard]] inline auto getGeneratorKernelMap() const & -> const
+        std::unordered_map<Gates::GeneratorOperation, Gates::KernelType> & {
+        return kernel_for_generators_;
+    }
+
+    [[nodiscard]] inline auto getGeneratorKernelMap()
+        && -> std::unordered_map<Gates::GeneratorOperation, Gates::KernelType> {
+        return kernel_for_generators_;
+    }
+
+    [[nodiscard]] inline auto getMatrixKernelMap() const & -> const
+        std::unordered_map<Gates::MatrixOperation, Gates::KernelType> & {
+        return kernel_for_matrices_;
+    }
+
+    [[nodiscard]] inline auto getMatrixKernelMap()
+        && -> std::unordered_map<Gates::MatrixOperation, Gates::KernelType> {
+        return kernel_for_matrices_;
+    }
 };
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
index 9b1e5d1630..be3edd53fc 100644
--- a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
@@ -92,6 +92,17 @@ class StateVectorManagedCPU
         return data_.data();
     }
 
+    [[nodiscard]] auto getDataVector()
+        -> std::vector<ComplexPrecisionT, AlignedAllocator<ComplexPrecisionT>>
+            & {
+        return data_;
+    }
+
+    [[nodiscard]] auto getDataVector() const -> const
+        std::vector<ComplexPrecisionT, AlignedAllocator<ComplexPrecisionT>> & {
+        return data_;
+    }
+
     /**
      * @brief Update data of the class to new_data
      *
diff --git a/pennylane_lightning/src/tests/CMakeLists.txt b/pennylane_lightning/src/tests/CMakeLists.txt
index b5cceb02be..35d88a5872 100644
--- a/pennylane_lightning/src/tests/CMakeLists.txt
+++ b/pennylane_lightning/src/tests/CMakeLists.txt
@@ -82,6 +82,7 @@ set(TEST_SOURCES CreateAllWires.cpp
                  Test_KernelMap.cpp
                  Test_Measures.cpp
                  Test_OpToMemberFuncPtr.cpp
+                 Test_RuntimeInfo.cpp
                  Test_StateVectorCPU.cpp
                  Test_Util.cpp
                  Test_VectorJacobianProduct.cpp)
diff --git a/pennylane_lightning/src/tests/TestAvailableKernels.hpp b/pennylane_lightning/src/tests/TestAvailableKernels.hpp
index 669d98ddc8..1139abb961 100644
--- a/pennylane_lightning/src/tests/TestAvailableKernels.hpp
+++ b/pennylane_lightning/src/tests/TestAvailableKernels.hpp
@@ -40,23 +40,6 @@ check_kernels_are_available(const std::array<KernelType, size> &arr) -> bool {
     return true;
 }
 
-/*******************************************************************************
- * Check all kernels in kernels_to_pyexport are available
- ******************************************************************************/
-
-constexpr auto check_kernels_to_pyexport() -> bool {
-    // TODO: change to constexpr std::any_of in C++20
-    // NOLINTNEXTLINE (readability-use-anyofallof)
-    for (const auto &kernel : kernels_to_pyexport) {
-        if (!is_available_kernel(kernel)) {
-            return false;
-        }
-    }
-    return true;
-}
-static_assert(check_kernels_to_pyexport(),
-              "Some of Kernels in Python export is not available.");
-
 /*******************************************************************************
  * Check each element in kernelIdNamesPairs is unique
  ******************************************************************************/
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
index 04ff09cc17..c60b31bf48 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
@@ -25,6 +25,10 @@ using namespace Pennylane;
 using namespace Pennylane::Gates;
 using namespace Pennylane::Util;
 
+namespace {
+using namespace Pennylane::Gates::Constant;
+} // namespace
+
 using std::vector;
 
 template <typename TypeList> std::string kernelsToString() {
@@ -32,7 +36,7 @@ template <typename TypeList> std::string kernelsToString() {
         return std::string(TypeList::Type::name) + ", " +
                kernelsToString<typename TypeList::Next>();
     }
-    return std::string("");
+    return "";
 }
 
 /* Type transformation */
@@ -105,8 +109,8 @@ void testApplyGate(RandomEngine &re, size_t num_qubits) {
 
     using Kernels = typename KernelsImplementingGate<gate_op>::Type;
 
-    INFO("Kernels implementing " << lookup(Constant::gate_names, gate_op)
-                                 << " are " << kernelsToString<Kernels>());
+    INFO("Kernels implementing " << lookup(gate_names, gate_op) << " are "
+                                 << kernelsToString<Kernels>());
 
     INFO("PrecisionT, ParamT = " << PrecisionToName<PrecisionT>::value << ", "
                                  << PrecisionToName<ParamT>::value);
@@ -114,7 +118,7 @@ void testApplyGate(RandomEngine &re, size_t num_qubits) {
     const auto all_wires = crateAllWires(num_qubits, gate_op, true);
     for (const auto &wires : all_wires) {
         const auto params = createParams<ParamT>(gate_op);
-        const auto gate_name = lookup(Constant::gate_names, gate_op);
+        const auto gate_name = lookup(gate_names, gate_op);
         DYNAMIC_SECTION(
             "Test gate "
             << gate_name
@@ -156,10 +160,9 @@ void testAllGatesIter(RandomEngine &re, size_t max_num_qubits) {
     if constexpr (gate_idx < static_cast<size_t>(GateOperation::END)) {
         constexpr static auto gate_op = static_cast<GateOperation>(gate_idx);
 
-        size_t min_num_qubits =
-            array_has_elt(Constant::multi_qubit_gates, gate_op)
-                ? 1
-                : lookup(Constant::gate_wires, gate_op);
+        size_t min_num_qubits = array_has_elt(multi_qubit_gates, gate_op)
+                                    ? 1
+                                    : lookup(gate_wires, gate_op);
         for (size_t num_qubits = min_num_qubits; num_qubits < max_num_qubits;
              num_qubits++) {
             testApplyGate<gate_op, PrecisionT, ParamT>(re, num_qubits);
diff --git a/pennylane_lightning/src/tests/Test_RuntimeInfo.cpp b/pennylane_lightning/src/tests/Test_RuntimeInfo.cpp
new file mode 100644
index 0000000000..93823e386b
--- /dev/null
+++ b/pennylane_lightning/src/tests/Test_RuntimeInfo.cpp
@@ -0,0 +1,13 @@
+#include "Macros.hpp"
+#include "RuntimeInfo.hpp"
+
+#include <catch2/catch.hpp>
+
+using namespace Pennylane::Util;
+
+TEST_CASE("Runtime information is correct", "[Test_RuntimeInfo]") {
+    INFO("RuntimeInfo::AVX " << RuntimeInfo::AVX());
+    INFO("RuntimeInfo::AVX2 " << RuntimeInfo::AVX2());
+    INFO("RuntimeInfo::AVX512F " << RuntimeInfo::AVX512F());
+    REQUIRE(true);
+}
diff --git a/pennylane_lightning/src/tests/Test_StateVectorCPU.cpp b/pennylane_lightning/src/tests/Test_StateVectorCPU.cpp
index 5b1e263de2..a812c212ea 100644
--- a/pennylane_lightning/src/tests/Test_StateVectorCPU.cpp
+++ b/pennylane_lightning/src/tests/Test_StateVectorCPU.cpp
@@ -35,6 +35,7 @@ TEMPLATE_TEST_CASE("StateVectorManagedCPU::StateVectorManagedCPU",
 
         REQUIRE(sv.getNumQubits() == 4);
         REQUIRE(sv.getLength() == 16);
+        REQUIRE(sv.getDataVector().size() == 16);
     }
     SECTION("StateVectorManagedCPU<TestType> {const "
             "StateVectorRawCPU<TestType>&}") {
diff --git a/pennylane_lightning/src/util/BitUtil.hpp b/pennylane_lightning/src/util/BitUtil.hpp
index d6996a77d8..83ffe49995 100644
--- a/pennylane_lightning/src/util/BitUtil.hpp
+++ b/pennylane_lightning/src/util/BitUtil.hpp
@@ -174,7 +174,8 @@ inline auto log2PerfectPower(unsigned long val) -> size_t {
 
 constexpr auto constLog2PerfectPower(size_t value) -> size_t {
     if (value == 0) {
-        return 0; // not well defined
+        return 0; // not well defined. TODO: Raise an exception instead in
+                  // a later version.
     }
     size_t n = 0;
     while ((value & 1U) == 0U) {
diff --git a/pennylane_lightning/src/util/CMakeLists.txt b/pennylane_lightning/src/util/CMakeLists.txt
index 20e75282f5..36b51f00e6 100644
--- a/pennylane_lightning/src/util/CMakeLists.txt
+++ b/pennylane_lightning/src/util/CMakeLists.txt
@@ -1,7 +1,11 @@
 project(lightning_utils LANGUAGES CXX)
 set(CMAKE_CXX_STANDARD 17)
 
-add_library(lightning_utils INTERFACE)
+set(UTIL_FILES RuntimeInfo.cpp CACHE INTERNAL "" FORCE)
+
+add_library(lightning_utils STATIC ${UTIL_FILES})
 target_include_directories(lightning_utils INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
                                                      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/DS>
-)
\ No newline at end of file
+)
+
+set_property(TARGET lightning_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
diff --git a/pennylane_lightning/src/util/Macros.hpp b/pennylane_lightning/src/util/Macros.hpp
index eeba364ba8..a8cb8c1d7d 100644
--- a/pennylane_lightning/src/util/Macros.hpp
+++ b/pennylane_lightning/src/util/Macros.hpp
@@ -13,10 +13,19 @@
 // limitations under the License.
 /**
  * @file
- * Define some builtin alternatives
+ * Define macros and compile-time constants.
  */
 #pragma once
 
+#include <string>
+
+/**
+ * @brief Predefined macro variable to a string. Use std::format instead in
+ * C++20.
+ */
+#define PL_TO_STR_INDIR(x) #x
+#define PL_TO_STR(VAR) PL_TO_STR_INDIR(VAR)
+
 #if defined(__GNUC__) || defined(__clang__)
 #define PL_UNREACHABLE __builtin_unreachable()
 #elif defined(_MSC_VER)
@@ -90,3 +99,69 @@
 #define PL_FORCE_INLINE
 #endif
 #endif
+
+namespace Pennylane::Util::Constant {
+enum class CPUArch { AMD64, PPC64, ARM, Unknown };
+
+constexpr auto getCPUArchClangGCC() {
+#if defined(__x86_64__)
+    return CPUArch::AMD64;
+#elif defined(__powerpc64__)
+    return CPUArch::PPC64;
+#elif defined(__arm__)
+    return CPUArch::ARM;
+#else
+    return CPUArch::Unknown;
+#endif
+}
+
+constexpr auto getCPUArchMSVC() {
+#if defined(_M_AMD64)
+    return CPUArch::AMD64;
+#elif defined(_M_PPC)
+    return CPUArch::PPC64;
+#elif defined(_M_ARM)
+    return CPUArch::ARM;
+#else
+    return CPUArch::Unknown;
+#endif
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+[[maybe_unused]] constexpr static auto cpu_arch = getCPUArchClangGCC();
+#elif defined(_MSC_VER)
+[[maybe_unused]] constexpr static auto cpu_arch = getCPUArchMSVC();
+#else
+[[maybe_unused]] constexpr static auto cpu_arch = CPUArch::Unknown;
+#endif
+
+enum class Compiler { GCC, Clang, MSVC, Unknown };
+
+template <Compiler compiler>
+constexpr auto getCompilerVersion() -> std::string_view {
+    return "Unknown version";
+}
+template <>
+constexpr auto getCompilerVersion<Compiler::GCC>() -> std::string_view {
+    return PL_TO_STR(__GNUC__) "." PL_TO_STR(__GNUC_MINOR__) "." PL_TO_STR(
+        __GNUC_PATCHLEVEL__);
+}
+template <>
+constexpr auto getCompilerVersion<Compiler::Clang>() -> std::string_view {
+    return PL_TO_STR(__clang_major__) "." PL_TO_STR(
+        __clang_minor__) "." PL_TO_STR(__clang_patchlevel__);
+}
+template <>
+constexpr auto getCompilerVersion<Compiler::MSVC>() -> std::string_view {
+    return PL_TO_STR(_MSC_FULL_VER);
+}
+#if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+[[maybe_unused]] constexpr static auto compiler = Compiler::GCC;
+#elif defined(__clang__)
+[[maybe_unused]] constexpr static auto compiler = Compiler::Clang;
+#elif defined(_MSC_VER)
+[[maybe_unused]] constexpr static auto compiler = Compiler::MSVC;
+#else
+[[maybe_unused]] constexpr static auto compiler = Compiler::Unknown;
+#endif
+} // namespace Pennylane::Util::Constant
diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
index cb271da036..e3779e42f7 100644
--- a/pennylane_lightning/src/util/Memory.hpp
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -22,17 +22,22 @@
 #include "BitUtil.hpp"
 #include "TypeList.hpp"
 
-/* Apple clang does not support std::aligned_alloc in Mac 10.14 */
-
 namespace Pennylane {
 /**
  * @brief Custom aligned allocate function. As appleclang does not support
  * std::aligned_alloc in Mac OS 10.14, we use posix_memalign function.
  *
- * Note that alignment must be larger than max_align_t.
+ * Note that alignment must be larger than max_align_t. Otherwise, the behavior
+ * is undefined.
+ *
+ * @param alignment Alignment value we want for the data pointer
+ * @param bytes Number of bytes to allocate
+ * @return Memory pointer
  */
 inline auto alignedAlloc(uint32_t alignment, size_t bytes) -> void * {
-#if defined(__clang__) // probably AppleClang
+#if defined(__clang__)
+    /* Apple clang does not support std::aligned_alloc in Mac 10.14.
+     * Thus we use Posix function instead. */
     void *p;
     posix_memalign(&p, alignment, bytes);
     return p;
@@ -43,6 +48,11 @@ inline auto alignedAlloc(uint32_t alignment, size_t bytes) -> void * {
 #endif
 }
 
+/**
+ * @brief Free memory allocated by alignedAlloc.
+ *
+ * @param p Pointer to the memory location allocated by alignedAlloc
+ */
 inline void alignedFree(void *p) {
 #if defined(__clang__)
     return ::free(p); // NOLINT(hicpp-no-malloc)
@@ -53,12 +63,25 @@ inline void alignedFree(void *p) {
 #endif
 }
 
+/**
+ * @brief C++ Allocator class for aligned memory.
+ *
+ * @tparam T Datatype to allocate
+ */
 template <class T> struct AlignedAllocator {
     uint32_t alignment_;
     using value_type = T;
 
+    /**
+     * @brief Constructor of AlignedAllocator class
+     *
+     * @param alignment Memory alignment we want.
+     */
     constexpr explicit AlignedAllocator(uint32_t alignment)
         : alignment_{alignment} {
+        // We do not check input now as it doesn't allow the constructor to be
+        // a constexpr.
+        // TODO: Using exception is allowed in GCC>=10
         // assert(Util::isPerfectPowerOf2(alignment));
     }
 
@@ -69,6 +92,12 @@ template <class T> struct AlignedAllocator {
         [[maybe_unused]] const AlignedAllocator<U> &rhs) noexcept
         : alignment_{rhs.alignment_} {}
 
+    /**
+     * @brief Allocate memory for the given number of objects of type T
+     *
+     * @param size The number of T objects for the allocation
+     * @return Allocated aligned memory
+     */
     [[nodiscard]] T *allocate(std::size_t size) {
         if (size == 0) {
             return nullptr;
@@ -86,6 +115,12 @@ template <class T> struct AlignedAllocator {
         return static_cast<T *>(p);
     }
 
+    /**
+     * @brief Deallocate allocated memory
+     *
+     * @param p Pointer to the allocated data
+     * @param size Size of the data we allocated (unused).
+     */
     void deallocate(T *p, [[maybe_unused]] std::size_t size) noexcept {
         if (alignment_ > alignof(std::max_align_t)) {
             alignedFree(p);
@@ -103,25 +138,29 @@ template <class T> struct AlignedAllocator {
     }
 };
 
+/**
+ * @brief Compare two allocators
+ *
+ * By [the standard](https://en.cppreference.com/w/cpp/named_req/Allocator),
+ * two allocators are equal if the memory allocated by one can be deallocated
+ * by the other.
+ */
 template <class T, class U>
 bool operator==([[maybe_unused]] const AlignedAllocator<T> &lhs,
                 [[maybe_unused]] const AlignedAllocator<U> &rhs) {
     return lhs.alignment_ == rhs.alignment_;
 }
 
+/**
+ * @brief Compare two allocators. See `%operator==` above.
+ */
 template <class T, class U, uint32_t alignment>
 bool operator!=([[maybe_unused]] const AlignedAllocator<T> &lhs,
                 [[maybe_unused]] const AlignedAllocator<U> &rhs) {
     return lhs.alignment_ != rhs.alignment_;
 }
 
-/**
- * @brief This function calculate the common multiplier of alignments of all
- * kernels.
- *
- * As all alignment must be a multiple of 2, we just can choose the maximum
- * alignment.
- */
+///@cond DEV
 template <typename TypeList> struct commonAlignmentHelper {
     constexpr static uint32_t value =
         std::max(TypeList::Type::packed_bytes,
@@ -130,7 +169,17 @@ template <typename TypeList> struct commonAlignmentHelper {
 template <> struct commonAlignmentHelper<void> {
     constexpr static uint32_t value = 4U;
 };
+///@endcond
 
+/**
+ * @brief Calculate the common multiple of the alignments of the kernels
+ * given in TypeList.
+ *
+ * As all alignment must be a power of 2, we just can choose the maximum
+ * alignment.
+ *
+ * @tparam TypeList Type list of kernels.
+ */
 template <typename TypeList>
 [[maybe_unused]] constexpr static size_t common_alignment =
     commonAlignmentHelper<TypeList>::value;
diff --git a/pennylane_lightning/src/util/RuntimeInfo.cpp b/pennylane_lightning/src/util/RuntimeInfo.cpp
new file mode 100644
index 0000000000..e260cc6894
--- /dev/null
+++ b/pennylane_lightning/src/util/RuntimeInfo.cpp
@@ -0,0 +1,68 @@
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "RuntimeInfo.hpp"
+
+#if (defined(__GNUC__) || defined(__clang__)) &&                               \
+    (defined(__x86_64__) || defined(__i386__))
+#include <cpuid.h> // x86-only header
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#endif
+namespace Pennylane::Util {
+#if (defined(__GNUC__) || defined(__clang__)) &&                               \
+    (defined(__x86_64__) || defined(__i386__))
+RuntimeInfo::InternalRuntimeInfo::InternalRuntimeInfo() {
+    const auto nids = __get_cpuid_max(0x00, nullptr);
+    if (nids == 0) {
+        return; // cpuid is not supported
+    }
+    unsigned int eax = 0;
+    unsigned int ebx = 0;
+    unsigned int ecx = 0;
+    unsigned int edx = 0;
+    if (nids >= 1) { // leaf 1 is available
+        __get_cpuid(1, &eax, &ebx, &ecx, &edx);
+        f_1_ecx = ecx;
+        f_1_edx = edx;
+    }
+    if (nids >= 7) { // NOLINT(readability-magic-numbers)
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
+        f_7_ebx = ebx;
+        f_7_ecx = ecx;
+    }
+}
+#elif defined(_MSC_VER)
+RuntimeInfo::InternalRuntimeInfo::InternalRuntimeInfo() {
+    std::array<int, 4> cpui;
+    __cpuid(cpui.data(), 0);
+
+    nids = cpui[0];
+
+    if (nids >= 1) {
+        __cpuidex(cpui.data(), 1, 0);
+        f_1_ecx = cpui[2]; // ECX of leaf 1
+        f_1_edx = cpui[3]; // EDX of leaf 1
+    }
+
+    if (nids >= 7) {
+        __cpuidex(cpui.data(), 7, 0);
+        f_7_ebx = cpui[1]; // EBX of leaf 7, sub-leaf 0
+        f_7_ecx = cpui[2]; // ECX of leaf 7, sub-leaf 0
+    }
+}
+#else
+RuntimeInfo::InternalRuntimeInfo::InternalRuntimeInfo(){};
+#endif
+} // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/RuntimeInfo.hpp b/pennylane_lightning/src/util/RuntimeInfo.hpp
new file mode 100644
index 0000000000..416422bd45
--- /dev/null
+++ b/pennylane_lightning/src/util/RuntimeInfo.hpp
@@ -0,0 +1,52 @@
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/**
+ * @file
+ * Runtime information based on cpuid
+ */
+#pragma once
+#include <bitset>
+
+namespace Pennylane::Util {
+/**
+ * @brief CPU feature flags read through cpuid; only usable on x86 or AMD64.
+ */
+class RuntimeInfo {
+  private:
+    struct InternalRuntimeInfo {
+        InternalRuntimeInfo();
+        // Feature registers as bit sets (default-constructed bitsets are 0).
+        std::bitset<32> f_1_ecx; // cpuid leaf 1, ECX
+        std::bitset<32> f_1_edx; // cpuid leaf 1, EDX
+        std::bitset<32> f_7_ebx; // cpuid leaf 7 (sub-leaf 0), EBX
+        std::bitset<32> f_7_ecx; // cpuid leaf 7 (sub-leaf 0), ECX
+    };
+    // NOTE(review): OS support for AVX state (OSXSAVE/XGETBV) is not checked.
+    static const inline InternalRuntimeInfo internal_runtime_info_;
+
+  public:
+    static inline bool AVX() {
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        return internal_runtime_info_.f_1_ecx[28]; // leaf 1, ECX bit 28
+    }
+    static inline bool AVX2() {
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        return internal_runtime_info_.f_7_ebx[5]; // leaf 7, EBX bit 5
+    }
+    static inline bool AVX512F() {
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        return internal_runtime_info_.f_7_ebx[16]; // leaf 7, EBX bit 16
+    }
+};
+} // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/TypeList.hpp b/pennylane_lightning/src/util/TypeList.hpp
index 97db820da7..a53c3cbd5d 100644
--- a/pennylane_lightning/src/util/TypeList.hpp
+++ b/pennylane_lightning/src/util/TypeList.hpp
@@ -41,22 +41,33 @@ template <typename T> struct TypeNode<T> {
  */
 template <typename... Ts> using TypeList = TypeNode<Ts...>;
 
+/**
+ * @brief Get N-th type of a type list.
+ *
+ * @tparam TypeList Type list
+ * @tparam n The position of a type to extract
+ */
 template <typename TypeList, size_t n> struct getNth {
     using Type = typename getNth<typename TypeList::Next, n - 1>::Type;
 };
 
+/// @cond DEV
 template <typename TypeList> struct getNth<TypeList, 0> {
     static_assert(!std::is_same_v<typename TypeList::Type, void>,
                   "The given n is larger than the length of the type list.");
     using Type = typename TypeList::Type;
 };
+/// @endcond
 
 /**
- * @brief Alias
+ * @brief Convenient alias of getNth
  */
 template <typename TypeList, size_t n>
 using getNthType = typename getNth<TypeList, n>::Type;
 
+/**
+ * @brief Get the size of a type list
+ */
 template <typename TypeList> constexpr size_t length() {
     if constexpr (std::is_same_v<TypeList, void>) {
         return 0;
@@ -65,8 +76,15 @@ template <typename TypeList> constexpr size_t length() {
     }
 }
 
+/**
+ * @brief Prepend a type to a type list.
+ *
+ * @tparam T Type to prepend
+ * @tparam U TypeList
+ */
 template <typename T, typename U> struct PrependToTypeList;
 
+/// @cond DEV
 template <typename T, typename... Ts>
 struct PrependToTypeList<T, TypeNode<Ts...>> {
     using Type = TypeNode<T, Ts...>;
@@ -74,5 +92,5 @@ struct PrependToTypeList<T, TypeNode<Ts...>> {
 template <typename T> struct PrependToTypeList<T, void> {
     using Type = TypeNode<T, void>;
 };
-
+/// @endcond
 } // namespace Pennylane::Util

From fc54bcb88adeaa2f324f0dc5945d28986d0bde4c Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sat, 5 Mar 2022 21:10:43 -0500
Subject: [PATCH 36/94] Fix for windows

---
 pennylane_lightning/src/bindings/Bindings.hpp | 4 +++-
 pennylane_lightning/src/util/RuntimeInfo.cpp  | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index fe1bdfa4b3..2c213eff40 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -256,6 +256,8 @@ auto getCompileInfo() -> pybind11::dict {
             return "ARM";
         default:
             return "Unknown";
+        default:
+            break;
         }
     }();
 
@@ -267,7 +269,7 @@ auto getCompileInfo() -> pybind11::dict {
             return "Clang";
         case Compiler::MSVC:
             return "MSVC";
-        case Compiler::Unknown:
+        default:
             return "Unknown";
         }
     }();
diff --git a/pennylane_lightning/src/util/RuntimeInfo.cpp b/pennylane_lightning/src/util/RuntimeInfo.cpp
index e260cc6894..b8599d770f 100644
--- a/pennylane_lightning/src/util/RuntimeInfo.cpp
+++ b/pennylane_lightning/src/util/RuntimeInfo.cpp
@@ -48,7 +48,7 @@ RuntimeInfo::InternalRuntimeInfo::InternalRuntimeInfo() {
     std::array<int, 4> cpui;
     __cpuid(cpui.data(), 0);
 
-    nids = cpui[0];
+    int nids = cpui[0];
 
     if (nids >= 1) {
         __cpuidex(cpui.data(), 1, 0);

From ad5c60aabf87f277b87491c2d487bbc8ec1175f2 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sat, 5 Mar 2022 22:32:33 -0500
Subject: [PATCH 37/94] Fix

---
 pennylane_lightning/src/bindings/Bindings.hpp |  2 --
 .../src/examples/benchmark_gate.cpp           | 20 +++++++++----------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index 2c213eff40..dc5bcbc873 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -256,8 +256,6 @@ auto getCompileInfo() -> pybind11::dict {
             return "ARM";
         default:
             return "Unknown";
-        default:
-            break;
         }
     }();
 
diff --git a/pennylane_lightning/src/examples/benchmark_gate.cpp b/pennylane_lightning/src/examples/benchmark_gate.cpp
index 7dff1f507c..a20b92d4b9 100644
--- a/pennylane_lightning/src/examples/benchmark_gate.cpp
+++ b/pennylane_lightning/src/examples/benchmark_gate.cpp
@@ -19,8 +19,7 @@ using PrecisionT = double;
 #endif
 
 using namespace Pennylane;
-using namespace Pennylane::Gates;
-using namespace Pennylane::Util;
+using Util::operator<<;
 
 struct GateDesc {
     std::string name;
@@ -45,18 +44,19 @@ auto generateGateSequence(RandomEngine &re, const std::string &gate_name,
                           const size_t num_reps, const size_t num_qubits,
                           const size_t num_wires_for_multi_qubit)
     -> std::vector<GateDesc> {
-    using Gates::Constant::multi_qubit_gates;
+    using namespace Gates::Constant;
+    using Gates::GateOperation;
 
-    const GateOperation gate_op = Util::lookup(
-        Util::reverse_pairs(Constant::gate_names), std::string_view(gate_name));
+    const GateOperation gate_op = Util::lookup(Util::reverse_pairs(gate_names),
+                                               std::string_view(gate_name));
     const size_t num_wires = [=]() {
         if (Util::array_has_elt(multi_qubit_gates, gate_op)) {
             // if multi qubit gate
             return num_wires_for_multi_qubit;
         }
-        return Util::lookup(Constant::gate_wires, gate_op);
+        return Util::lookup(gate_wires, gate_op);
     }();
-    const size_t num_params = Util::lookup(Constant::gate_num_params, gate_op);
+    const size_t num_params = Util::lookup(gate_num_params, gate_op);
 
     std::vector<GateDesc> gate_seq;
     std::uniform_int_distribution<size_t> inverse_dist(0, 1);
@@ -79,7 +79,7 @@ auto generateGateSequence(RandomEngine &re, const std::string &gate_name,
     return gate_seq;
 }
 
-double benchmarkGate(KernelType kernel, const size_t num_qubits,
+double benchmarkGate(Gates::KernelType kernel, const size_t num_qubits,
                      const std::vector<GateDesc> &gate_seq) {
     // Run benchmark. Total num_reps number of gates is used.
     StateVectorManagedCPU<PrecisionT> svdat{num_qubits};
@@ -97,7 +97,7 @@ double benchmarkGate(KernelType kernel, const size_t num_qubits,
 }
 
 template <typename RandomEngine>
-double runBenchmarkGate(RandomEngine &re, KernelType kernel,
+double runBenchmarkGate(RandomEngine &re, Gates::KernelType kernel,
                         const std::string &gate_name, size_t num_reps,
                         size_t num_qubits, size_t num_wires_for_multi_qubit) {
     auto gate_seq = generateGateSequence(re, gate_name, num_reps, num_qubits,
@@ -127,7 +127,7 @@ double runBenchmarkGate(RandomEngine &re, KernelType kernel,
  * @return Returns 0 is completed successfully
  */
 int main(int argc, char *argv[]) {
-    namespace Constant = Gates::Constant;
+    using namespace Pennylane::Gates;
     // Handle input
     if (argc != 5 && argc != 6) { // NOLINT(readability-magic-numbers)
         std::cerr

From 39766bf8921a3287a4aa9bb941297e2f47f06378 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sat, 5 Mar 2022 22:53:17 -0500
Subject: [PATCH 38/94] Fix for MSVC

---
 pennylane_lightning/src/util/RuntimeInfo.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pennylane_lightning/src/util/RuntimeInfo.cpp b/pennylane_lightning/src/util/RuntimeInfo.cpp
index b8599d770f..c6cd5ff803 100644
--- a/pennylane_lightning/src/util/RuntimeInfo.cpp
+++ b/pennylane_lightning/src/util/RuntimeInfo.cpp
@@ -13,11 +13,14 @@
 // limitations under the License.
 #include "RuntimeInfo.hpp"
 
+#include <array>
+
 #if defined(__GNUC__) || defined(__clang__)
 #include <cpuid.h>
 #elif defined(_MSC_VER)
 #include <intrin.h>
 #endif
+
 namespace Pennylane::Util {
 #if defined(__GNUC__) || defined(__clang__)
 RuntimeInfo::InternalRuntimeInfo::InternalRuntimeInfo() {
@@ -59,7 +62,7 @@ RuntimeInfo::InternalRuntimeInfo::InternalRuntimeInfo() {
     if (nids >= 7) {
         __cpuidex(cpui.data(), 7, 0);
         f_7_ebx = cpui[1];
-        f_7_ecx = cpui[2]
+        f_7_ecx = cpui[2];
     }
 }
 #else

From 34ee6586f1db79d81922d19632da9f2a3eaca823 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sat, 5 Mar 2022 23:32:34 -0500
Subject: [PATCH 39/94] Fix benchmark plot

---
 .../src/examples/plot_benchmark.py            | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/pennylane_lightning/src/examples/plot_benchmark.py b/pennylane_lightning/src/examples/plot_benchmark.py
index 0d650071ec..329b833ead 100755
--- a/pennylane_lightning/src/examples/plot_benchmark.py
+++ b/pennylane_lightning/src/examples/plot_benchmark.py
@@ -1,26 +1,24 @@
 #!/usr/bin/env python3
-import csv
 import sys
 import numpy as np
 from pathlib import Path
 import matplotlib.pyplot as plt
 import argparse
+import json
 
 import re
 
 plt.rc("font", family="sans-serif")
 
 
-def parse_result_csv(filepath):
+def parse_result_json(filepath):
     n_qubits = []
     times = []
-    with filepath.open() as csvfile:
-        reader = csv.reader(csvfile)
-        next(reader)  # ignore the first line
-        for row in reader:
-            n_qubits.append(int(row[0]))
-            times.append(float(row[1]))
+    with filepath.open() as f:
+        data = json.load(f)
 
+    n_qubits = [int(d["N"]) for d in data]
+    times = [float(d["time"]) for d in data]
     return n_qubits, times
 
 
@@ -37,10 +35,10 @@ def parse_result_csv(filepath):
     res_dir = Path(args.path)
     gate_name = args.gate_name
 
-    filename_rgx = re.compile(f"^benchmark_(.*?)_{gate_name}.csv$")
+    filename_rgx = re.compile(f"^{gate_name}_(.*?).json$")
 
     res_files = []
-    for file in res_dir.glob("*.csv"):
+    for file in res_dir.glob("*.json"):
         m = filename_rgx.match(file.name)
         if m is not None:
             res_files.append((m.group(1), file))
@@ -57,7 +55,7 @@ def parse_result_csv(filepath):
     total_num_qubits = set()
 
     for kernel_idx, (kernel_name, res_file) in enumerate(res_files):
-        n_qubits, times = parse_result_csv(res_file)
+        n_qubits, times = parse_result_json(res_file)
         total_num_qubits |= set(n_qubits)
         n_qubits = np.array(n_qubits, dtype=float)
         plt.bar(n_qubits + 0.8 * (kernel_idx - num_kernels / 2 + 1 / 2), times, label=kernel_name)

From e34ca57da69054e805e7fa53929381afba6bad24 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sat, 5 Mar 2022 23:34:16 -0500
Subject: [PATCH 40/94] Fix for MSVC

---
 pennylane_lightning/src/util/RuntimeInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/src/util/RuntimeInfo.cpp b/pennylane_lightning/src/util/RuntimeInfo.cpp
index c6cd5ff803..5a208cb540 100644
--- a/pennylane_lightning/src/util/RuntimeInfo.cpp
+++ b/pennylane_lightning/src/util/RuntimeInfo.cpp
@@ -56,7 +56,7 @@ RuntimeInfo::InternalRuntimeInfo::InternalRuntimeInfo() {
     if (nids >= 1) {
         __cpuidex(cpui.data(), 1, 0);
         f_1_ecx = cpui[2];
-        f_1_edx = cpui[3]
+        f_1_edx = cpui[3];
     }
 
     if (nids >= 7) {

From b7f1e6e54679ccb6509b37d1f3ce3d34261f3a4c Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sun, 6 Mar 2022 00:20:01 -0500
Subject: [PATCH 41/94] Rollback makefile

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index edef79bab5..02556dc3e0 100644
--- a/Makefile
+++ b/Makefile
@@ -75,7 +75,7 @@ coverage:
 
 test-cpp:
 	rm -rf ./BuildTests
-	cmake $(LIGHTNING_CPP_DIR) -BBuildTests -DBUILD_TESTS=ON -DENABLE_OPENMP=OFF
+	cmake $(LIGHTNING_CPP_DIR) -BBuildTests -DBUILD_TESTS=ON
 	cmake --build ./BuildTests --target runner
 	cmake --build ./BuildTests --target test
 

From f05118ebb228ddefffc83e50579e8db4851ee4c9 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sun, 6 Mar 2022 10:16:05 -0500
Subject: [PATCH 42/94] Fix snapshot

---
 pennylane_lightning/lightning_qubit.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pennylane_lightning/lightning_qubit.py b/pennylane_lightning/lightning_qubit.py
index 014e4acb51..c25b700c7b 100644
--- a/pennylane_lightning/lightning_qubit.py
+++ b/pennylane_lightning/lightning_qubit.py
@@ -195,6 +195,8 @@ def apply_lightning(self, state, operations, dtype=np.complex128):
 
         for o in operations:
             name = o.name.split(".")[0]  # The split is because inverse gates have .inv appended
+            if name == 'Snapshot':
+                continue
             method = getattr(sim, name, None)
 
             wires = self.wires.indices(o.wires)

From 4bd95989e21fb38819ee1b2293ba18bdb275a0d4 Mon Sep 17 00:00:00 2001
From: Dev version update bot <chae-yeun@xanadu.ai>
Date: Tue, 8 Mar 2022 04:28:41 +0000
Subject: [PATCH 43/94] Auto update version

---
 pennylane_lightning/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index b362bf426f..9ffb524fd2 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.22.0-dev13"
+__version__ = "0.22.0-dev14"

From 765c684a7d84d1bcbed4b10c4e1cca647774f54d Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Mon, 7 Mar 2022 23:28:51 -0500
Subject: [PATCH 44/94] Remove snapshot check in apply

---
 pennylane_lightning/lightning_qubit.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pennylane_lightning/lightning_qubit.py b/pennylane_lightning/lightning_qubit.py
index d402d13513..9cc9277363 100644
--- a/pennylane_lightning/lightning_qubit.py
+++ b/pennylane_lightning/lightning_qubit.py
@@ -202,8 +202,6 @@ def apply_lightning(self, state, operations, dtype=np.complex128):
 
         for o in operations:
             name = o.name.split(".")[0]  # The split is because inverse gates have .inv appended
-            if name == 'Snapshot':
-                continue
             method = getattr(sim, name, None)
 
             wires = self.wires.indices(o.wires)

From 1c2c50ffdf1cc9e28e291975fc8b98de93209faf Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Tue, 8 Mar 2022 00:02:14 -0500
Subject: [PATCH 45/94] Update tests

---
 .github/workflows/tests.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 2ba5498498..3d647dd8d0 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -82,6 +82,9 @@ jobs:
           pip uninstall pennylane -y
           pip install git+https://github.com/PennyLaneAI/pennylane.git
 
+      - name: Install torch to test interface
+        run: pip install torch
+
       - name: Install lightning.qubit device
         run: |
           cd main

From b0207017d1bc244b570641ef3ad59a365de3570c Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 13:10:56 -0500
Subject: [PATCH 46/94] Fix some functions; Documents added

---
 .../src/algorithms/AdjointDiff.hpp            |   4 +-
 pennylane_lightning/src/bindings/Bindings.cpp |   7 +-
 pennylane_lightning/src/bindings/Bindings.hpp |  36 +++++-
 .../src/gates/SelectKernel.hpp                |  26 ++--
 .../cpu_kernels/GateImplementationsLM.hpp     |   2 +-
 .../src/simulator/CPUMemoryModel.hpp          |  22 ++++
 .../src/simulator/DispatchKeys.hpp            |   7 +
 .../src/simulator/DynamicDispatcher.hpp       |  30 +++--
 .../src/simulator/Measures.hpp                |   3 +-
 .../src/simulator/StateVectorBase.hpp         | 121 +++---------------
 .../src/simulator/StateVectorCPU.hpp          |  34 +++++
 .../src/simulator/StateVectorManagedCPU.hpp   |  42 +++++-
 .../src/simulator/StateVectorRawCPU.hpp       |   3 +
 pennylane_lightning/src/tests/Test_Util.cpp   |  10 +-
 pennylane_lightning/src/util/BitUtil.hpp      |  37 ++++--
 .../src/util/IntegerInterval.hpp              |  33 ++++-
 pennylane_lightning/src/util/Util.hpp         |  15 ++-
 17 files changed, 265 insertions(+), 167 deletions(-)

diff --git a/pennylane_lightning/src/algorithms/AdjointDiff.hpp b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
index d58dad3e7b..40766ac7de 100644
--- a/pennylane_lightning/src/algorithms/AdjointDiff.hpp
+++ b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
@@ -176,7 +176,7 @@ template <class T = double> class AdjointJacobian {
         #endif
             for (size_t h_i = 0; h_i < num_observables; h_i++) {
                 try {
-                    states[h_i].updateData(reference_state.getData());
+                    states[h_i].updateData(reference_state.getDataVector());
                     applyObservable(states[h_i], observables[h_i]);
                 } catch (...) {
                     #if defined(_OPENMP)
@@ -363,7 +363,7 @@ template <class T = double> class AdjointJacobian {
                         "differentiation method");
             if ((ops_name[op_idx] != "QubitStateVector") &&
                 (ops_name[op_idx] != "BasisState")) {
-                mu.updateData(lambda.getData());
+                mu.updateData(lambda.getDataVector());
                 applyOperationAdj(lambda, ops, op_idx);
 
                 if (ops.hasParams(op_idx)) {
diff --git a/pennylane_lightning/src/bindings/Bindings.cpp b/pennylane_lightning/src/bindings/Bindings.cpp
index a0d39159f4..94db3aff37 100644
--- a/pennylane_lightning/src/bindings/Bindings.cpp
+++ b/pennylane_lightning/src/bindings/Bindings.cpp
@@ -56,13 +56,16 @@ void lightning_class_bindings(py::module_ &m) {
     //***********************************************************************//
     //
     std::string class_name = "StateVectorC" + bitsize;
-    auto pyclass = py::class_<StateVectorRaw<PrecisionT>>(m, class_name.c_str(),
-                                                          py::module_local());
+    auto pyclass = py::class_<StateVectorRawCPU<PrecisionT>>(
+        m, class_name.c_str(), py::module_local());
     pyclass.def(py::init(&createRaw<PrecisionT>));
 
     registerGatesForStateVector<PrecisionT, ParamT,
                                 StateVectorRawCPU<PrecisionT>>(pyclass);
 
+    pyclass.def("kernel_map", &svKernelMap<PrecisionT>,
+                "Get internal kernels for operations");
+
     //***********************************************************************//
     //                              Observable
     //***********************************************************************//
diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index dc5bcbc873..2ff03e6cdc 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -114,7 +114,7 @@ auto toNumpyArray(const StateVectorManagedCPU<PrecisionT> &sv)
 /**
  * @brief Get memory alignment of a given numpy array.
  *
- * @param NumpyArray Pybind11's numpy array type.
+ * @param numpyArray Pybind11's numpy array type.
  * @return Memory model describing alignment
  */
 auto getNumpyArrayAlignment(const pybind11::array &numpyArray)
@@ -239,6 +239,40 @@ void registerGatesForStateVector(PyClass &pyclass) {
     });
 }
 
+/**
+ * @brief Get the kernel name used for each gate, generator and matrix op
+ */
+template <class PrecisionT>
+auto svKernelMap(const StateVectorRawCPU<PrecisionT> &sv) -> pybind11::dict {
+    pybind11::dict res_map;
+    namespace Constant = Gates::Constant;
+
+    for (const auto &[gate_op, kernel] : sv.getGateKernelMap()) {
+        const auto key =
+            std::string(Util::lookup(Constant::gate_names, gate_op));
+        const auto value = Util::lookup(Gates::kernel_id_name_pairs, kernel);
+
+        res_map[key.c_str()] = value;
+    }
+
+    for (const auto &[gntr_op, kernel] : sv.getGeneratorKernelMap()) {
+        const auto key =
+            std::string(Util::lookup(Constant::generator_names, gntr_op));
+        const auto value = Util::lookup(Gates::kernel_id_name_pairs, kernel);
+
+        res_map[key.c_str()] = value;
+    }
+
+    for (const auto &[mat_op, kernel] : sv.getMatrixKernelMap()) {
+        const auto key =
+            std::string(Util::lookup(Constant::matrix_names, mat_op));
+        const auto value = Util::lookup(Gates::kernel_id_name_pairs, kernel);
+
+        res_map[key.c_str()] = value;
+    }
+    return res_map;
+}
+
 /**
  * @brief Return basic information of the compiled binary.
  */
diff --git a/pennylane_lightning/src/gates/SelectKernel.hpp b/pennylane_lightning/src/gates/SelectKernel.hpp
index 5057ed9b42..a43428ddda 100644
--- a/pennylane_lightning/src/gates/SelectKernel.hpp
+++ b/pennylane_lightning/src/gates/SelectKernel.hpp
@@ -35,31 +35,23 @@ namespace Pennylane::Gates {
  * As Util::lookup can be used in constexpr context, this function is redundant
  * (by the standard). But GCC 9 still does not accept Util::lookup in constexpr
  * some cases.
+ *
+ * @tparam e Enum value
+ * @tparam T Value type of array
+ * @tparam size Size of the array
+ *
+ * @param arr Array of key, value pairs
  */
-///@{
-template <GateOperation op, class T, size_t size>
-constexpr auto
-static_lookup(const std::array<std::pair<GateOperation, T>, size> &arr) -> T {
-    for (size_t idx = 0; idx < size; idx++) {
-        if (std::get<0>(arr[idx]) == op) {
-            return std::get<1>(arr[idx]);
-        }
-    }
-    return T{};
-}
-
-template <GeneratorOperation op, class T, size_t size>
+template <auto e, class T, size_t size>
 constexpr auto
-static_lookup(const std::array<std::pair<GeneratorOperation, T>, size> &arr)
-    -> T {
+static_lookup(const std::array<std::pair<decltype(e), T>, size> &arr) -> T {
     for (size_t idx = 0; idx < size; idx++) {
-        if (std::get<0>(arr[idx]) == op) {
+        if (std::get<0>(arr[idx]) == e) {
             return std::get<1>(arr[idx]);
         }
     }
     return T{};
 }
-///@}
 
 /// @cond DEV
 namespace Internal {
diff --git a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
index 87fb7469a8..1ecb8d7fbe 100644
--- a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
+++ b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
@@ -41,7 +41,7 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
   private:
     /* Alias utility functions */
     static constexpr auto fillLeadingOnes = Util::fillLeadingOnes;
-    static constexpr auto fillTrailingOnes = Util::fillTrailingOnes;
+    static constexpr auto fillTrailingOnes = Util::fillTrailingOnes<size_t>;
     static constexpr auto bitswap = Util::bitswap;
 
   public:
diff --git a/pennylane_lightning/src/simulator/CPUMemoryModel.hpp b/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
index 282a80390f..09debc9804 100644
--- a/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
+++ b/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
@@ -24,6 +24,10 @@
 #include <memory>
 
 namespace Pennylane {
+
+/**
+ * @brief Enum class for defining CPU memory alignments
+ */
 enum class CPUMemoryModel : uint8_t {
     Unaligned,
     Aligned256,
@@ -32,6 +36,12 @@ enum class CPUMemoryModel : uint8_t {
     BEGIN = Unaligned,
 };
 
+/**
+ * @brief Compute alignment of a given data pointer
+ *
+ * @param ptr Pointer to data
+ * @return CPUMemoryModel
+ */
 inline auto getMemoryModel(const void *ptr) -> CPUMemoryModel {
     if ((reinterpret_cast<uintptr_t>(ptr) % 64) == 0) {
         return CPUMemoryModel::Aligned512;
@@ -47,6 +57,8 @@ inline auto getMemoryModel(const void *ptr) -> CPUMemoryModel {
 /**
  * @brief Choose the best memory model to use using runtime/compile-time
  * information.
+ *
+ * @return CPUMemoryModel
  */
 inline auto bestCPUMemoryModel() -> CPUMemoryModel {
     if constexpr (use_avx512f) {
@@ -64,6 +76,11 @@ inline auto bestCPUMemoryModel() -> CPUMemoryModel {
     return CPUMemoryModel::Unaligned;
 }
 
+/**
+ * @brief Return alignment of a given memory model.
+ *
+ * @tparam T Data type
+ */
 template <class T>
 constexpr inline auto getAlignment(CPUMemoryModel memory_model) -> uint32_t {
     switch (memory_model) {
@@ -79,6 +96,11 @@ constexpr inline auto getAlignment(CPUMemoryModel memory_model) -> uint32_t {
     PL_UNREACHABLE;
 }
 
+/**
+ * @brief Get a corresponding allocator for standard library containers.
+ *
+ * @tparam T Data type
+ */
 template <class T>
 constexpr auto getAllocator(CPUMemoryModel memory_model)
     -> AlignedAllocator<T> {
diff --git a/pennylane_lightning/src/simulator/DispatchKeys.hpp b/pennylane_lightning/src/simulator/DispatchKeys.hpp
index a6d2f4ba94..34431c3ca6 100644
--- a/pennylane_lightning/src/simulator/DispatchKeys.hpp
+++ b/pennylane_lightning/src/simulator/DispatchKeys.hpp
@@ -34,6 +34,11 @@ enum class Threading : uint8_t {
     BEGIN = SingleThread,
 };
 
+/**
+ * @brief Compute dispatch key using threading and memory information.
+ *
+ * @return Dispatch key
+ */
 constexpr uint32_t toDispatchKey(Threading threading,
                                  CPUMemoryModel memory_model) {
     /* Threading is in higher priority */
@@ -43,6 +48,8 @@ constexpr uint32_t toDispatchKey(Threading threading,
 
 /**
  * @brief Choose the best threading based on the current context.
+ *
+ * @return Threading
  */
 inline auto bestThreading() -> Threading {
 #ifdef PL_USE_OMP
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index d232c32ce6..0929528325 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -208,7 +208,7 @@ template <typename PrecisionT> class DynamicDispatcher {
      * @param kernel Kernel to run the gate operation.
      * @param data Pointer to data.
      * @param num_qubits Number of qubits.
-     * @param op_name Gate operation name.
+     * @param gate_op Gate operation.
      * @param wires Wires to apply gate to.
      * @param inverse Indicates whether to use inverse of gate.
      * @param params Optional parameter list for parametric gates.
@@ -292,17 +292,29 @@ template <typename PrecisionT> class DynamicDispatcher {
      * @param wires Wires the gate applies to.
      * @param inverse Indicate whether inverse should be taken.
      */
-    void applyMatrix(Gates::KernelType kernel, CFP_t *data,
-                     Gates::MatrixOperation mat_op, size_t num_qubits,
+    void applyMatrix(Gates::KernelType kernel, CFP_t *data, size_t num_qubits,
                      const std::complex<PrecisionT> *matrix,
                      const std::vector<size_t> &wires, bool inverse) const {
+        using Gates::MatrixOperation;
         assert(num_qubits >= wires.size());
 
-        const auto iter = matrices_.find(std::make_pair(mat_op, kernel));
+        // Derive the operation from the wire count first: the error path below
+        // must name it without dereferencing a possibly-end iterator.
+        const auto mat_op = [n_wires = wires.size()]() {
+            switch (n_wires) {
+            case 1:
+                return MatrixOperation::SingleQubitOp;
+            case 2:
+                return MatrixOperation::TwoQubitOp;
+            default:
+                return MatrixOperation::MultiQubitOp;
+            }
+        }();
+        const auto iter = matrices_.find(std::make_pair(mat_op, kernel));
         if (iter == matrices_.end()) {
             throw std::invalid_argument(
-                std::string(
-                    Util::lookup(Gates::Constant::matrix_names, mat_op)) +
+                std::string(Util::lookup(Gates::Constant::matrix_names,
+                                         mat_op)) +
                 " is not registered for the given kernel");
         }
         (iter->second)(data, num_qubits, matrix, wires, inverse);
@@ -317,8 +329,7 @@ template <typename PrecisionT> class DynamicDispatcher {
      * @param wires Wires the gate applies to.
      * @param inverse Indicate whether inverse should be taken.
      */
-    void applyMatrix(Gates::KernelType kernel, CFP_t *data,
-                     Gates::MatrixOperation mat_op, size_t num_qubits,
+    void applyMatrix(Gates::KernelType kernel, CFP_t *data, size_t num_qubits,
                      const std::complex<PrecisionT> &matrix,
                      const std::vector<size_t> &wires, bool inverse) const {
         if (matrix.size() != Util::exp2(2 * wires.size())) {
@@ -326,8 +337,7 @@ template <typename PrecisionT> class DynamicDispatcher {
                 "The size of matrix does not match with the given "
                 "number of wires");
         }
-        applyMatrix(kernel, data, mat_op, num_qubits, matrix.data(), wires,
-                    inverse);
+        applyMatrix(kernel, data, num_qubits, matrix.data(), wires, inverse);
     }
 
     /**
diff --git a/pennylane_lightning/src/simulator/Measures.hpp b/pennylane_lightning/src/simulator/Measures.hpp
index c158f7ed37..14e44fa4dc 100644
--- a/pennylane_lightning/src/simulator/Measures.hpp
+++ b/pennylane_lightning/src/simulator/Measures.hpp
@@ -77,8 +77,7 @@ class Measures {
     std::vector<fp_t> probs(const std::vector<size_t> &wires) {
         // Determining index that would sort the vector.
         // This information is needed later.
-        const std::vector<size_t> sorted_ind_wires(
-            Util::sorting_indices(wires));
+        const auto sorted_ind_wires = Util::sorting_indices(wires);
         // Sorting wires.
         std::vector<size_t> sorted_wires(wires.size());
         for (size_t pos = 0; pos < wires.size(); pos++) {
diff --git a/pennylane_lightning/src/simulator/StateVectorBase.hpp b/pennylane_lightning/src/simulator/StateVectorBase.hpp
index 1b48512164..2eda32fc5b 100644
--- a/pennylane_lightning/src/simulator/StateVectorBase.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorBase.hpp
@@ -278,76 +278,12 @@ template <class T, class Derived> class StateVectorBase {
                                       const std::vector<size_t> &wires,
                                       bool adj = false) -> PrecisionT {
         auto *arr = getData();
-        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
+        const auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
         return dispatcher.applyGenerator(
             getKernelForGenerator(dispatcher.strToGeneratorOp(opName)), arr,
             num_qubits_, opName, wires, adj);
     }
 
-    /**
-     * @brief Apply a general single qubit matrix to given wires.
-     *
-     * @param kernel Kernel to run the operation
-     * @param matrix Pointer to the array data.
-     * @param wires Wires to apply gate to.
-     * @param inverse Indicate whether inverse should be taken.
-     */
-    inline void applySingleQubitOp(Gates::KernelType kernel,
-                                   const ComplexPrecisionT *matrix,
-                                   const std::vector<size_t> &wires,
-                                   bool inverse = false) {
-        using Gates::MatrixOperation;
-
-        assert(wires.size() == 1);
-
-        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
-        auto *arr = getData();
-        dispatcher.applyMatrix(kernel, arr, MatrixOperation::SingleQubitOp,
-                               num_qubits_, matrix, wires, inverse);
-    }
-
-    /**
-     * @brief Apply a general single qubit matrix to given wires.
-     *
-     * @param kernel Kernel to run the operation
-     * @param matrix Pointer to the array data.
-     * @param wires Wires to apply gate to.
-     * @param inverse Indicate whether inverse should be taken.
-     */
-    inline void applyTwoQubitOp(Gates::KernelType kernel,
-                                const ComplexPrecisionT *matrix,
-                                const std::vector<size_t> &wires,
-                                bool inverse = false) {
-        using Gates::MatrixOperation;
-
-        assert(wires.size() == 2);
-
-        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
-        auto *arr = getData();
-        dispatcher.applyMatrix(kernel, arr, MatrixOperation::TwoQubitOp,
-                               num_qubits_, matrix, wires, inverse);
-    }
-
-    /**
-     * @brief Apply a general multi qubit matrix to given wires.
-     *
-     * @param kernel Kernel to run the operation
-     * @param matrix Pointer to the array data.
-     * @param wires Wires to apply gate to.
-     * @param inverse Indicate whether inverse should be taken.
-     */
-    inline void applyMultiQubitOp(Gates::KernelType kernel,
-                                  const ComplexPrecisionT *matrix,
-                                  const std::vector<size_t> &wires,
-                                  bool inverse = false) {
-        using Gates::MatrixOperation;
-
-        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
-        auto *arr = getData();
-        dispatcher.applyMatrix(kernel, arr, MatrixOperation::MultiQubitOp,
-                               num_qubits_, matrix, wires, inverse);
-    }
-
     /**
      * @brief Apply a given matrix directly to the statevector read directly
      * from numpy data. Data can be in 1D or 2D format.
@@ -363,7 +299,7 @@ template <class T, class Derived> class StateVectorBase {
                             bool inverse = false) {
         using Gates::MatrixOperation;
 
-        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
+        const auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
         auto *arr = getData();
 
         if (wires.empty()) {
@@ -371,21 +307,8 @@ template <class T, class Derived> class StateVectorBase {
                 "Number of wires must be larger than 0");
         }
 
-        switch (wires.size()) {
-        case 1:
-            dispatcher.applyMatrix(kernel, arr, MatrixOperation::SingleQubitOp,
-                                   num_qubits_, matrix, wires, inverse);
-            return;
-        case 2:
-            dispatcher.applyMatrix(kernel, arr, MatrixOperation::TwoQubitOp,
-                                   num_qubits_, matrix, wires, inverse);
-            return;
-        default:
-            dispatcher.applyMatrix(kernel, arr, MatrixOperation::MultiQubitOp,
-                                   num_qubits_, matrix, wires, inverse);
-            return;
-        }
-        PL_UNREACHABLE;
+        dispatcher.applyMatrix(kernel, arr, num_qubits_, matrix, wires,
+                               inverse);
     }
 
     /**
@@ -401,35 +324,22 @@ template <class T, class Derived> class StateVectorBase {
                             bool inverse = false) {
         using Gates::MatrixOperation;
 
-        auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
-        auto *arr = getData();
-
         if (wires.empty()) {
             throw std::invalid_argument(
                 "Number of wires must be larger than 0");
         }
 
-        switch (wires.size()) {
-        case 1:
-            dispatcher.applyMatrix(
-                getKernelForMatrix(MatrixOperation::SingleQubitOp), arr,
-                MatrixOperation::SingleQubitOp, num_qubits_, matrix, wires,
-                inverse);
-            return;
-        case 2:
-            dispatcher.applyMatrix(
-                getKernelForMatrix(MatrixOperation::TwoQubitOp), arr,
-                MatrixOperation::TwoQubitOp, num_qubits_, matrix, wires,
-                inverse);
-            return;
-        default:
-            dispatcher.applyMatrix(
-                getKernelForMatrix(MatrixOperation::MultiQubitOp), arr,
-                MatrixOperation::MultiQubitOp, num_qubits_, matrix, wires,
-                inverse);
-            return;
-        }
-        PL_UNREACHABLE;
+        const auto kernel = [n_wires = wires.size(), this]() {
+            switch (n_wires) {
+            case 1:
+                return getKernelForMatrix(MatrixOperation::SingleQubitOp);
+            case 2:
+                return getKernelForMatrix(MatrixOperation::TwoQubitOp);
+            default:
+                return getKernelForMatrix(MatrixOperation::MultiQubitOp);
+            }
+        }();
+        applyMatrix(kernel, matrix, wires, inverse);
     }
 
     template <typename Alloc>
@@ -663,5 +573,4 @@ inline auto operator<<(std::ostream &out, const StateVectorBase<T, Derived> &sv)
 
     return out;
 }
-
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/StateVectorCPU.hpp b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
index b113ece944..c815f1f9ad 100644
--- a/pennylane_lightning/src/simulator/StateVectorCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorCPU.hpp
@@ -77,28 +77,56 @@ class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
     }
 
   public:
+    /**
+     * @brief Get a kernel for a gate operation.
+     *
+     * @param gate_op Gate operation
+     * @return KernelType
+     */
     [[nodiscard]] inline auto
     getKernelForGate(Gates::GateOperation gate_op) const -> Gates::KernelType {
         return kernel_for_gates_.at(gate_op);
     }
 
+    /**
+     * @brief Get a kernel for a generator operation.
+     *
+     * @param gntr_op Generator operation
+     * @return KernelType
+     */
     [[nodiscard]] inline auto
     getKernelForGenerator(Gates::GeneratorOperation gntr_op) const
         -> Gates::KernelType {
         return kernel_for_generators_.at(gntr_op);
     }
 
+    /**
+     * @brief Get a kernel for a matrix operation.
+     *
+     * @param mat_op Matrix operation
+     * @return KernelType
+     */
     [[nodiscard]] inline auto
     getKernelForMatrix(Gates::MatrixOperation mat_op) const
         -> Gates::KernelType {
         return kernel_for_matrices_.at(mat_op);
     }
 
+    /**
+     * @brief Get memory model of the statevector
+     */
     [[nodiscard]] inline CPUMemoryModel memoryModel() const {
         return memory_model_;
     }
+
+    /**
+     * @brief Get threading of the statevector
+     */
     [[nodiscard]] inline Threading threading() const { return threading_; }
 
+    /**
+     * @brief Get kernels for all gate operations.
+     */
     [[nodiscard]] inline auto getGateKernelMap() const & -> const
         std::unordered_map<Gates::GateOperation, Gates::KernelType> & {
         return kernel_for_gates_;
@@ -109,6 +137,9 @@ class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
         return kernel_for_gates_;
     }
 
+    /**
+     * @brief Get kernels for all generator operations.
+     */
     [[nodiscard]] inline auto getGeneratorKernelMap() const & -> const
         std::unordered_map<Gates::GeneratorOperation, Gates::KernelType> & {
         return kernel_for_generators_;
@@ -119,6 +150,9 @@ class StateVectorCPU : public StateVectorBase<PrecisionT, Derived> {
         return kernel_for_generators_;
     }
 
+    /**
+     * @brief Get kernels for all matrix operations.
+     */
     [[nodiscard]] inline auto getMatrixKernelMap() const & -> const
         std::unordered_map<Gates::MatrixOperation, Gates::KernelType> & {
         return kernel_for_matrices_;
diff --git a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
index be3edd53fc..df626a87a3 100644
--- a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
@@ -40,6 +40,13 @@ class StateVectorManagedCPU
     std::vector<ComplexPrecisionT, AlignedAllocator<ComplexPrecisionT>> data_;
 
   public:
+    /**
+     * @brief Create a new statevector
+     *
+     * @param num_qubits Number of qubits
+     * @param threading Threading option for the statevector to use
+     * @param memory_model Memory model the statevector will use
+     */
     explicit StateVectorManagedCPU(
         size_t num_qubits, Threading threading = bestThreading(),
         CPUMemoryModel memory_model = bestCPUMemoryModel())
@@ -49,6 +56,11 @@ class StateVectorManagedCPU
         data_[0] = {1, 0};
     }
 
+    /**
+     * @brief Construct a statevector from another statevector
+     *
+     * @param other Another statevector to construct the statevector from
+     */
     template <class OtherDerived>
     explicit StateVectorManagedCPU(
         const StateVectorCPU<PrecisionT, OtherDerived> &other)
@@ -57,6 +69,14 @@ class StateVectorManagedCPU
           data_{other.getData(), other.getData() + other.getLength(),
                 getAllocator<ComplexPrecisionT>(this->memory_model_)} {}
 
+    /**
+     * @brief Construct a statevector from data pointer
+     *
+     * @param other_data Data pointer to construct the statevector from.
+     * @param other_size Size of the data
+     * @param threading Threading option for the statevector to use
+     * @param memory_model Memory model the statevector will use
+     */
     StateVectorManagedCPU(const ComplexPrecisionT *other_data,
                           size_t other_size,
                           Threading threading = bestThreading(),
@@ -68,13 +88,19 @@ class StateVectorManagedCPU
                         "The size of provided data must be a power of 2.");
     }
 
-    // Clang-tidy gives false positive for delegating constructor
+    /**
+     * @brief Construct a statevector from a data vector
+     *
+     * @param other Data to construct the statevector from
+     * @param threading Threading option for the statevector to use
+     * @param memory_model Memory model the statevector will use
+     */
     template <class Alloc>
     explicit StateVectorManagedCPU(
-        const std::vector<std::complex<PrecisionT>, Alloc> &rhs,
+        const std::vector<std::complex<PrecisionT>, Alloc> &other,
         Threading threading = bestThreading(),
         CPUMemoryModel memory_model = bestCPUMemoryModel())
-        : StateVectorManagedCPU(rhs.data(), rhs.size(), threading,
+        : StateVectorManagedCPU(other.data(), other.size(), threading,
                                 memory_model) {}
 
     StateVectorManagedCPU(const StateVectorManagedCPU &rhs) = default;
@@ -92,6 +118,9 @@ class StateVectorManagedCPU
         return data_.data();
     }
 
+    /**
+     * @brief Get underlying data vector
+     */
     [[nodiscard]] auto getDataVector()
         -> std::vector<ComplexPrecisionT, AlignedAllocator<ComplexPrecisionT>>
             & {
@@ -108,8 +137,11 @@ class StateVectorManagedCPU
      *
      * @param new_data std::vector contains data.
      */
-    void updateData(const ComplexPrecisionT *data) {
-        std::copy(data, data + BaseType::getLength(), data_.data());
+    template <class Alloc>
+    void updateData(const std::vector<ComplexPrecisionT, Alloc> &new_data) {
+        assert(data_.size() == new_data.size());
+        std::copy(new_data.data(), new_data.data() + new_data.size(),
+                  data_.data());
     }
 };
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp b/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp
index 57c0775774..92de97be20 100644
--- a/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp
@@ -58,8 +58,11 @@ class StateVectorRawCPU
     /**
      * @brief Construct state-vector from a raw data pointer.
      *
+     * Memory model is automatically deduced from a pointer.
+     *
      * @param data Raw data pointer.
      * @param length The size of the data, i.e. 2^(number of qubits).
+     * @param threading Threading option for the statevector to use
      */
     StateVectorRawCPU(ComplexPrecisionT *data, size_t length,
                       Threading threading = bestThreading())
diff --git a/pennylane_lightning/src/tests/Test_Util.cpp b/pennylane_lightning/src/tests/Test_Util.cpp
index adebf08c66..a01117ed98 100644
--- a/pennylane_lightning/src/tests/Test_Util.cpp
+++ b/pennylane_lightning/src/tests/Test_Util.cpp
@@ -575,11 +575,11 @@ TEST_CASE("Utility bit operations", "[Util][BitUtil]") {
         CHECK(Util::bitswap(0B001101, 0, 4) == 0B011100);
     }
 
-    SECTION("fillOnes") {
-        CHECK(Util::fillOnes<uint8_t>(4) == 0B1111);
-        CHECK(Util::fillOnes<uint8_t>(6) == 0B111111);
-        CHECK(Util::fillOnes<uint32_t>(17) == 0B1'1111'1111'1111'1111);
-        CHECK(Util::fillOnes<uint64_t>(54) ==
+    SECTION("fillTrailingOnes") {
+        CHECK(Util::fillTrailingOnes<uint8_t>(4) == 0B1111);
+        CHECK(Util::fillTrailingOnes<uint8_t>(6) == 0B111111);
+        CHECK(Util::fillTrailingOnes<uint32_t>(17) == 0B1'1111'1111'1111'1111);
+        CHECK(Util::fillTrailingOnes<uint64_t>(54) ==
               0x3F'FFFF'FFFF'FFFF); // 54 == 4*13 + 2
     }
 }
diff --git a/pennylane_lightning/src/util/BitUtil.hpp b/pennylane_lightning/src/util/BitUtil.hpp
index 83ffe49995..98ea5e2223 100644
--- a/pennylane_lightning/src/util/BitUtil.hpp
+++ b/pennylane_lightning/src/util/BitUtil.hpp
@@ -172,6 +172,11 @@ inline auto log2PerfectPower(unsigned long val) -> size_t {
 #endif
 ///@}
 
+/**
+ * @brief Compute log2 of a value at compile time.
+ *
+ * @param value Number whose log2 is computed
+ */
 constexpr auto constLog2PerfectPower(size_t value) -> size_t {
     if (value == 0) {
         return 0; // not well defined. TODO: Raise an exception instead in
@@ -185,6 +190,24 @@ constexpr auto constLog2PerfectPower(size_t value) -> size_t {
     return n;
 }
 
+/**
+ * @brief Fill ones from LSB to nbits. Usable at compile time and with any
+ * unsigned integer type.
+ *
+ * @tparam IntegerType Integer type to use
+ * @param nbits Number of bits to fill
+ */
+template <class IntegerType = size_t>
+inline auto constexpr fillTrailingOnes(size_t nbits) -> IntegerType {
+    static_assert(std::is_integral_v<IntegerType> &&
+                  std::is_unsigned_v<IntegerType>);
+
+    return (nbits == 0) ? 0
+                        : static_cast<IntegerType>(~IntegerType(0)) >>
+                              static_cast<IntegerType>(
+                                  CHAR_BIT * sizeof(IntegerType) - nbits);
+}
+
 /**
  * @brief Check if there is a positive integer n such that value == 2^n.
  *
@@ -194,12 +217,6 @@ constexpr auto constLog2PerfectPower(size_t value) -> size_t {
 inline auto isPerfectPowerOf2(size_t value) -> bool {
     return popcount(value) == 1;
 }
-/**
- * @brief Fill ones from LSB to rev_wire
- */
-inline auto constexpr fillTrailingOnes(size_t pos) -> size_t {
-    return (pos == 0) ? 0 : (~size_t(0) >> (CHAR_BIT * sizeof(size_t) - pos));
-}
 /**
  * @brief Fill ones from MSB to pos
  */
@@ -216,12 +233,4 @@ inline auto constexpr bitswap(size_t bits, const size_t i, const size_t j)
     return bits ^ ((x << i) | (x << j));
 }
 
-template <class IntegerType>
-inline auto constexpr fillOnes(size_t nbits) -> IntegerType {
-    static_assert(std::is_integral_v<IntegerType> &&
-                  std::is_unsigned_v<IntegerType>);
-
-    return static_cast<IntegerType>(~IntegerType(0)) >>
-           static_cast<IntegerType>(CHAR_BIT * sizeof(IntegerType) - nbits);
-}
 } // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/IntegerInterval.hpp b/pennylane_lightning/src/util/IntegerInterval.hpp
index 06002ca3b0..565bf9de90 100644
--- a/pennylane_lightning/src/util/IntegerInterval.hpp
+++ b/pennylane_lightning/src/util/IntegerInterval.hpp
@@ -46,35 +46,61 @@ template <typename IntegerType> class IntegerInterval {
     [[nodiscard]] IntegerType max() const { return max_; }
 };
 
+/**
+ * @brief Create integer interval (from, inf)
+ */
 template <typename IntegerType>
 auto larger_than(IntegerType from) -> IntegerInterval<IntegerType> {
     return IntegerInterval<IntegerType>{
         from + 1, std::numeric_limits<IntegerType>::max()};
 }
+/**
+ * @brief Create integer interval [from, inf)
+ */
 template <typename IntegerType>
 auto larger_than_equal_to(IntegerType from) -> IntegerInterval<IntegerType> {
     return IntegerInterval<IntegerType>{
         from, std::numeric_limits<IntegerType>::max()};
 }
+/**
+ * @brief Create integer interval [0, to)
+ */
 template <typename IntegerType>
 auto less_than(IntegerType to) -> IntegerInterval<IntegerType> {
     return IntegerInterval<IntegerType>{0, to};
 }
+/**
+ * @brief Create integer interval [0, to]
+ */
 template <typename IntegerType>
 auto less_than_equal_to(IntegerType to) -> IntegerInterval<IntegerType> {
     return IntegerInterval<IntegerType>{0, to + 1};
 }
+
+/**
+ * @brief Create integer interval [from, to]
+ */
 template <typename IntegerType>
 auto in_between_closed(IntegerType from, IntegerType to)
     -> IntegerInterval<IntegerType> {
     return IntegerInterval<IntegerType>{from, to + 1};
 }
+
+/**
+ * @brief Create integer interval [0, inf)
+ */
 template <typename IntegerType>
 constexpr auto full_domain() -> IntegerInterval<IntegerType> {
     return IntegerInterval<IntegerType>{
         0, std::numeric_limits<IntegerType>::max()};
 }
 
+/**
+ * @brief
+ * @rst
+ * Test if :math:`I_1 \cap I_2 = \emptyset`.
+ * @endrst
+ */
 template <typename IntegerType>
 bool is_disjoint(const IntegerInterval<IntegerType> &interval1,
                  const IntegerInterval<IntegerType> &interval2) {
@@ -82,6 +108,12 @@ bool is_disjoint(const IntegerInterval<IntegerType> &interval1,
            (interval2.max() <= interval1.min());
 }
 
+/**
+ * @brief
+ * @rst
+ * Create :math:`I_1 \cup I_2`
+ * @endrst
+ */
 template <typename IntegerType>
 auto union_interval(const IntegerInterval<IntegerType> &interval1,
                     const IntegerInterval<IntegerType> &interval2)
@@ -90,5 +122,4 @@ auto union_interval(const IntegerInterval<IntegerType> &interval1,
         std::min(interval1.min(), interval2.min()),
         std::max(interval1.max(), interval2.max())};
 }
-
 } // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/Util.hpp b/pennylane_lightning/src/util/Util.hpp
index 2da023e2ea..a906c6fec2 100644
--- a/pennylane_lightning/src/util/Util.hpp
+++ b/pennylane_lightning/src/util/Util.hpp
@@ -260,10 +260,11 @@ auto linspace(T start, T end, size_t num_points) -> std::vector<T> {
  *
  * @tparam T Vector data type.
  * @param arr Array to be inspected.
+ * @param length Size of the array
  * @return a vector with indices that would sort the array.
  */
 template <typename T>
-inline auto sorting_indices(const T &arr, size_t length)
+inline auto sorting_indices(const T *arr, size_t length)
     -> std::vector<size_t> {
     std::vector<size_t> indices(length);
     iota(indices.begin(), indices.end(), 0);
@@ -436,15 +437,27 @@ template <class T, class U, class Func> void for_each_enum(Func &&func) {
     }
 }
 
+/**
+ * @brief Get common alignment of given kernels
+ *
+ * @tparam PrecisionT Floating point type
+ * @tparam TypeList Type list of kernels to calculate common alignment
+ */
 template <class PrecisionT, class TypeList> struct common_alignment {
     constexpr static size_t value =
         std::max(TypeList::Type::template required_alignment<PrecisionT>,
                  common_alignment<PrecisionT, typename TypeList::Next>::value);
 };
+
+/// @cond DEV
 template <class PrecisionT> struct common_alignment<PrecisionT, void> {
     constexpr static size_t value = std::alignment_of_v<PrecisionT>;
 };
+/// @endcond
 
+/**
+ * @brief A value alias for common_alignment
+ */
 template <class PrecisionT, class TypeList>
 [[maybe_unused]] constexpr static size_t common_alignment_v =
     common_alignment<PrecisionT, TypeList>::value;

From cef939ce4af2d0fc269d7144e16000a86569a0c6 Mon Sep 17 00:00:00 2001
From: Dev version update bot <chae-yeun@xanadu.ai>
Date: Wed, 9 Mar 2022 18:13:59 +0000
Subject: [PATCH 47/94] Auto update version

---
 pennylane_lightning/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index fd08943a9b..7b13995518 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.22.0-dev16"
+__version__ = "0.22.0-dev17"

From ac8ef319a941d54cf43eb13f227a16b17d8bde24 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 13:14:41 -0500
Subject: [PATCH 48/94] Trigger CI


From bbd5840b5d024063f3f9578f7063d985ed5a6393 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 13:21:42 -0500
Subject: [PATCH 49/94] Fix namespace

---
 pennylane_lightning/src/tests/Test_Util.cpp    | 1 +
 pennylane_lightning/src/util/LinearAlgebra.hpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pennylane_lightning/src/tests/Test_Util.cpp b/pennylane_lightning/src/tests/Test_Util.cpp
index a01117ed98..8d1cd15018 100644
--- a/pennylane_lightning/src/tests/Test_Util.cpp
+++ b/pennylane_lightning/src/tests/Test_Util.cpp
@@ -14,6 +14,7 @@
 #include "TestHelpers.hpp"
 
 using namespace Pennylane;
+using namespace Pennylane::Util;
 
 /**
  * @brief This tests the compile-time calculation of a given scalar
diff --git a/pennylane_lightning/src/util/LinearAlgebra.hpp b/pennylane_lightning/src/util/LinearAlgebra.hpp
index f6cca5f397..1cf36e7be9 100644
--- a/pennylane_lightning/src/util/LinearAlgebra.hpp
+++ b/pennylane_lightning/src/util/LinearAlgebra.hpp
@@ -50,13 +50,13 @@ using CBLAS_LAYOUT = enum CBLAS_LAYOUT {
 /// @endcond
 //
 
+namespace Pennylane::Util {
 enum class Trans : int {
     NoTranspose = CblasNoTrans,
     Transpose = CblasTrans,
     Adjoint = CblasConjTrans
 };
 
-namespace Pennylane::Util {
 /**
  * @brief Calculates the inner-product using OpenMP.
  *

From 13009dec9d4b2b9ed1c60a7dfe40fd02af286d3f Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 13:35:04 -0500
Subject: [PATCH 50/94] Fix namespace

---
 doc/add_kernel.rst                            |  6 ++++
 .../src/simulator/CPUMemoryModel.hpp          |  4 +--
 .../src/simulator/StateVectorManagedCPU.hpp   | 12 ++++----
 pennylane_lightning/src/tests/TestHelpers.hpp |  5 ++--
 pennylane_lightning/src/util/Memory.hpp       | 30 +++++++++----------
 pennylane_lightning/src/util/Util.hpp         | 26 ----------------
 6 files changed, 33 insertions(+), 50 deletions(-)

diff --git a/doc/add_kernel.rst b/doc/add_kernel.rst
index 3c365fe62f..821f99a64c 100644
--- a/doc/add_kernel.rst
+++ b/doc/add_kernel.rst
@@ -16,6 +16,12 @@ We discuss how one can add another gate implementation in this document. Assume
         constexpr static kernel_id = KernelType::Mykernel; // Will be discussed below
         constexpr static std::string_view = "MyGateImpl"; // Name of your kernel
 
+        template <typename PrecisionT>
+        constexpr static size_t required_alignment =
+            std::alignment_of_v<PrecisionT>;
+        template <typename PrecisionT>
+        constexpr static size_t packed_bytes = sizeof(PrecisionT);
+
         template <class PrecisionT>
         static void applyPauliX(std::complex<PrecisionT>* data,
                                 size_t num_qubits,
diff --git a/pennylane_lightning/src/simulator/CPUMemoryModel.hpp b/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
index 09debc9804..c7bf4f42bc 100644
--- a/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
+++ b/pennylane_lightning/src/simulator/CPUMemoryModel.hpp
@@ -103,7 +103,7 @@ constexpr inline auto getAlignment(CPUMemoryModel memory_model) -> uint32_t {
  */
 template <class T>
 constexpr auto getAllocator(CPUMemoryModel memory_model)
-    -> AlignedAllocator<T> {
-    return AlignedAllocator<T>{getAlignment<T>(memory_model)};
+    -> Util::AlignedAllocator<T> {
+    return Util::AlignedAllocator<T>{getAlignment<T>(memory_model)};
 }
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
index df626a87a3..35e93478e1 100644
--- a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
@@ -37,7 +37,8 @@ class StateVectorManagedCPU
   private:
     using BaseType = StateVectorCPU<PrecisionT, StateVectorManagedCPU>;
 
-    std::vector<ComplexPrecisionT, AlignedAllocator<ComplexPrecisionT>> data_;
+    std::vector<ComplexPrecisionT, Util::AlignedAllocator<ComplexPrecisionT>>
+        data_;
 
   public:
     /**
@@ -122,13 +123,14 @@ class StateVectorManagedCPU
      * @brief Get underlying data vector
      */
     [[nodiscard]] auto getDataVector()
-        -> std::vector<ComplexPrecisionT, AlignedAllocator<ComplexPrecisionT>>
-            & {
+        -> std::vector<ComplexPrecisionT,
+                       Util::AlignedAllocator<ComplexPrecisionT>> & {
         return data_;
     }
 
-    [[nodiscard]] auto getDataVector() const -> const
-        std::vector<ComplexPrecisionT, AlignedAllocator<ComplexPrecisionT>> & {
+    [[nodiscard]] auto getDataVector() const
+        -> const std::vector<ComplexPrecisionT,
+                             Util::AlignedAllocator<ComplexPrecisionT>> & {
         return data_;
     }
 
diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp
index ab383909d4..5078d68428 100644
--- a/pennylane_lightning/src/tests/TestHelpers.hpp
+++ b/pennylane_lightning/src/tests/TestHelpers.hpp
@@ -151,9 +151,10 @@ isApproxEqual(const Data_t &data1, const Data_t &data2,
 
 template <typename T>
 constexpr static auto test_allocator =
-    AlignedAllocator<T>{Util::common_alignment_v<T, TestKernels>};
+    Util::AlignedAllocator<T>{Util::common_alignment_v<T, TestKernels>};
 
-template <typename T> using TestVector = std::vector<T, AlignedAllocator<T>>;
+template <typename T>
+using TestVector = std::vector<T, Util::AlignedAllocator<T>>;
 
 /**
  * @brief Multiplies every value in a dataset by a given complex scalar value.
diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
index e3779e42f7..9d55b20679 100644
--- a/pennylane_lightning/src/util/Memory.hpp
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -22,7 +22,7 @@
 #include "BitUtil.hpp"
 #include "TypeList.hpp"
 
-namespace Pennylane {
+namespace Pennylane::Util {
 /**
  * @brief Custom aligned allocate function. As appleclang does not support
  * std::aligned_alloc in Mac OS 10.14, we use posix_memalign function.
@@ -161,16 +161,6 @@ bool operator!=([[maybe_unused]] const AlignedAllocator<T> &lhs,
 }
 
 ///@cond DEV
-template <typename TypeList> struct commonAlignmentHelper {
-    constexpr static uint32_t value =
-        std::max(TypeList::Type::packed_bytes,
-                 commonAlignmentHelper<typename TypeList::Next>::value);
-};
-template <> struct commonAlignmentHelper<void> {
-    constexpr static uint32_t value = 4U;
-};
-///@endcond
-
 /**
  * @brief This function calculate the common multiplier of alignments of the
  * given kernels in TypeList.
@@ -180,7 +170,17 @@ template <> struct commonAlignmentHelper<void> {
  *
  * @tparam TypeList Type list of kernels.
  */
-template <typename TypeList>
-[[maybe_unused]] constexpr static size_t common_alignment =
-    commonAlignmentHelper<TypeList>::value;
-} // namespace Pennylane
+template <class PrecisionT, class TypeList> struct commonAlignmentHelper {
+    constexpr static size_t value = std::max(
+        TypeList::Type::template required_alignment<PrecisionT>,
+        commonAlignmentHelper<PrecisionT, typename TypeList::Next>::value);
+};
+template <class PrecisionT> struct commonAlignmentHelper<PrecisionT, void> {
+    constexpr static size_t value = 1;
+};
+/// @endcond
+template <class PrecisionT, class TypeList>
+[[maybe_unused]] constexpr static auto common_alignment_v =
+    commonAlignmentHelper<PrecisionT, TypeList>::value;
+
+} // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/Util.hpp b/pennylane_lightning/src/util/Util.hpp
index a906c6fec2..03a3455967 100644
--- a/pennylane_lightning/src/util/Util.hpp
+++ b/pennylane_lightning/src/util/Util.hpp
@@ -436,30 +436,4 @@ template <class T, class U, class Func> void for_each_enum(Func &&func) {
         }
     }
 }
-
-/**
- * @brief Get common alignment of given kernels
- *
- * @tparam PrecisionT Floating point type
- * @tparam TypeList Type list of kernels to calculate common alignment
- */
-template <class PrecisionT, class TypeList> struct common_alignment {
-    constexpr static size_t value =
-        std::max(TypeList::Type::template required_alignment<PrecisionT>,
-                 common_alignment<PrecisionT, typename TypeList::Next>::value);
-};
-
-/// @cond DEV
-template <class PrecisionT> struct common_alignment<PrecisionT, void> {
-    constexpr static size_t value = std::alignment_of_v<PrecisionT>;
-};
-/// @endcond
-
-/**
- * @brief A value alias for common_alignment
- */
-template <class PrecisionT, class TypeList>
-[[maybe_unused]] constexpr static size_t common_alignment_v =
-    common_alignment<PrecisionT, TypeList>::value;
-
 } // namespace Pennylane::Util

From df068a8eaaf4482958fda5311e3eec86d4abef36 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 16:47:37 -0500
Subject: [PATCH 51/94] Fix some docs

---
 doc/add_kernel.rst                            | 57 +++++++++----------
 .../src/simulator/DynamicDispatcher.hpp       | 20 +++++--
 pennylane_lightning/src/util/Memory.hpp       | 20 ++++---
 3 files changed, 55 insertions(+), 42 deletions(-)

diff --git a/doc/add_kernel.rst b/doc/add_kernel.rst
index 821f99a64c..2b8d99cc8e 100644
--- a/doc/add_kernel.rst
+++ b/doc/add_kernel.rst
@@ -13,14 +13,13 @@ We discuss how one can add another gate implementation in this document. Assume
         constexpr static std::array implemented_gates = {
             GateOperation::PauliX
         }; // List of implemented gates
-        constexpr static kernel_id = KernelType::Mykernel; // Will be discussed below
+        constexpr static kernel_id = KernelType::MyKernel; // Will be discussed below
         constexpr static std::string_view = "MyGateImpl"; // Name of your kernel
 
+        /* This defines the required alignment for this kernel. If there is no special requirement, 
+           using std::alignment_of_v is sufficient. */
         template <typename PrecisionT>
-        constexpr static size_t required_alignment =
-            std::alignment_of_v<PrecisionT>;
-        template <typename PrecisionT>
-        constexpr static size_t packed_bytes = sizeof(PrecisionT);
+        constexpr static size_t required_alignment = std::alignment_of_v<PrecisionT>;
 
         template <class PrecisionT>
         static void applyPauliX(std::complex<PrecisionT>* data,
@@ -51,8 +50,9 @@ and
     // file: simulator/AvailableKernels.hpp
     namespace Pennylane {
         using AvailableKernels = Util::TypeList<GateImplementationsLM,
-                                            GateImplementationsPI,
-                                            MyGateImplementation /* This is added*/>;
+                                                GateImplementationsPI,
+                                                MyGateImplementation /* This is added*/,
+                                                void>;
     } // namespace Pennylane
 
 
@@ -68,48 +68,47 @@ Now you can call your kernel functions in C++.
     // call using the dynamic dispatcher
     sv.applyOperation(KernelType::MyKernel, "PauliX", /*wires=*/{0}, /*inverse=*/false);
 
-To export your gate implementation to python, you also need to add your kernel to ``kernels_to_pyexport``:
+Still, note that your gate implementation is not a default implementation for ``PauliX`` gate yet, i.e.,
 
 .. code-block:: cpp
 
-    // file: simulator/KernelType.hpp
-    [[maybe_unused]] constexpr std::array kernels_to_pyexport = {
-        KernelType::PI, KernelType::LM, KernelType::Mykernel /* This is added */
-    };
+    sv.applyOperation("PauliX", {0}, false); // still calls the default implementation
 
-Then you can find ``PauliX_MyKernel`` function in ``lightning_qubit_ops`` Python module.
-
-Still, note that your gate implementation is not a default implementation for ``PauliX`` gate yet, i.e.,
+To make your gate implementation the default, you need to change the registered ``KernelMap``.
+Thus, changing the following lines
 
 .. code-block:: cpp
 
-    sv.applyPauliX({0}, false); // still call the default implementation
-    sv.applyOperation("PauliX", {0}, false) // still call the default implementation
+    // simulator/Kernel.cpp
 
-To make your gate implementation default, you need to change ``default_kernel_for_ops`` constant. Thus changing
+    int assignDefaultKernelsForGateOp() {
+        auto &instance = OperationKernelMap<GateOperation>::getInstance();
 
-.. code-block:: cpp
+        instance.assignKernelForOp(GateOperation::PauliX, all_threading,
+                                   all_memory_model, all_qubit_numbers,
+                                   Gates::KernelType::LM);
 
-    // file: simulator/Constant.hpp
-    constexpr std::array default_kernel_for_gates = {
-        std::pair{GateOperations::PauliX, KernelType::LM},
-        std::pair{GateOperations::PauliY, KernelType::LM},
         ...
     }
 
-to 
+to
 
 .. code-block:: cpp
 
-    constexpr std::array default_kernel_for_gates = {
-        std::pair{GateOperations::PauliX, KernelType::MyKernel},
-        std::pair{GateOperations::PauliY, KernelType::LM},
+    int assignDefaultKernelsForGateOp() {
+        auto &instance = OperationKernelMap<GateOperation>::getInstance();
+
+        instance.assignKernelForOp(GateOperation::PauliX, all_threading,
+                                   all_memory_model, all_qubit_numbers,
+                                   Gates::KernelType::MyKernel);
+
         ...
     }
 
 will make your implementation as default kernel for ``PauliX`` gate (for all C++ calls as well as for the Python binding).
 
-Gate generators can also be handled in the same way.
+Gate generators can also be handled in the same way. Note that it is possible to assign a kernel only for a specific memory model or
+threading option. Check the overloaded functions :cpp:func:`Pennylane::KernelMap::OperationKernelMap::assignKernelForOp` for details.
 
 Test your gate implementation
 =============================
@@ -120,7 +119,7 @@ To test your own kernel implementations, you can go to ``tests/TestKernels.hpp``
 
     using TestKernels = Pennylane::Util::TypeList<Pennylane::Gates::GateImplementationsLM,
                                                   Pennylane::Gates::GateImplementationsPI,
-                                                  MyGateImplementation /*This is added */>;
+                                                  MyGateImplementation /*This is added */, void>;
 
 It will automatically test your gate implementation.
 Note that, in the current implementation, this will test a gate if ``apply + gate name`` is defined even when the gate is not included in ``implemented_gates`` variable.
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index 0929528325..91dc499194 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -130,11 +130,21 @@ template <typename PrecisionT> class DynamicDispatcher {
         return singleton;
     }
 
+    /**
+     * @brief Gate name to gate operation
+     *
+     * @param gate_name Gate name
+     */
     [[nodiscard]] auto strToGateOp(const std::string &gate_name) const
         -> Gates::GateOperation {
         return str_to_gates_.at(gate_name);
     }
 
+    /**
+     * @brief Generator name to generator operation
+     *
+     * @param gntr_name Generator name
+     */
     [[nodiscard]] auto strToGeneratorOp(const std::string &gntr_name) const
         -> Gates::GeneratorOperation {
         return str_to_gntrs_.at(gntr_name);
@@ -286,7 +296,8 @@ template <typename PrecisionT> class DynamicDispatcher {
     /**
      * @brief Apply a given matrix directly to the statevector.
      *
-     * @param arr Pointer to the statevector.
+     * @param kernel Kernel to use for this operation
+     * @param data Pointer to the statevector.
      * @param num_qubits Number of qubits.
      * @param matrix Perfect square matrix in row-major order.
      * @param wires Wires the gate applies to.
@@ -323,14 +334,15 @@ template <typename PrecisionT> class DynamicDispatcher {
     /**
      * @brief Apply a given matrix directly to the statevector.
      *
-     * @param arr Pointer to the statevector.
+     * @param kernel Kernel to use for this operation
+     * @param data Pointer to the statevector.
      * @param num_qubits Number of qubits.
      * @param matrix Perfect square matrix in row-major order.
      * @param wires Wires the gate applies to.
      * @param inverse Indicate whether inverse should be taken.
      */
     void applyMatrix(Gates::KernelType kernel, CFP_t *data, size_t num_qubits,
-                     const std::complex<PrecisionT> &matrix,
+                     const std::vector<std::complex<PrecisionT>> &matrix,
                      const std::vector<size_t> &wires, bool inverse) const {
         if (matrix.size() != Util::exp2(2 * wires.size())) {
             throw std::invalid_argument(
@@ -347,7 +359,7 @@ template <typename PrecisionT> class DynamicDispatcher {
      * @param kernel Kernel to run the gate operation.
      * @param data Pointer to data.
      * @param num_qubits Number of qubits.
-     * @param op_name Gate operation name.
+     * @param gntr_op Generator operation.
      * @param wires Wires to apply gate to.
      * @param adj Indicates whether to use adjoint of gate.
      */
diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
index 9d55b20679..65294b07a0 100644
--- a/pennylane_lightning/src/util/Memory.hpp
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -161,15 +161,6 @@ bool operator!=([[maybe_unused]] const AlignedAllocator<T> &lhs,
 }
 
 ///@cond DEV
-/**
- * @brief This function calculate the common multiplier of alignments of the
- * given kernels in TypeList.
- *
- * As all alignment must be a power of 2, we just can choose the maximum
- * alignment.
- *
- * @tparam TypeList Type list of kernels.
- */
 template <class PrecisionT, class TypeList> struct commonAlignmentHelper {
     constexpr static size_t value = std::max(
         TypeList::Type::template required_alignment<PrecisionT>,
@@ -179,6 +170,17 @@ template <class PrecisionT> struct commonAlignmentHelper<PrecisionT, void> {
     constexpr static size_t value = 1;
 };
 /// @endcond
+
+/**
+ * @brief This variable template holds the common multiple of the alignments
+ * of the given kernels in TypeList.
+ *
+ * As every alignment must be a power of 2, we can simply choose the maximum
+ * alignment.
+ *
+ * @tparam PrecisionT Floating point type
+ * @tparam TypeList Type list of kernels.
+ */
 template <class PrecisionT, class TypeList>
 [[maybe_unused]] constexpr static auto common_alignment_v =
     commonAlignmentHelper<PrecisionT, TypeList>::value;

From a6c629ce5335ad4d32dd74080eab112c5caa4754 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 17:48:18 -0500
Subject: [PATCH 52/94] Fix for tidy

---
 CMakeLists.txt                                 |  1 +
 .../src/tests/CompareVector.hpp                | 18 ++++++++++++++++++
 pennylane_lightning/src/tests/TestHelpers.hpp  | 18 ++----------------
 .../Test_GateImplementations_Nonparam.cpp      |  1 +
 4 files changed, 22 insertions(+), 16 deletions(-)
 create mode 100644 pennylane_lightning/src/tests/CompareVector.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 58b4faab21..89f665e31e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,6 +42,7 @@ option(ENABLE_BLAS "Enable BLAS" OFF)
 option(BUILD_TESTS "Build cpp tests" OFF)
 option(BUILD_EXAMPLES "Build cpp examples" OFF)
 
+
 # Process compile options
 include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/process_options.cmake")
 
diff --git a/pennylane_lightning/src/tests/CompareVector.hpp b/pennylane_lightning/src/tests/CompareVector.hpp
new file mode 100644
index 0000000000..454e71e535
--- /dev/null
+++ b/pennylane_lightning/src/tests/CompareVector.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <cstdlib>
+#include <vector>
+
+/// Element-wise equality for vectors that may use different allocators.
+template <class T, class AllocA, class AllocB>
+bool operator==(const std::vector<T, AllocA> &lhs,
+                const std::vector<T, AllocB> &rhs) {
+    if (lhs.size() != rhs.size()) {
+        return false; // different lengths can never compare equal
+    }
+    size_t pos = 0;
+    while (pos < lhs.size() && !(lhs[pos] != rhs[pos])) {
+        ++pos; // skip every pair that does not compare unequal
+    }
+    return pos == lhs.size(); // reached the end <=> no mismatch found
+}
diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp
index 5078d68428..68464ee8c4 100644
--- a/pennylane_lightning/src/tests/TestHelpers.hpp
+++ b/pennylane_lightning/src/tests/TestHelpers.hpp
@@ -8,6 +8,8 @@
 #include "TestKernels.hpp"
 #include "Util.hpp"
 
+#include <catch2/catch.hpp>
+
 #include <algorithm>
 #include <complex>
 #include <random>
@@ -15,8 +17,6 @@
 #include <type_traits>
 #include <vector>
 
-#include <catch2/catch.hpp>
-
 namespace Pennylane {
 template <typename T> struct remove_complex { using type = T; };
 template <typename T> struct remove_complex<std::complex<T>> {
@@ -97,20 +97,6 @@ bool operator!=(const std::vector<T, AllocA> &lhs,
     return !rhs.compare(lhs);
 }
 
-template <class T, class AllocA, class AllocB>
-bool operator==(const std::vector<T, AllocA> &lhs,
-                const std::vector<T, AllocB> &rhs) {
-    if (lhs.size() != rhs.size()) {
-        return false;
-    }
-    for (size_t idx = 0; idx < lhs.size(); idx++) {
-        if (lhs[idx] != rhs[idx]) {
-            return false;
-        }
-    }
-    return true;
-}
-
 /**
  * @brief Utility function to compare complex statevector data.
  *
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
index 2dde03af2b..a44752e0a3 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
@@ -1,3 +1,4 @@
+#include "CompareVector.hpp"
 #include "TestHelpers.hpp"
 #include "TestKernels.hpp"
 #include "Util.hpp"

From b6569dc5a90d4ae22012812d77d9f24afe4c3500 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 19:26:04 -0500
Subject: [PATCH 53/94] Update Util; Update doc

---
 .github/workflows/format.yml                  |   2 +-
 bin/cpp-files                                 |   7 +-
 bin/utils.py                                  |  15 +-
 doc/_ext/edit_on_github.py                    |  27 ++--
 doc/conf.py                                   |  94 +++++------
 doc/directives.py                             |  28 ++--
 pennylane_lightning/src/util/CMakeLists.txt   |   8 +-
 pennylane_lightning/src/util/ConstantUtil.hpp |  11 +-
 .../src/util/LinearAlgebra.hpp                | 106 ++++++++++---
 pennylane_lightning/src/util/Macros.hpp       | 147 +++++++++++++++++-
 pennylane_lightning/src/util/RuntimeInfo.cpp  |  71 +++++++++
 pennylane_lightning/src/util/RuntimeInfo.hpp  |  52 +++++++
 pennylane_lightning/src/util/TypeList.hpp     |  52 ++++++-
 pennylane_lightning/src/util/Util.hpp         |   3 +-
 tests/test_measures.py                        |  18 ++-
 15 files changed, 524 insertions(+), 117 deletions(-)
 create mode 100644 pennylane_lightning/src/util/RuntimeInfo.cpp
 create mode 100644 pennylane_lightning/src/util/RuntimeInfo.hpp

diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
index 347f111cae..b4a91f94a5 100644
--- a/.github/workflows/format.yml
+++ b/.github/workflows/format.yml
@@ -45,7 +45,7 @@ jobs:
           python-version: 3.7
 
       - name: Install dependencies
-        run: sudo apt update && sudo apt -y install clang-tidy-12 cmake g++
+        run: sudo apt update && sudo apt -y install clang-tidy-12 cmake g++ libomp-12-dev
         env:
           DEBIAN_FRONTEND: noninteractive
 
diff --git a/bin/cpp-files b/bin/cpp-files
index b09cc88cf1..7ccd202783 100755
--- a/bin/cpp-files
+++ b/bin/cpp-files
@@ -14,6 +14,9 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(
         description="Output C/C++ files in json list"
     )
+    parser.add_argument(
+        "--header-only", action='store_true', dest='header_only', help="whether only include header files"
+    )
     parser.add_argument(
         "paths", nargs="+", metavar="DIR", help="paths to the root source directories"
     )
@@ -23,9 +26,9 @@ if __name__ == '__main__':
 
     args = parser.parse_args()
 
-    files = set(get_cpp_files(args.paths))
+    files = set(get_cpp_files(args.paths, header_only = args.header_only))
     if args.exclude_dirs:
-        files_excludes = set(get_cpp_files(args.exclude_dirs))
+        files_excludes = set(get_cpp_files(args.exclude_dirs, header_only = args.header_only))
         files -= files_excludes
 
     json.dump(list(files), sys.stdout)
diff --git a/bin/utils.py b/bin/utils.py
index 90d1693031..6d9dab9420 100644
--- a/bin/utils.py
+++ b/bin/utils.py
@@ -2,13 +2,14 @@
 import re
 import fnmatch
 
-SRCFILE_EXT = ("c", "cc", "cpp", "cxx", "h", "hh", "hpp", "hxx", "cu", "cuh")
+SRCFILE_EXT = ["c", "cc", "cpp", "cxx", "cu"]
+HEADERFILE_EXT = ["h", "hh", "hpp", "hxx", "cuh"]
 
 LIGHTNING_SOURCE_DIR = Path(__file__).resolve().parent.parent
 
 rgx_gitignore_comment = re.compile("#.*$")
 
-def get_cpp_files_from_path(path, ignore_patterns = None, use_gitignore = True):
+def get_cpp_files_from_path(path, ignore_patterns = None, use_gitignore = True, header_only = False):
     """return set of C++ source files from a path
 
     Args:
@@ -18,7 +19,11 @@ def get_cpp_files_from_path(path, ignore_patterns = None, use_gitignore = True):
     """
     path = Path(path)
     files_rel = set() # file paths relative to path
-    for ext in SRCFILE_EXT:
+
+    exts = list(HEADERFILE_EXT)  # copy so "+=" cannot mutate the module constant
+    if not header_only:
+        exts += SRCFILE_EXT
+    for ext in exts:
         for file_path in path.rglob(f"*.{ext}"):
             files_rel.add(file_path.relative_to(path))
 
@@ -46,7 +51,7 @@ def get_cpp_files_from_path(path, ignore_patterns = None, use_gitignore = True):
 
     return set(str(path.joinpath(f)) for f in files_rel)
     
-def get_cpp_files(paths, ignore_patterns = None, use_gitignore = True):
+def get_cpp_files(paths, ignore_patterns = None, use_gitignore = True, header_only = False):
     """return list of C++ source files from paths.
 
     Args:
@@ -56,5 +61,5 @@ def get_cpp_files(paths, ignore_patterns = None, use_gitignore = True):
     """
     files = set()
     for path in paths:
-        files |= get_cpp_files_from_path(path, ignore_patterns, use_gitignore)
+        files |= get_cpp_files_from_path(path, ignore_patterns, use_gitignore, header_only)
     return list(files)
diff --git a/doc/_ext/edit_on_github.py b/doc/_ext/edit_on_github.py
index b69348d97d..954ed00ab4 100644
--- a/doc/_ext/edit_on_github.py
+++ b/doc/_ext/edit_on_github.py
@@ -8,19 +8,20 @@
 import warnings
 
 
-__licence__ = 'BSD (3 clause)'
+__licence__ = "BSD (3 clause)"
 
 
 def get_github_url(app, view, path):
-    return 'https://github.com/{project}/{view}/{branch}/{path}'.format(
+    return "https://github.com/{project}/{view}/{branch}/{path}".format(
         project=app.config.edit_on_github_project,
         view=view,
         branch=app.config.edit_on_github_branch,
-        path=path)
+        path=path,
+    )
 
 
 def html_page_context(app, pagename, templatename, context, doctree):
-    if templatename != 'page.html':
+    if templatename != "page.html":
         return
 
     if not app.config.edit_on_github_project:
@@ -29,16 +30,16 @@ def html_page_context(app, pagename, templatename, context, doctree):
 
     if not doctree:
         return
-    
-    path = os.path.relpath(doctree.get('source'), app.builder.srcdir)
-    show_url = get_github_url(app, 'blob', path)
-    edit_url = get_github_url(app, 'edit', path)
 
-    context['show_on_github_url'] = show_url
-    context['edit_on_github_url'] = edit_url
+    path = os.path.relpath(doctree.get("source"), app.builder.srcdir)
+    show_url = get_github_url(app, "blob", path)
+    edit_url = get_github_url(app, "edit", path)
+
+    context["show_on_github_url"] = show_url
+    context["edit_on_github_url"] = edit_url
 
 
 def setup(app):
-    app.add_config_value('edit_on_github_project', '', True)
-    app.add_config_value('edit_on_github_branch', 'master', True)
-    app.connect('html-page-context', html_page_context)
\ No newline at end of file
+    app.add_config_value("edit_on_github_project", "", True)
+    app.add_config_value("edit_on_github_branch", "master", True)
+    app.connect("html-page-context", html_page_context)
diff --git a/doc/conf.py b/doc/conf.py
index 770f5434dc..37a6be4452 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -20,46 +20,50 @@
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-sys.path.insert(0, os.path.abspath(''))
-sys.path.insert(0, os.path.abspath('_ext'))
-sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath('doc')), 'doc'))
+sys.path.insert(0, os.path.abspath(""))
+sys.path.insert(0, os.path.abspath("_ext"))
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath("doc")), "doc"))
 
 
 # For obtaining all relevant C++ source files
-currdir = Path(__file__).resolve().parent # PROJECT_SOURCE_DIR/docs
+currdir = Path(__file__).resolve().parent  # PROJECT_SOURCE_DIR/docs
 PROJECT_SOURCE_DIR = currdir.parent
-CPP_SOURCE_DIR = PROJECT_SOURCE_DIR.joinpath('pennylane_lightning/src')
-CPP_EXCLUDE_DIRS = ['examples', 'tests'] # relative to CPP_SOURCE_DIR
+CPP_SOURCE_DIR = PROJECT_SOURCE_DIR.joinpath("pennylane_lightning/src")
+CPP_EXCLUDE_DIRS = ["examples", "tests"]  # relative to CPP_SOURCE_DIR
+
 
 def obtain_cpp_files():
-    script_path = PROJECT_SOURCE_DIR.joinpath('bin/cpp-files')
+    script_path = PROJECT_SOURCE_DIR.joinpath("bin/cpp-files")
 
     if not script_path.exists():
-        print('The project directory structure is corrupted.')
+        print("The project directory structure is corrupted.")
         sys.exit(1)
 
     exclude_dirs = [CPP_SOURCE_DIR.joinpath(exclude_dir) for exclude_dir in CPP_EXCLUDE_DIRS]
 
-    p = subprocess.run([str(script_path), CPP_SOURCE_DIR, '--exclude-dirs', *exclude_dirs], capture_output = True)
+    p = subprocess.run(
+        [str(script_path), "--header-only", CPP_SOURCE_DIR, "--exclude-dirs", *exclude_dirs],
+        capture_output=True,
+    )
     file_list = json.loads(p.stdout)
 
-    file_list = ['../' + str(Path(f).relative_to(PROJECT_SOURCE_DIR)) for f in file_list]
+    file_list = ["../" + str(Path(f).relative_to(PROJECT_SOURCE_DIR)) for f in file_list]
     return file_list
 
+
 CPP_FILES = obtain_cpp_files()
 print(CPP_FILES)
 
 
-
 class Mock(MagicMock):
-    __name__ = 'foo'
+    __name__ = "foo"
 
     @classmethod
     def __getattr__(cls, name):
         return MagicMock()
 
 
-MOCK_MODULES = ['pennylane_lightning.lightning_qubit_ops']
+MOCK_MODULES = ["pennylane_lightning.lightning_qubit_ops"]
 
 mock = Mock()
 for mod_name in MOCK_MODULES:
@@ -68,7 +72,7 @@ def __getattr__(cls, name):
 # -- General configuration ------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
-needs_sphinx = '1.6'
+needs_sphinx = "1.6"
 
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
@@ -85,9 +89,9 @@ def __getattr__(cls, name):
     "sphinx.ext.mathjax",
     "sphinx.ext.napoleon",
     "sphinx.ext.todo",
-    'sphinx.ext.viewcode',
+    "sphinx.ext.viewcode",
     "sphinx_automodapi.automodapi",
-    'sphinx_automodapi.smart_resolver'
+    "sphinx_automodapi.smart_resolver",
 ]
 
 intersphinx_mapping = {"https://pennylane.readthedocs.io/en/stable/": None}
@@ -114,10 +118,7 @@ def __getattr__(cls, name):
     # TIP: if using the sphinx-bootstrap-theme, you need
     # "treeViewIsBootstrap": True,
     "exhaleExecutesDoxygen": True,
-    "exhaleDoxygenStdin": (
-        "INPUT = " + ' '.join(CPP_FILES) + ' '
-        "EXCLUDE_SYMBOLS = std::* "
-    ),
+    "exhaleDoxygenStdin": ("INPUT = " + " ".join(CPP_FILES) + " " "EXCLUDE_SYMBOLS = std::* "),
     "afterTitleDescription": inspect.cleandoc(
         """
         The Pennylane Lightning C++ API is intended to be called from Python through Pybind11. Direct use of the C++ API is currently unsupported and is provided for reference only.
@@ -126,21 +127,21 @@ def __getattr__(cls, name):
 }
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates', 'xanadu_theme']
+templates_path = ["_templates", "xanadu_theme"]
 
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
 #
 # source_suffix = ['.rst', '.md']
-source_suffix = '.rst'
+source_suffix = ".rst"
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # General information about the project.
-project = 'PennyLane-Lightning'
+project = "PennyLane-Lightning"
 copyright = "Copyright 2021"
-author = 'Xanadu Inc.'
+author = "Xanadu Inc."
 
 add_module_names = False
 
@@ -149,11 +150,12 @@ def __getattr__(cls, name):
 # built documents.
 
 import pennylane_lightning
+
 # The full version, including alpha/beta/rc tags.
 release = pennylane_lightning.__version__
 
 # The short X.Y version.
-version = re.match(r'^(\d+\.\d+)', release).expand(r'\1')
+version = re.match(r"^(\d+\.\d+)", release).expand(r"\1")
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -163,19 +165,19 @@ def __getattr__(cls, name):
 language = None
 
 # today_fmt is used as the format for a strftime call.
-today_fmt = '%Y-%m-%d'
+today_fmt = "%Y-%m-%d"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This patterns also effect to html_static_path and html_extra_path
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
 # If true, sectionauthor and moduleauthor directives will be shown in the
 # output. They are ignored by default.
 show_authors = True
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = True
@@ -186,12 +188,12 @@ def __getattr__(cls, name):
 # The name of an image file (relative to this directory) to use as a favicon of
 # the docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
 # pixels large.
-html_favicon = '_static/favicon.ico'
+html_favicon = "_static/favicon.ico"
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # Custom sidebar templates, must be a dictionary that maps document names
 # to template names.
@@ -199,26 +201,24 @@ def __getattr__(cls, name):
 # This is required for the alabaster theme
 # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
 html_sidebars = {
-    '**' : [
-        'logo-text.html',
-        'searchbox.html',
-        'globaltoc.html',
+    "**": [
+        "logo-text.html",
+        "searchbox.html",
+        "globaltoc.html",
     ]
 }
 
 
 # -- Xanadu theme ---------------------------------------------------------
-html_theme = 'xanadu_theme'
-html_theme_path = ['.']
+html_theme = "xanadu_theme"
+html_theme_path = ["."]
 
 # xanadu theme options (see theme.conf for more information)
 html_theme_options = {
     # Set the name of the project to appear in the left sidebar.
     "project_nav_name": "PennyLane-Lightning",
-
     # Path to a touch icon
     "touch_icon": "logo_new.png",
-
     "large_toc": True,
     "navigation_button": "#19b37b",
     "navigation_button_hover": "#0e714d",
@@ -229,22 +229,22 @@ def __getattr__(cls, name):
     "download_button": "#19b37b",
 }
 
-edit_on_github_project = 'XanaduAI/pennylane-lightning'
-edit_on_github_branch = 'master/doc'
+edit_on_github_project = "XanaduAI/pennylane-lightning"
+edit_on_github_branch = "master/doc"
 
-#============================================================
+# ============================================================
 
 # the order in which autodoc lists the documented members
-autodoc_member_order = 'bysource'
+autodoc_member_order = "bysource"
 
 # inheritance_diagram graphviz attributes
-inheritance_node_attrs = dict(color='lightskyblue1', style='filled')
+inheritance_node_attrs = dict(color="lightskyblue1", style="filled")
 
-#autodoc_default_flags = ['members']
+# autodoc_default_flags = ['members']
 autosummary_generate = True
 
 from directives import CustomDeviceGalleryItemDirective
 
-def setup(app):
-    app.add_directive('devicegalleryitem', CustomDeviceGalleryItemDirective)
 
+def setup(app):
+    app.add_directive("devicegalleryitem", CustomDeviceGalleryItemDirective)
diff --git a/doc/directives.py b/doc/directives.py
index 953c5d38ba..3dfe1cc5d7 100644
--- a/doc/directives.py
+++ b/doc/directives.py
@@ -49,25 +49,27 @@ class CustomDeviceGalleryItemDirective(Directive):
     required_arguments = 0
     optional_arguments = 4
     final_argument_whitespace = True
-    option_spec = {'name': directives.unchanged,
-                   'description': directives.unchanged,
-                   'link': directives.unchanged}
+    option_spec = {
+        "name": directives.unchanged,
+        "description": directives.unchanged,
+        "link": directives.unchanged,
+    }
 
     has_content = False
     add_index = False
 
     def run(self):
         try:
-            if 'name' in self.options:
-                name = self.options['name']
+            if "name" in self.options:
+                name = self.options["name"]
 
-            if 'description' in self.options:
-                description = self.options['description']
+            if "description" in self.options:
+                description = self.options["description"]
             else:
-                raise ValueError('description not found')
+                raise ValueError("description not found")
 
-            if 'link' in self.options:
-                link = self.options['link']
+            if "link" in self.options:
+                link = self.options["link"]
             else:
                 link = "code/qml_templates"
 
@@ -79,10 +81,8 @@ def run(self):
             raise
             return []
 
-        thumbnail_rst = GALLERY_TEMPLATE.format(name=name,
-                                                description=description,
-                                                link=link)
-        thumbnail = StringList(thumbnail_rst.split('\n'))
+        thumbnail_rst = GALLERY_TEMPLATE.format(name=name, description=description, link=link)
+        thumbnail = StringList(thumbnail_rst.split("\n"))
         thumb = nodes.paragraph()
         self.state.nested_parse(thumbnail, self.content_offset, thumb)
         return [thumb]
diff --git a/pennylane_lightning/src/util/CMakeLists.txt b/pennylane_lightning/src/util/CMakeLists.txt
index 20e75282f5..36b51f00e6 100644
--- a/pennylane_lightning/src/util/CMakeLists.txt
+++ b/pennylane_lightning/src/util/CMakeLists.txt
@@ -1,7 +1,11 @@
 project(lightning_utils LANGUAGES CXX)
 set(CMAKE_CXX_STANDARD 17)
 
-add_library(lightning_utils INTERFACE)
+set(UTIL_FILES RuntimeInfo.cpp CACHE INTERNAL "" FORCE)
+
+add_library(lightning_utils STATIC ${UTIL_FILES})
 target_include_directories(lightning_utils INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
                                                      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/DS>
-)
\ No newline at end of file
+)
+
+set_property(TARGET lightning_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
diff --git a/pennylane_lightning/src/util/ConstantUtil.hpp b/pennylane_lightning/src/util/ConstantUtil.hpp
index 532b49ee01..d3995e7642 100644
--- a/pennylane_lightning/src/util/ConstantUtil.hpp
+++ b/pennylane_lightning/src/util/ConstantUtil.hpp
@@ -42,7 +42,7 @@ constexpr auto lookup(const std::array<std::pair<Key, Value>, size> &arr,
         }
     }
     throw std::range_error("The given key does not exist.");
-};
+}
 
 /**
  * @brief Check an array has an element.
@@ -61,7 +61,7 @@ constexpr auto array_has_elt(const std::array<U, size> &arr, const U &elt)
         }
     }
     return false;
-};
+}
 
 /**
  * @brief Extract first elements from the array of pairs.
@@ -207,4 +207,11 @@ constexpr auto reverse_pairs(const std::array<std::pair<T, U>, size> &arr)
     return Internal::reverse_pairs_helper(arr,
                                           std::make_index_sequence<size>{});
 }
+
+/// @brief Whether `value` is a power of 2 (2^k, k >= 0); 0 yields false.
+constexpr auto constIsPerfectPowerOf2(size_t value) -> bool {
+    // A power of 2 has exactly one set bit, so clearing the lowest set bit
+    // must leave zero. Unlike a shift loop, this terminates for value == 0.
+    return value != 0 && (value & (value - 1U)) == 0;
+}
 } // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/LinearAlgebra.hpp b/pennylane_lightning/src/util/LinearAlgebra.hpp
index 40ea4292a8..1cf36e7be9 100644
--- a/pennylane_lightning/src/util/LinearAlgebra.hpp
+++ b/pennylane_lightning/src/util/LinearAlgebra.hpp
@@ -17,9 +17,11 @@
  */
 #pragma once
 
+#include <algorithm>
 #include <complex>
 #include <cstdlib>
 #include <numeric>
+#include <random>
 #include <vector>
 
 #include "Util.hpp"
@@ -48,13 +50,13 @@ using CBLAS_LAYOUT = enum CBLAS_LAYOUT {
 /// @endcond
 //
 
+namespace Pennylane::Util {
 enum class Trans : int {
     NoTranspose = CblasNoTrans,
     Transpose = CblasTrans,
     Adjoint = CblasConjTrans
 };
 
-namespace Pennylane::Util {
 /**
  * @brief Calculates the inner-product using OpenMP.
  *
@@ -210,9 +212,9 @@ inline auto innerProdC(const std::complex<T> *v1, const std::complex<T> *v2,
  * @see innerProd(const std::complex<T> *v1, const std::complex<T> *v2,
  * const size_t data_size)
  */
-template <class T>
-inline auto innerProd(const std::vector<std::complex<T>> &v1,
-                      const std::vector<std::complex<T>> &v2)
+template <class T, class AllocA, class AllocB>
+inline auto innerProd(const std::vector<std::complex<T>, AllocA> &v1,
+                      const std::vector<std::complex<T>, AllocB> &v2)
     -> std::complex<T> {
     return innerProd(v1.data(), v2.data(), v1.size());
 }
@@ -224,9 +226,9 @@ inline auto innerProd(const std::vector<std::complex<T>> &v1,
  * @see innerProdC(const std::complex<T> *v1, const std::complex<T> *v2,
  * const size_t data_size)
  */
-template <class T>
-inline auto innerProdC(const std::vector<std::complex<T>> &v1,
-                       const std::vector<std::complex<T>> &v2)
+template <class T, class AllocA, class AllocB>
+inline auto innerProdC(const std::vector<std::complex<T>, AllocA> &v1,
+                       const std::vector<std::complex<T>, AllocB> &v2)
     -> std::complex<T> {
     return innerProdC(v1.data(), v2.data(), v1.size());
 }
@@ -461,15 +463,15 @@ inline static void CFTranspose(const std::complex<T> *mat,
  * @param n Number of columns of `mat`.
  * @return mat transpose of shape n * m.
  */
-template <class T>
-inline auto Transpose(const std::vector<std::complex<T>> &mat, size_t m,
-                      size_t n) -> std::vector<std::complex<T>> {
+template <class T, class Alloc>
+inline auto Transpose(const std::vector<std::complex<T>, Alloc> &mat, size_t m,
+                      size_t n) -> std::vector<std::complex<T>, Alloc> {
     if (mat.size() != m * n) {
         throw std::invalid_argument(
             "Invalid number of rows and columns for the input matrix");
     }
 
-    std::vector<std::complex<T>> mat_t(n * m);
+    std::vector<std::complex<T>, Alloc> mat_t(n * m, mat.get_allocator());
     CFTranspose(mat.data(), mat_t.data(), m, n, 0, m, 0, n);
     return mat_t;
 }
@@ -484,15 +486,15 @@ inline auto Transpose(const std::vector<std::complex<T>> &mat, size_t m,
  * @param n Number of columns of `mat`.
  * @return mat transpose of shape n * m.
  */
-template <class T>
-inline auto Transpose(const std::vector<T> &mat, size_t m, size_t n)
-    -> std::vector<T> {
+template <class T, class Alloc>
+inline auto Transpose(const std::vector<T, Alloc> &mat, size_t m, size_t n)
+    -> std::vector<T, Alloc> {
     if (mat.size() != m * n) {
         throw std::invalid_argument(
             "Invalid number of rows and columns for the input matrix");
     }
 
-    std::vector<T> mat_t(n * m);
+    std::vector<T, Alloc> mat_t(n * m, mat.get_allocator());
     CFTranspose(mat.data(), mat_t.data(), m, n, 0, m, 0, n);
     return mat_t;
 }
@@ -548,9 +550,10 @@ inline void vecMatrixProd(const T *v_in, const T *mat, T *v_out, size_t m,
  * @see inline void vecMatrixProd(const T *v_in,
  * const T *mat, T *v_out, size_t m, size_t n)
  */
-template <class T>
-inline auto vecMatrixProd(const std::vector<T> &v_in, const std::vector<T> &mat,
-                          size_t m, size_t n) -> std::vector<T> {
+template <class T, class Alloc>
+inline auto vecMatrixProd(const std::vector<T, Alloc> &v_in,
+                          const std::vector<T, Alloc> &mat, size_t m, size_t n)
+    -> std::vector<T, Alloc> {
     if (v_in.size() != m) {
         throw std::invalid_argument("Invalid size for the input vector");
     }
@@ -559,7 +562,7 @@ inline auto vecMatrixProd(const std::vector<T> &v_in, const std::vector<T> &mat,
             "Invalid number of rows and columns for the input matrix");
     }
 
-    std::vector<T> v_out(n);
+    std::vector<T, Alloc> v_out(n, mat.get_allocator());
     vecMatrixProd(v_in.data(), mat.data(), v_out.data(), m, n);
 
     return v_out;
@@ -745,4 +748,69 @@ inline auto matrixMatProd(const std::vector<std::complex<T>> m_left,
 
     return m_out;
 }
+
+/**
+ * @brief Calculate the squared norm of a vector
+ */
+template <typename PrecisionT>
+auto squaredNorm(const std::complex<PrecisionT> *data, size_t data_size)
+    -> PrecisionT {
+    return std::transform_reduce(
+        data, data + data_size, PrecisionT{}, std::plus<PrecisionT>(),
+        static_cast<PrecisionT (*)(const std::complex<PrecisionT> &)>(
+            &std::norm<PrecisionT>));
+}
+
+/**
+ * @brief Generate random unitary matrix
+ *
+ * @return Generated unitary matrix in row-major format
+ */
+template <typename PrecisionT, class RandomEngine>
+auto randomUnitary(RandomEngine &re, size_t num_qubits)
+    -> std::vector<std::complex<PrecisionT>> {
+    using ComplexPrecisionT = std::complex<PrecisionT>;
+    const size_t dim = (1U << num_qubits);
+    std::vector<ComplexPrecisionT> res(dim * dim, ComplexPrecisionT{});
+
+    std::normal_distribution<PrecisionT> dist;
+
+    auto generator = [&dist, &re]() -> ComplexPrecisionT {
+        return ComplexPrecisionT{dist(re), dist(re)};
+    };
+
+    std::generate(res.begin(), res.end(), generator);
+
+    // Simple algorithm to make rows orthogonal with Gram-Schmidt
+    // This algorithm is unstable but works for a small matrix.
+    // Use QR decomposition when we have LAPACK support.
+
+    for (size_t row2 = 0; row2 < dim; row2++) {
+        ComplexPrecisionT *row2_p = res.data() + row2 * dim;
+        for (size_t row1 = 0; row1 < row2; row1++) {
+            const ComplexPrecisionT *row1_p = res.data() + row1 * dim;
+            ComplexPrecisionT dot12 = Util::innerProdC(row1_p, row2_p, dim);
+            ComplexPrecisionT dot11 = squaredNorm(row1_p, dim);
+
+            // orthogonalize row2
+            std::transform(
+                row2_p, row2_p + dim, row1_p, row2_p,
+                [scale = dot12 / dot11](auto &elt2, const auto &elt1) {
+                    return elt2 - scale * elt1;
+                });
+        }
+    }
+
+    // Normalize each row
+    for (size_t row = 0; row < dim; row++) {
+        ComplexPrecisionT *row_p = res.data() + row * dim;
+        PrecisionT norm2 = std::sqrt(squaredNorm(row_p, dim));
+
+        // normalize the current row
+        std::transform(row_p, row_p + dim, row_p, [norm2](const auto c) {
+            return (static_cast<PrecisionT>(1.0) / norm2) * c;
+        });
+    }
+    return res;
+}
 } // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/Macros.hpp b/pennylane_lightning/src/util/Macros.hpp
index 1b60d1e076..a8cb8c1d7d 100644
--- a/pennylane_lightning/src/util/Macros.hpp
+++ b/pennylane_lightning/src/util/Macros.hpp
@@ -13,12 +13,155 @@
 // limitations under the License.
 /**
  * @file
- * Define some builtin alternatives
+ * Define macros and compile-time constants.
  */
 #pragma once
 
+#include <string>
+
+/**
+ * @brief Convert the expansion of a (predefined) macro to a string literal.
+ * Use std::format instead in C++20.
+ */
+#define PL_TO_STR_INDIR(x) #x
+#define PL_TO_STR(VAR) PL_TO_STR_INDIR(VAR)
+
 #if defined(__GNUC__) || defined(__clang__)
 #define PL_UNREACHABLE __builtin_unreachable()
-#else
+#elif defined(_MSC_VER)
 #define PL_UNREACHABLE __assume(false)
+#else // Unsupported compiler
+#define PL_UNREACHABLE
+#endif
+
+#if defined(__AVX2__)
+#define PL_USE_AVX2 1
+[[maybe_unused]] static constexpr bool use_avx2 = true;
+#else
+[[maybe_unused]] static constexpr bool use_avx2 = false;
+#endif
+
+#if defined(__AVX512F__)
+#define PL_USE_AVX512F 1
+[[maybe_unused]] static constexpr bool use_avx512f = true;
+#else
+[[maybe_unused]] static constexpr bool use_avx512f = false;
+#endif
+
+#if defined(__AVX512DQ__)
+#define PL_USE_AVX512DQ 1
+[[maybe_unused]] static constexpr bool use_avx512dq = true;
+#else
+[[maybe_unused]] static constexpr bool use_avx512dq = false;
+#endif
+
+#if defined(__AVX512VL__)
+#define PL_USE_AVX512VL 1
+[[maybe_unused]] static constexpr bool use_avx512vl = true;
+#else
+[[maybe_unused]] static constexpr bool use_avx512vl = false;
+#endif
+
+#if defined(_OPENMP)
+#define PL_USE_OMP 1
+[[maybe_unused]] static constexpr bool use_openmp = true;
+#else
+[[maybe_unused]] static constexpr bool use_openmp = false;
+#endif
+
+#if defined(_OPENMP) && (_OPENMP >= 202011) // OpenMP 5.1 unroll construct
+#define PL_UNROLL_LOOP _Pragma("omp unroll partial(8)")
+#elif defined(__clang__) // checked before __GNUC__: Clang defines both
+#define PL_UNROLL_LOOP _Pragma("unroll(8)")
+#elif defined(__GNUC__)
+#define PL_UNROLL_LOOP _Pragma("GCC unroll 8")
+#else // Unknown compiler: expand to nothing
+#define PL_UNROLL_LOOP
+#endif
+
+// Define force inline; active only when NDEBUG is defined (same as assert)
+#if defined(__GNUC__) || defined(__clang__)
+#ifdef NDEBUG
+#define PL_FORCE_INLINE __attribute__((always_inline)) inline
+#else
+#define PL_FORCE_INLINE
+#endif
+#elif defined(_MSC_VER)
+#ifdef NDEBUG
+#define PL_FORCE_INLINE __forceinline
+#else
+#define PL_FORCE_INLINE
+#endif
+#else
+#ifdef NDEBUG
+#define PL_FORCE_INLINE inline
+#else
+#define PL_FORCE_INLINE
+#endif
+#endif
+
+namespace Pennylane::Util::Constant {
+enum class CPUArch { AMD64, PPC64, ARM, Unknown };
+
+constexpr auto getCPUArchClangGCC() {
+#if defined(__x86_64__)
+    return CPUArch::AMD64;
+#elif defined(__powerpc64__)
+    return CPUArch::PPC64;
+#elif defined(__arm__) || defined(__aarch64__) // 32-bit and 64-bit ARM
+    return CPUArch::ARM;
+#else
+    return CPUArch::Unknown;
+#endif
+}
+
+constexpr auto getCPUArchMSVC() {
+#if defined(_M_AMD64)
+    return CPUArch::AMD64;
+#elif defined(_M_PPC)
+    return CPUArch::PPC64;
+#elif defined(_M_ARM) || defined(_M_ARM64) // 32-bit and 64-bit ARM
+    return CPUArch::ARM;
+#else
+    return CPUArch::Unknown;
+#endif
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+[[maybe_unused]] constexpr static auto cpu_arch = getCPUArchClangGCC();
+#elif defined(_MSC_VER)
+[[maybe_unused]] constexpr static auto cpu_arch = getCPUArchMSVC();
+#else
+[[maybe_unused]] constexpr static auto cpu_arch = CPUArch::Unknown;
+#endif
+
+enum class Compiler { GCC, Clang, MSVC, Unknown };
+
+template <Compiler compiler>
+constexpr auto getCompilerVersion() -> std::string_view {
+    return "Unknown version";
+}
+template <>
+constexpr auto getCompilerVersion<Compiler::GCC>() -> std::string_view {
+    return PL_TO_STR(__GNUC__) "." PL_TO_STR(__GNUC_MINOR__) "." PL_TO_STR(
+        __GNUC_PATCHLEVEL__);
+}
+template <>
+constexpr auto getCompilerVersion<Compiler::Clang>() -> std::string_view {
+    return PL_TO_STR(__clang_major__) "." PL_TO_STR(
+        __clang_minor__) "." PL_TO_STR(__clang_patchlevel__);
+}
+template <>
+constexpr auto getCompilerVersion<Compiler::MSVC>() -> std::string_view {
+    return PL_TO_STR(_MSC_FULL_VER);
+}
+#if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+[[maybe_unused]] constexpr static auto compiler = Compiler::GCC;
+#elif defined(__clang__)
+[[maybe_unused]] constexpr static auto compiler = Compiler::Clang;
+#elif defined(_MSC_VER)
+[[maybe_unused]] constexpr static auto compiler = Compiler::MSVC;
+#else
+[[maybe_unused]] constexpr static auto compiler = Compiler::Unknown;
 #endif
+} // namespace Pennylane::Util::Constant
diff --git a/pennylane_lightning/src/util/RuntimeInfo.cpp b/pennylane_lightning/src/util/RuntimeInfo.cpp
new file mode 100644
index 0000000000..5a208cb540
--- /dev/null
+++ b/pennylane_lightning/src/util/RuntimeInfo.cpp
@@ -0,0 +1,71 @@
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "RuntimeInfo.hpp"
+
+#include <array>
+
+#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
+#include <cpuid.h>
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#endif
+
+namespace Pennylane::Util {
+#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
+RuntimeInfo::InternalRuntimeInfo::InternalRuntimeInfo() {
+    const auto nids = __get_cpuid_max(0x00, nullptr);
+    if (nids == 0) {
+        return; // cpuid is not supported
+    }
+
+    unsigned int eax = 0;
+    unsigned int ebx = 0;
+    unsigned int ecx = 0;
+    unsigned int edx = 0;
+    if (nids >= 1) {
+        eax = 1;
+        __get_cpuid(1, &eax, &ebx, &ecx, &edx);
+        f_1_ecx = ecx;
+        f_1_edx = edx;
+    }
+    if (nids >= 7) { // NOLINT(readability-magic-numbers)
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
+        f_7_ebx = ebx;
+        f_7_ecx = ecx;
+    }
+}
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+RuntimeInfo::InternalRuntimeInfo::InternalRuntimeInfo() {
+    std::array<int, 4> cpui;
+    __cpuid(cpui.data(), 0);
+
+    int nids = cpui[0];
+
+    if (nids >= 1) {
+        __cpuidex(cpui.data(), 1, 0);
+        f_1_ecx = cpui[2];
+        f_1_edx = cpui[3];
+    }
+
+    if (nids >= 7) {
+        __cpuidex(cpui.data(), 7, 0);
+        f_7_ebx = cpui[1];
+        f_7_ecx = cpui[2];
+    }
+}
+#else // Non-x86 target or unknown compiler: feature bitsets stay all-zero
+RuntimeInfo::InternalRuntimeInfo::InternalRuntimeInfo() {}
+#endif
+} // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/RuntimeInfo.hpp b/pennylane_lightning/src/util/RuntimeInfo.hpp
new file mode 100644
index 0000000000..416422bd45
--- /dev/null
+++ b/pennylane_lightning/src/util/RuntimeInfo.hpp
@@ -0,0 +1,52 @@
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/**
+ * @file
+ * Runtime information based on cpuid
+ */
+#pragma once
+#include <bitset>
+
+namespace Pennylane::Util {
+/**
+ * @brief This class is only usable in x86 or AMD64 architecture.
+ */
+class RuntimeInfo {
+  private:
+    struct InternalRuntimeInfo {
+        InternalRuntimeInfo();
+
+        std::bitset<32> f_1_ecx;
+        std::bitset<32> f_1_edx;
+        std::bitset<32> f_7_ebx;
+        std::bitset<32> f_7_ecx;
+    };
+
+    static const inline InternalRuntimeInfo internal_runtime_info_;
+
+  public:
+    static inline bool AVX() {
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        return internal_runtime_info_.f_1_ecx[28];
+    }
+    static inline bool AVX2() {
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        return internal_runtime_info_.f_7_ebx[5];
+    }
+    static inline bool AVX512F() {
+        // NOLINTNEXTLINE(readability-magic-numbers)
+        return internal_runtime_info_.f_7_ebx[16];
+    }
+};
+} // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/TypeList.hpp b/pennylane_lightning/src/util/TypeList.hpp
index e288bd80a5..a53c3cbd5d 100644
--- a/pennylane_lightning/src/util/TypeList.hpp
+++ b/pennylane_lightning/src/util/TypeList.hpp
@@ -18,14 +18,19 @@
 #pragma once
 
 #include <cstdlib>
+#include <tuple>
 #include <type_traits>
+#include <utility>
 
 namespace Pennylane::Util {
 template <typename T, typename... Ts> struct TypeNode {
     using Type = T;
     using Next = TypeNode<Ts...>;
 };
-
+template <typename T> struct TypeNode<T, void> {
+    using Type = T;
+    using Next = void;
+};
 template <typename T> struct TypeNode<T> {
     using Type = T;
     using Next = void;
@@ -36,16 +41,33 @@ template <typename T> struct TypeNode<T> {
  */
 template <typename... Ts> using TypeList = TypeNode<Ts...>;
 
-template <typename TypeList, size_t n> struct getNthType {
-    static_assert(!std::is_same_v<typename TypeList::Next, void>,
-                  "The given n is larger than the length of the typelist.");
-    using Type = getNthType<typename TypeList::Next, n - 1>;
+/**
+ * @brief Get N-th type of a type list.
+ *
+ * @tparam TypeList Type list
+ * @tparam n The position of a type to extract
+ */
+template <typename TypeList, size_t n> struct getNth {
+    using Type = typename getNth<typename TypeList::Next, n - 1>::Type;
 };
 
-template <typename TypeList> struct getNthType<TypeList, 0> {
+/// @cond DEV
+template <typename TypeList> struct getNth<TypeList, 0> {
+    static_assert(!std::is_same_v<typename TypeList::Type, void>,
+                  "The given n is larger than the length of the type list.");
     using Type = typename TypeList::Type;
 };
+/// @endcond
+
+/**
+ * @brief Convenience alias for getNth<TypeList, n>::Type
+ */
+template <typename TypeList, size_t n>
+using getNthType = typename getNth<TypeList, n>::Type;
 
+/**
+ * @brief Get the size of a type list
+ */
 template <typename TypeList> constexpr size_t length() {
     if constexpr (std::is_same_v<TypeList, void>) {
         return 0;
@@ -53,4 +75,22 @@ template <typename TypeList> constexpr size_t length() {
         return 1 + length<typename TypeList::Next>();
     }
 }
+
+/**
+ * @brief Prepend a type to a type list.
+ *
+ * @tparam T Type to prepend
+ * @tparam U TypeList
+ */
+template <typename T, typename U> struct PrependToTypeList;
+
+/// @cond DEV
+template <typename T, typename... Ts>
+struct PrependToTypeList<T, TypeNode<Ts...>> {
+    using Type = TypeNode<T, Ts...>;
+};
+template <typename T> struct PrependToTypeList<T, void> {
+    using Type = TypeNode<T, void>;
+};
+/// @endcond
 } // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/Util.hpp b/pennylane_lightning/src/util/Util.hpp
index 3b184b82f9..adbe6d9f42 100644
--- a/pennylane_lightning/src/util/Util.hpp
+++ b/pennylane_lightning/src/util/Util.hpp
@@ -260,10 +260,11 @@ auto linspace(T start, T end, size_t num_points) -> std::vector<T> {
  *
  * @tparam T Vector data type.
  * @param arr Array to be inspected.
+ * @param length Size of the array
  * @return a vector with indices that would sort the array.
  */
 template <typename T>
-inline auto sorting_indices(const T &arr, size_t length)
+inline auto sorting_indices(const T *arr, size_t length)
     -> std::vector<size_t> {
     std::vector<size_t> indices(length);
     iota(indices.begin(), indices.end(), 0);
diff --git a/tests/test_measures.py b/tests/test_measures.py
index a843253faa..10c48c5313 100644
--- a/tests/test_measures.py
+++ b/tests/test_measures.py
@@ -16,6 +16,7 @@
 """
 import numpy as np
 import pennylane as qml
+import math
 from pennylane.measurements import (
     Variance,
     Expectation,
@@ -55,15 +56,20 @@ def dev(self):
 
     def test_probs_dtype64(self, dev):
         """Test if probs changes the state dtype"""
-        dev._state = np.array([1, 0]).astype(np.complex64)
+        dev._state = dev._asarray(
+            np.array([1 / math.sqrt(2), 1 / math.sqrt(2), 0, 0]).astype(np.complex64)
+        )
         p = dev.probability(wires=[0, 1])
 
         assert dev._state.dtype == np.complex64
-        assert np.allclose(p, [1, 1, 0, 0])
+        assert np.allclose(p, [0.5, 0.5, 0, 0])
 
+    @pytest.mark.skipif(
+        not hasattr(np, "complex256"), reason="Numpy only defines complex256 in Linux-like system"
+    )
     def test_probs_dtype_error(self, dev):
         """Test if probs raise error with complex256"""
-        dev._state = np.array([1, 0]).astype(np.complex256)
+        dev._state = np.array([1, 0, 0, 0]).astype(np.complex256)
 
         with pytest.raises(TypeError, match="Unsupported complex Type:"):
             dev.probability(wires=[0, 1])
@@ -179,6 +185,9 @@ def test_expval_dtype64(self, dev):
         assert dev._state.dtype == np.complex64
         assert np.allclose(e, 0.0)
 
+    @pytest.mark.skipif(
+        not hasattr(np, "complex256"), reason="Numpy only defines complex256 in Linux-like system"
+    )
     def test_expval_dtype_error(self, dev):
         """Test if expval raise error with complex256"""
         dev._state = np.array([1, 0]).astype(np.complex256)
@@ -296,6 +305,9 @@ def test_var_dtype64(self, dev):
         assert dev._state.dtype == np.complex64
         assert np.allclose(v, 1.0)
 
+    @pytest.mark.skipif(
+        not hasattr(np, "complex256"), reason="Numpy only defines complex256 in Linux-like system"
+    )
     def test_expval_dtype_error(self, dev):
         """Test if var raise error with complex256"""
         dev._state = np.array([1, 0]).astype(np.complex256)

From 1f9590b39b37d0caaeeaf0f6165cdf71d01e742b Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 19:40:20 -0500
Subject: [PATCH 54/94] Add tests

---
 .../src/gates/GateImplementationsLM.hpp       |   1 +
 pennylane_lightning/src/tests/CMakeLists.txt  |   5 +-
 .../src/tests/CreateAllWires.cpp              |  30 +++
 .../src/tests/CreateAllWires.hpp              |  92 +++++++++
 pennylane_lightning/src/tests/TestKernels.hpp |   2 +-
 ...est_GateImplementations_CompareKernels.cpp | 186 ++++++++++++++++++
 .../src/tests/Test_RuntimeInfo.cpp            |  13 ++
 pennylane_lightning/src/tests/Test_Util.cpp   |   1 +
 8 files changed, 328 insertions(+), 2 deletions(-)
 create mode 100644 pennylane_lightning/src/tests/CreateAllWires.cpp
 create mode 100644 pennylane_lightning/src/tests/CreateAllWires.hpp
 create mode 100644 pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
 create mode 100644 pennylane_lightning/src/tests/Test_RuntimeInfo.cpp

diff --git a/pennylane_lightning/src/gates/GateImplementationsLM.hpp b/pennylane_lightning/src/gates/GateImplementationsLM.hpp
index 9f227862b2..488079992e 100644
--- a/pennylane_lightning/src/gates/GateImplementationsLM.hpp
+++ b/pennylane_lightning/src/gates/GateImplementationsLM.hpp
@@ -235,6 +235,7 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
     static void applyMatrix(std::complex<PrecisionT> *arr, size_t num_qubits,
                             const std::complex<PrecisionT> *matrix,
                             const std::vector<size_t> &wires, bool inverse) {
+        using Util::Trans;
         assert(num_qubits >= wires.size());
 
         switch (wires.size()) {
diff --git a/pennylane_lightning/src/tests/CMakeLists.txt b/pennylane_lightning/src/tests/CMakeLists.txt
index c507f938bf..3bd3b713b9 100644
--- a/pennylane_lightning/src/tests/CMakeLists.txt
+++ b/pennylane_lightning/src/tests/CMakeLists.txt
@@ -68,9 +68,11 @@ endif()
 add_executable(compile_time_tests compile_time_tests.cpp)
 target_link_libraries(compile_time_tests lightning_gates lightning_utils)
 
-set(TEST_SOURCES Test_AdjDiff.cpp
+set(TEST_SOURCES CreateAllWires.cpp
+                 Test_AdjDiff.cpp
 #                 Test_Bindings.cpp
                  Test_DynamicDispatcher.cpp
+                 Test_GateImplementations_CompareKernels.cpp
                  Test_GateImplementations_Generator.cpp
                  Test_GateImplementations_Inverse.cpp
                  Test_GateImplementations_Matrix.cpp
@@ -80,6 +82,7 @@ set(TEST_SOURCES Test_AdjDiff.cpp
                  Test_Internal.cpp
                  Test_Measures.cpp
                  Test_OpToMemberFuncPtr.cpp
+                 Test_RuntimeInfo.cpp
                  Test_StateVectorManaged.cpp
                  Test_StateVectorRaw.cpp
                  Test_Util.cpp
diff --git a/pennylane_lightning/src/tests/CreateAllWires.cpp b/pennylane_lightning/src/tests/CreateAllWires.cpp
new file mode 100644
index 0000000000..dd0194a625
--- /dev/null
+++ b/pennylane_lightning/src/tests/CreateAllWires.cpp
@@ -0,0 +1,30 @@
+#include "CreateAllWires.hpp"
+namespace Pennylane {
+auto crateAllWires(size_t n_qubits, Gates::GateOperation gate_op, bool order)
+    -> std::vector<std::vector<size_t>> {
+    if (Util::array_has_elt(Gates::Constant::multi_qubit_gates, gate_op)) {
+        // all non-empty wire subsets (2^N - 1); bit i of k selects wire i
+        std::vector<std::vector<size_t>> res;
+        const size_t n_subsets = static_cast<size_t>(1U) << n_qubits;
+        res.reserve(n_subsets - 1);
+        for (size_t k = 1; k < n_subsets; k++) {
+            std::vector<size_t> wires;
+            wires.reserve(Util::popcount(k));
+
+            for (size_t i = 0; i < n_qubits; i++) {
+                if (((k >> i) & 1U) == 1U) {
+                    wires.emplace_back(i);
+                }
+            }
+
+            res.push_back(wires);
+        }
+        return res;
+    } // else
+    const size_t n_wires = Util::lookup(Gates::Constant::gate_wires, gate_op);
+    if (order) {
+        return PermutationGenerator(n_qubits, n_wires).all_perms();
+    } // else
+    return CombinationGenerator(n_qubits, n_wires).all_perms();
+}
+} // namespace Pennylane
diff --git a/pennylane_lightning/src/tests/CreateAllWires.hpp b/pennylane_lightning/src/tests/CreateAllWires.hpp
new file mode 100644
index 0000000000..54d3cd9e9a
--- /dev/null
+++ b/pennylane_lightning/src/tests/CreateAllWires.hpp
@@ -0,0 +1,92 @@
+#pragma once
+#include "BitUtil.hpp"
+#include "Constant.hpp"
+#include "ConstantUtil.hpp"
+#include "GateOperation.hpp"
+
+#include <cstdlib>
+#include <vector>
+
+namespace Pennylane {
+
+class WiresGenerator {
+  public:
+    [[nodiscard]] virtual auto all_perms() const
+        -> const std::vector<std::vector<size_t>> & = 0;
+};
+class CombinationGenerator : public WiresGenerator {
+  private:
+    std::vector<size_t> v_;
+    std::vector<std::vector<size_t>> all_perms_;
+
+  public:
+    void comb(size_t n, size_t r) {
+        if (r == 0) {
+            all_perms_.push_back(v_);
+            return;
+        }
+        if (n < r) {
+            return;
+        }
+
+        v_[r - 1] = n - 1;
+        comb(n - 1, r - 1);
+
+        comb(n - 1, r);
+    }
+
+    CombinationGenerator(size_t n, size_t r) {
+        v_.resize(r);
+        comb(n, r);
+    }
+
+    [[nodiscard]] auto all_perms() const
+        -> const std::vector<std::vector<size_t>> & override {
+        return all_perms_;
+    }
+};
+class PermutationGenerator : public WiresGenerator {
+  private:
+    std::vector<std::vector<size_t>> all_perms_;
+    std::vector<size_t> available_elts_;
+    std::vector<size_t> v;
+
+  public:
+    void perm(size_t n, size_t r) {
+        if (r == 0) {
+            all_perms_.push_back(v);
+            return;
+        }
+        for (size_t i = 0; i < n; i++) {
+            v[r - 1] = available_elts_[i];
+            std::swap(available_elts_[n - 1], available_elts_[i]);
+            perm(n - 1, r - 1);
+            std::swap(available_elts_[n - 1], available_elts_[i]);
+        }
+    }
+
+    PermutationGenerator(size_t n, size_t r) {
+        v.resize(r);
+
+        available_elts_.resize(n);
+        std::iota(available_elts_.begin(), available_elts_.end(), 0);
+        perm(n, r);
+    }
+
+    [[nodiscard]] auto all_perms() const
+        -> const std::vector<std::vector<size_t>> & override {
+        return all_perms_;
+    }
+};
+
+/**
+ * @brief Create all possible combination of wires
+ * for a given number of qubits and gate operation
+ *
+ * @param n_qubits Number of qubits
+ * @param gate_op Gate operation
+ * @param order Whether the ordering matters (if true, permutation is used)
+ */
+auto crateAllWires(size_t n_qubits, Gates::GateOperation gate_op, bool order)
+    -> std::vector<std::vector<size_t>>;
+} // namespace Pennylane
diff --git a/pennylane_lightning/src/tests/TestKernels.hpp b/pennylane_lightning/src/tests/TestKernels.hpp
index e9b9cfa785..74085a4d75 100644
--- a/pennylane_lightning/src/tests/TestKernels.hpp
+++ b/pennylane_lightning/src/tests/TestKernels.hpp
@@ -10,4 +10,4 @@
 
 using TestKernels =
     Pennylane::Util::TypeList<Pennylane::Gates::GateImplementationsLM,
-                              Pennylane::Gates::GateImplementationsPI>;
+                              Pennylane::Gates::GateImplementationsPI, void>;
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
new file mode 100644
index 0000000000..c66f07e522
--- /dev/null
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
@@ -0,0 +1,186 @@
+#include "CreateAllWires.hpp"
+#include "TestHelpers.hpp"
+
+#include "OpToMemberFuncPtr.hpp"
+#include "TestKernels.hpp"
+#include "Util.hpp"
+
+#include <catch2/catch.hpp>
+
+#include <algorithm>
+#include <complex>
+#include <iostream>
+#include <limits>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+/**
+ * @file Test_GateImplementations_CompareKernels.cpp
+ *
+ * This file tests all gate operations (besides matrix) by comparing results
+ * between different kernels (gate implementations).
+ */
+using namespace Pennylane;
+using namespace Pennylane::Gates;
+using namespace Pennylane::Util;
+
+namespace {
+using namespace Pennylane::Gates::Constant;
+} // namespace
+
+using std::vector;
+
+template <typename TypeList> std::string kernelsToString() {
+    if constexpr (!std::is_same_v<TypeList, void>) {
+        return std::string(TypeList::Type::name) + ", " +
+               kernelsToString<typename TypeList::Next>();
+    }
+    return "";
+}
+
+/* Type transformation */
+template <Gates::GateOperation gate_op, typename TypeList>
+struct KernelsImplementingGateHelper {
+    using Type = std::conditional_t<
+        array_has_elt(TypeList::Type::implemented_gates, gate_op),
+        typename PrependToTypeList<
+            typename TypeList::Type,
+            typename KernelsImplementingGateHelper<
+                gate_op, typename TypeList::Next>::Type>::Type,
+        typename KernelsImplementingGateHelper<gate_op,
+                                               typename TypeList::Next>::Type>;
+};
+template <Gates::GateOperation gate_op>
+struct KernelsImplementingGateHelper<gate_op, void> {
+    using Type = void;
+};
+
+template <Gates::GateOperation gate_op> struct KernelsImplementingGate {
+    using Type =
+        typename KernelsImplementingGateHelper<gate_op, TestKernels>::Type;
+};
+
+/**
+ * @brief Apply the given gate operation with the given gate implementation.
+ *
+ * @tparam gate_op Gate operation to test
+ * @tparam PrecisionT Floating point data type for statevector
+ * @tparam ParamT Floating point data type for parameter
+ * @tparam GateImplementation Gate implementation class
+ * @param ini Initial statevector
+ * @param num_qubits Number of qubits
+ * @param wires Wires the gate applies to
+ * @param inverse Whether to use inverse of gate
+ * @param params Parameters for gate
+ */
+template <Gates::GateOperation gate_op, typename PrecisionT, typename ParamT,
+          typename GateImplementation, class Alloc>
+auto applyGate(std::vector<std::complex<PrecisionT>, Alloc> ini,
+               size_t num_qubits, const std::vector<size_t> &wires,
+               bool inverse, const std::vector<ParamT> &params)
+    -> std::vector<std::complex<PrecisionT>, Alloc> {
+    callGateOps(GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplementation,
+                                      gate_op>::value,
+                ini.data(), num_qubits, wires, inverse, params);
+    return ini;
+}
+
+/**
+ * @brief Apply the given gate using all implementing kernels and return
+ * results in tuple.
+ */
+template <Gates::GateOperation gate_op, typename PrecisionT, typename ParamT,
+          typename Kernels, class Alloc, size_t... I>
+auto applyGateForImplemetingKernels(
+    const std::vector<std::complex<PrecisionT>, Alloc> &ini, size_t num_qubits,
+    const std::vector<size_t> &wires, bool inverse,
+    const std::vector<ParamT> &params,
+    [[maybe_unused]] std::index_sequence<I...> dummy) {
+    return std::make_tuple(
+        applyGate<gate_op, PrecisionT, ParamT, getNthType<Kernels, I>>(
+            ini, num_qubits, wires, inverse, params)...);
+}
+
+template <Gates::GateOperation gate_op, typename PrecisionT, typename ParamT,
+          class RandomEngine>
+void testApplyGate(RandomEngine &re, size_t num_qubits) {
+    const auto ini = createRandomState<PrecisionT>(re, num_qubits);
+
+    using Kernels = typename KernelsImplementingGate<gate_op>::Type;
+
+    INFO("Kernels implementing " << lookup(gate_names, gate_op) << " are "
+                                 << kernelsToString<Kernels>());
+
+    INFO("PrecisionT, ParamT = " << PrecisionToName<PrecisionT>::value << ", "
+                                 << PrecisionToName<ParamT>::value);
+
+    const auto all_wires = crateAllWires(num_qubits, gate_op, true);
+    for (const auto &wires : all_wires) {
+        const auto params = createParams<ParamT>(gate_op);
+        const auto gate_name = lookup(gate_names, gate_op);
+        DYNAMIC_SECTION(
+            "Test gate "
+            << gate_name
+            << " with inverse = false") { // Test with inverse = false
+            const auto results = Util::tuple_to_array(
+                applyGateForImplemetingKernels<gate_op, PrecisionT, ParamT,
+                                               Kernels>(
+                    ini, num_qubits, wires, false, params,
+                    std::make_index_sequence<length<Kernels>()>()));
+
+            for (size_t i = 0; i < results.size() - 1; i++) {
+                REQUIRE(results[i] ==
+                        PLApprox(results[i + 1])
+                            .margin(static_cast<PrecisionT>(1e-5)));
+            }
+        }
+
+        DYNAMIC_SECTION("Test gate "
+                        << gate_name
+                        << " with inverse = true") { // Test with inverse = true
+            const auto results = Util::tuple_to_array(
+                applyGateForImplemetingKernels<gate_op, PrecisionT, ParamT,
+                                               Kernels>(
+                    ini, num_qubits, wires, true, params,
+                    std::make_index_sequence<length<Kernels>()>()));
+
+            for (size_t i = 0; i < results.size() - 1; i++) {
+                REQUIRE(results[i] ==
+                        PLApprox(results[i + 1])
+                            .margin(static_cast<PrecisionT>(1e-5)));
+            }
+        }
+    }
+}
+
+template <size_t gate_idx, typename PrecisionT, typename ParamT,
+          class RandomEngine>
+void testAllGatesIter(RandomEngine &re, size_t max_num_qubits) {
+    if constexpr (gate_idx < static_cast<size_t>(GateOperation::END)) {
+        constexpr static auto gate_op = static_cast<GateOperation>(gate_idx);
+
+        if constexpr (gate_op != GateOperation::Matrix) {
+            size_t min_num_qubits = array_has_elt(multi_qubit_gates, gate_op)
+                                        ? 1
+                                        : lookup(gate_wires, gate_op);
+            for (size_t num_qubits = min_num_qubits;
+                 num_qubits < max_num_qubits; num_qubits++) {
+                testApplyGate<gate_op, PrecisionT, ParamT>(re, num_qubits);
+            }
+        }
+        // Always advance, even past Matrix, so later gates are not skipped
+        testAllGatesIter<gate_idx + 1, PrecisionT, ParamT>(re, max_num_qubits);
+    }
+}
+
+template <typename PrecisionT, typename ParamT, class RandomEngine>
+void testAllGates(RandomEngine &re, size_t max_num_qubits) {
+    testAllGatesIter<0, PrecisionT, ParamT>(re, max_num_qubits);
+}
+
+TEMPLATE_TEST_CASE("Test all kernels give the same results",
+                   "[Test_GateImplementations_CompareKernels]", float, double) {
+    std::mt19937 re{1337};
+    testAllGates<TestType, TestType>(re, 6);
+}
diff --git a/pennylane_lightning/src/tests/Test_RuntimeInfo.cpp b/pennylane_lightning/src/tests/Test_RuntimeInfo.cpp
new file mode 100644
index 0000000000..93823e386b
--- /dev/null
+++ b/pennylane_lightning/src/tests/Test_RuntimeInfo.cpp
@@ -0,0 +1,13 @@
+#include "Macros.hpp"
+#include "RuntimeInfo.hpp"
+
+#include <catch2/catch.hpp>
+
+using namespace Pennylane::Util;
+
+TEST_CASE("Runtime information is correct", "[Test_RuntimeInfo]") {
+    INFO("RuntimeInfo::AVX " << RuntimeInfo::AVX());
+    INFO("RuntimeInfo::AVX2 " << RuntimeInfo::AVX2());
+    INFO("RuntimeInfo::AVX512F " << RuntimeInfo::AVX512F());
+    REQUIRE(true);
+}
diff --git a/pennylane_lightning/src/tests/Test_Util.cpp b/pennylane_lightning/src/tests/Test_Util.cpp
index 4360a793f5..c07da227c9 100644
--- a/pennylane_lightning/src/tests/Test_Util.cpp
+++ b/pennylane_lightning/src/tests/Test_Util.cpp
@@ -60,6 +60,7 @@ TEMPLATE_TEST_CASE("Constant values", "[Util]", float, double) {
 // NOLINTNEXTLINE: Avoid complexity errors
 TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
                    double) {
+    using Util::Trans;
     SECTION("exp2: 2^n") {
         for (size_t i = 0; i < 10; i++) {
             CHECK(Util::exp2(i) == static_cast<size_t>(std::pow(2, i)));

From 66ceab58a49452e119a2efb9d7d72d272028420d Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 19:55:59 -0500
Subject: [PATCH 55/94] Add runtime/compile info in binary

---
 cmake/process_options.cmake                   | 14 +++++
 pennylane_lightning/src/bindings/Bindings.cpp | 13 ++++-
 pennylane_lightning/src/bindings/Bindings.hpp | 55 +++++++++++++++++++
 tests/test_binary_info.py                     | 33 +++++++++++
 4 files changed, 112 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_binary_info.py

diff --git a/cmake/process_options.cmake b/cmake/process_options.cmake
index d3ecccd3f9..815a04e43d 100644
--- a/cmake/process_options.cmake
+++ b/cmake/process_options.cmake
@@ -62,6 +62,20 @@ else()
     message(STATUS "ENABLE_AVX is OFF")
 endif()
 
+if(ENABLE_AVX2)
+    message(STATUS "ENABLE_AVX2 is ON.")
+    target_compile_options(lightning_compile_options INTERFACE -mavx2)
+else()
+    message(STATUS "ENABLE_AVX2 is OFF")
+endif()
+
+if(ENABLE_AVX512)
+    message(STATUS "ENABLE_AVX512 is ON.")
+    target_compile_options(lightning_compile_options INTERFACE -mavx512f) # Now we only use avx512f
+else()
+    message(STATUS "ENABLE_AVX512 is OFF")
+endif()
+
 if(ENABLE_OPENMP)
     message(STATUS "ENABLE_OPENMP is ON.")
     find_package(OpenMP)
diff --git a/pennylane_lightning/src/bindings/Bindings.cpp b/pennylane_lightning/src/bindings/Bindings.cpp
index eb1e73daf8..73b98f8c55 100644
--- a/pennylane_lightning/src/bindings/Bindings.cpp
+++ b/pennylane_lightning/src/bindings/Bindings.cpp
@@ -364,6 +364,12 @@ PYBIND11_MODULE(lightning_qubit_ops, // NOLINT: No control over Pybind internals
               &Gates::getIndicesAfterExclusion),
           "Get statevector indices for gate application");
 
+    /* Add compile info */
+    m.def("compile_info", &getCompileInfo, "Compiled binary information.");
+
+    /* Add runtime info */
+    m.def("runtime_info", &getRuntimeInfo, "Runtime information.");
+
     /* Add EXPORTED_KERNELS */
     std::vector<std::pair<std::string, std::string>> exported_kernel_ops;
 
@@ -372,7 +378,7 @@ PYBIND11_MODULE(lightning_qubit_ops, // NOLINT: No control over Pybind internals
         const auto implemented_gates = implementedGatesForKernel(kernel);
         for (const auto gate_op : implemented_gates) {
             const auto gate_name =
-                std::string(lookup(Constant::gate_names, gate_op));
+                std::string(lookup(Gates::Constant::gate_names, gate_op));
             exported_kernel_ops.emplace_back(kernel_name, gate_name);
         }
     }
@@ -381,8 +387,9 @@ PYBIND11_MODULE(lightning_qubit_ops, // NOLINT: No control over Pybind internals
 
     /* Add DEFAULT_KERNEL_FOR_OPS */
     std::map<std::string, std::string> default_kernel_ops_map;
-    for (const auto &[gate_op, name] : Constant::gate_names) {
-        const auto kernel = lookup(Constant::default_kernel_for_gates, gate_op);
+    for (const auto &[gate_op, name] : Gates::Constant::gate_names) {
+        const auto kernel =
+            lookup(Gates::Constant::default_kernel_for_gates, gate_op);
         const auto kernel_name = Util::lookup(kernel_id_name_pairs, kernel);
         default_kernel_ops_map.emplace(std::string(name), kernel_name);
     }
diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index 5d79774ffd..d247144a8a 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -19,8 +19,10 @@
 #pragma once
 #include "AdjointDiff.hpp"
 #include "JacobianProd.hpp"
+#include "Macros.hpp"
 #include "Measures.hpp"
 #include "OpToMemberFuncPtr.hpp"
+#include "RuntimeInfo.hpp"
 #include "StateVectorRaw.hpp"
 
 #include "pybind11/complex.h"
@@ -227,4 +229,57 @@ template <class PrecisionT, class ParamT, class PyClass>
 void registerKernelsToPyexport(PyClass &pyclass) {
     registerKernelsToPyexportIter<PrecisionT, ParamT, 0>(pyclass);
 }
+
+/**
+ * @brief Build a dict with CPU arch, compiler name/version, AVX2 and AVX512F.
+ */
+inline auto getCompileInfo() -> pybind11::dict {
+    using namespace Util::Constant;
+    using namespace pybind11::literals;
+
+    const std::string_view cpu_arch_str = [] {
+        switch (cpu_arch) {
+        case CPUArch::AMD64:
+            return "AMD64";
+        case CPUArch::PPC64:
+            return "PPC64";
+        case CPUArch::ARM:
+            return "ARM";
+        default:
+            return "Unknown";
+        }
+    }();
+
+    const std::string_view compiler_name_str = [] {
+        switch (compiler) {
+        case Compiler::GCC:
+            return "GCC";
+        case Compiler::Clang:
+            return "Clang";
+        case Compiler::MSVC:
+            return "MSVC";
+        default:
+            return "Unknown";
+        }
+    }();
+
+    const auto compiler_version_str = getCompilerVersion<compiler>();
+
+    return pybind11::dict("cpu.arch"_a = cpu_arch_str,
+                          "compiler.name"_a = compiler_name_str,
+                          "compiler.version"_a = compiler_version_str,
+                          "AVX2"_a = use_avx2, "AVX512F"_a = use_avx512f);
+}
+
+/**
+ * @brief Build a dict holding RuntimeInfo's AVX, AVX2 and AVX512F values.
+ */
+inline auto getRuntimeInfo() -> pybind11::dict {
+    using namespace Util::Constant;
+    using namespace pybind11::literals;
+
+    return pybind11::dict("AVX"_a = RuntimeInfo::AVX(),
+                          "AVX2"_a = RuntimeInfo::AVX2(),
+                          "AVX512F"_a = RuntimeInfo::AVX512F());
+}
 } // namespace Pennylane
diff --git a/tests/test_binary_info.py b/tests/test_binary_info.py
new file mode 100644
index 0000000000..7fe6ff5b72
--- /dev/null
+++ b/tests/test_binary_info.py
@@ -0,0 +1,33 @@
+# Copyright 2018-2020 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Test binary information of ``lightning.qubit``.
+"""
+
+try:
+    from pennylane_lightning.lightning_qubit_ops import runtime_info, compile_info
+except (ImportError, ModuleNotFoundError):
+    pytest.skip("No binary module found. Skipping.", allow_module_level=True)
+
+
+def test_runtime_info():
+    m = runtime_info()
+    for key in ["AVX", "AVX2", "AVX512F"]:
+        assert key in m
+
+
+def test_compile_info():
+    m = compile_info()
+    for key in ["cpu.arch", "compiler.name", "compiler.version", "AVX2", "AVX512F"]:
+        assert key in m

From d237ceb39ab35cb7011c211747e52196c803ba9b Mon Sep 17 00:00:00 2001
From: Dev version update bot <chae-yeun@xanadu.ai>
Date: Thu, 10 Mar 2022 00:57:00 +0000
Subject: [PATCH 56/94] Auto update version

---
 pennylane_lightning/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index fd08943a9b..7b13995518 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.22.0-dev16"
+__version__ = "0.22.0-dev17"

From 386edb609c7065e03ef61476494d6404aa30497f Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 19:58:26 -0500
Subject: [PATCH 57/94] Fix for non-binary

---
 tests/test_binary_info.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_binary_info.py b/tests/test_binary_info.py
index 7fe6ff5b72..dfa207436b 100644
--- a/tests/test_binary_info.py
+++ b/tests/test_binary_info.py
@@ -14,6 +14,7 @@
 """
 Test binary information of ``lightning.qubit``.
 """
+import pytest
 
 try:
     from pennylane_lightning.lightning_qubit_ops import runtime_info, compile_info
@@ -26,7 +27,6 @@ def test_runtime_info():
     for key in ["AVX", "AVX2", "AVX512F"]:
         assert key in m
 
-
 def test_compile_info():
     m = compile_info()
     for key in ["cpu.arch", "compiler.name", "compiler.version", "AVX2", "AVX512F"]:

From cd9027f08d020d9506992d9aa52284e7db4023a7 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 20:00:34 -0500
Subject: [PATCH 58/94] Format

---
 tests/test_binary_info.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_binary_info.py b/tests/test_binary_info.py
index dfa207436b..d0035ac61e 100644
--- a/tests/test_binary_info.py
+++ b/tests/test_binary_info.py
@@ -27,6 +27,7 @@ def test_runtime_info():
     for key in ["AVX", "AVX2", "AVX512F"]:
         assert key in m
 
+
 def test_compile_info():
     m = compile_info()
     for key in ["cpu.arch", "compiler.name", "compiler.version", "AVX2", "AVX512F"]:

From da3d9ec097ac37c0a5a3d6742bc7f1be8d66f922 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 20:08:49 -0500
Subject: [PATCH 59/94] Update tidy

---
 pennylane_lightning/src/.clang-tidy       | 6 ++----
 pennylane_lightning/src/CMakeLists.txt    | 3 ++-
 pennylane_lightning/src/tests/.clang-tidy | 4 ++--
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/pennylane_lightning/src/.clang-tidy b/pennylane_lightning/src/.clang-tidy
index f015b16a1d..50b924d24b 100644
--- a/pennylane_lightning/src/.clang-tidy
+++ b/pennylane_lightning/src/.clang-tidy
@@ -1,5 +1,5 @@
 ---
-Checks:          'clang-diagnostic-*,clang-analyzer-*,-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,hicpp-*,-hicpp-no-array-decay,bugprone-suspicious-*,llvm-namespace-comment,'
+Checks:          '-*,clang-diagnostic-*,clang-analyzer-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,hicpp-*,-hicpp-avoid-c-arrays,-hicpp-no-array-decay,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-avoid-non-const-global-variables,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions'
 WarningsAsErrors: '*'
 HeaderFilterRegex: '.*'
 AnalyzeTemporaryDtors: false
@@ -25,8 +25,6 @@ CheckOptions:
     value:           'false'
   - key:             readability-magic-numbers.IgnoredIntegerValues
     value:           '1;2;3;4;'
-  - key:             readability-magic-numbers.IgnorePowersOf2IntegerValues
-    value:           true
   - key:             modernize-use-default-member-init.UseAssignment
     value:           'false'
   - key:             readability-function-size.NestingThreshold
@@ -218,7 +216,7 @@ CheckOptions:
   - key:             modernize-use-auto.RemoveStars
     value:           'false'
   - key:             readability-magic-numbers.IgnorePowersOf2IntegerValues
-    value:           'false'
+    value:           'true'
   - key:             portability-simd-intrinsics.Std
     value:           ''
   - key:             readability-redundant-member-init.IgnoreBaseInCopyConstructors
diff --git a/pennylane_lightning/src/CMakeLists.txt b/pennylane_lightning/src/CMakeLists.txt
index b6776ac992..0385f947d6 100644
--- a/pennylane_lightning/src/CMakeLists.txt
+++ b/pennylane_lightning/src/CMakeLists.txt
@@ -11,8 +11,9 @@ if(ENABLE_CLANG_TIDY)
     if(NOT DEFINED CLANG_TIDY_BINARY)
         set(CLANG_TIDY_BINARY clang-tidy)
     endif()
+    message(STATUS "Using CLANG_TIDY_BINARY=${CLANG_TIDY_BINARY}")
     set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY_BINARY};
-                            -extra-arg=-std=c++17;
+                             -extra-arg=-std=c++17;
     )
 endif()
 
diff --git a/pennylane_lightning/src/tests/.clang-tidy b/pennylane_lightning/src/tests/.clang-tidy
index 3ed93f21bf..75afabace1 100644
--- a/pennylane_lightning/src/tests/.clang-tidy
+++ b/pennylane_lightning/src/tests/.clang-tidy
@@ -1,5 +1,5 @@
 ---
-Checks:          'clang-diagnostic-*,clang-analyzer-*,-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,-readability-magic-numbers,-modernize-avoid-c-arrays'
+Checks:          '-*,clang-diagnostic-*,clang-analyzer-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,-modernize-avoid-c-arrays,-readability-magic-numbers,hicpp-*,-hicpp-no-array-decay,-hicpp-avoid-c-arrays,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions'
 WarningsAsErrors: '*'
 HeaderFilterRegex: '.*'
 AnalyzeTemporaryDtors: false
@@ -216,7 +216,7 @@ CheckOptions:
   - key:             modernize-use-auto.RemoveStars
     value:           'false'
   - key:             readability-magic-numbers.IgnorePowersOf2IntegerValues
-    value:           'false'
+    value:           'true'
   - key:             portability-simd-intrinsics.Std
     value:           ''
   - key:             readability-redundant-member-init.IgnoreBaseInCopyConstructors

From 6f7d770a40e5e8b45e729698e5f360cbd868100a Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 20:36:23 -0500
Subject: [PATCH 60/94] Fix for tidy

---
 .../src/gates/GateImplementationsLM.hpp       |  4 ++--
 .../src/simulator/DynamicDispatcher.hpp       |  4 ++--
 .../src/simulator/Measures.hpp                |  2 +-
 pennylane_lightning/src/tests/TestHelpers.hpp |  2 +-
 .../src/tests/Test_AdjDiff.cpp                | 16 +++++++-------
 .../Test_GateImplementations_Generator.cpp    |  2 --
 .../Test_GateImplementations_Nonparam.cpp     | 10 +++------
 .../tests/Test_GateImplementations_Param.cpp  | 12 +++++-----
 .../src/tests/Test_OpToMemberFuncPtr.cpp      |  1 +
 .../src/tests/Test_StateVectorRaw.cpp         |  2 +-
 pennylane_lightning/src/tests/Test_Util.cpp   |  6 ++---
 .../src/tests/Test_VectorJacobianProduct.cpp  | 22 +++++++++----------
 pennylane_lightning/src/util/BitUtil.hpp      |  5 +++--
 13 files changed, 42 insertions(+), 46 deletions(-)

diff --git a/pennylane_lightning/src/gates/GateImplementationsLM.hpp b/pennylane_lightning/src/gates/GateImplementationsLM.hpp
index 488079992e..a985e0e373 100644
--- a/pennylane_lightning/src/gates/GateImplementationsLM.hpp
+++ b/pennylane_lightning/src/gates/GateImplementationsLM.hpp
@@ -258,8 +258,8 @@ class GateImplementationsLM : public PauliGenerator<GateImplementationsLM> {
                     size_t idx = k | inner_idx;
                     size_t n_wires = wires.size();
                     for (size_t pos = 0; pos < n_wires; pos++) {
-                        bitswap(idx, n_wires - pos - 1,
-                                num_qubits - wires[pos] - 1);
+                        idx = bitswap(idx, n_wires - pos - 1,
+                                      num_qubits - wires[pos] - 1);
                     }
                     indices[inner_idx] = idx;
                     coeffs_in[inner_idx] = arr[idx];
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index 83536f9076..469873751f 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -60,12 +60,12 @@ namespace Pennylane {
 template <class PrecisionT, class ParamT> struct registerBeforeMain;
 
 template <> struct registerBeforeMain<float, float> {
-    static inline int dummy =
+    const static inline int dummy =
         Internal::registerAllAvailableKernels<float, float>();
 };
 
 template <> struct registerBeforeMain<double, double> {
-    static inline int dummy =
+    const static inline int dummy =
         Internal::registerAllAvailableKernels<double, double>();
 };
 
diff --git a/pennylane_lightning/src/simulator/Measures.hpp b/pennylane_lightning/src/simulator/Measures.hpp
index 26208b6ba1..9e45453067 100644
--- a/pennylane_lightning/src/simulator/Measures.hpp
+++ b/pennylane_lightning/src/simulator/Measures.hpp
@@ -46,7 +46,7 @@ class Measures {
     using CFP_t = std::complex<fp_t>;
 
   public:
-    Measures(const SVType &provided_statevector)
+    explicit Measures(const SVType &provided_statevector)
         : original_statevector{provided_statevector} {};
 
     /**
diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp
index 74faeeb5ce..655a43079d 100644
--- a/pennylane_lightning/src/tests/TestHelpers.hpp
+++ b/pennylane_lightning/src/tests/TestHelpers.hpp
@@ -110,7 +110,7 @@ isApproxEqual(const std::vector<Data_t, AllocA> &data1,
               const typename Data_t::value_type eps =
                   std::numeric_limits<typename Data_t::value_type>::epsilon() *
                   100) {
-    return data1 == PLApprox(data2);
+    return data1 == PLApprox(data2).epsilon(eps);
 }
 
 /**
diff --git a/pennylane_lightning/src/tests/Test_AdjDiff.cpp b/pennylane_lightning/src/tests/Test_AdjDiff.cpp
index dfbac67214..2ca9e3213b 100644
--- a/pennylane_lightning/src/tests/Test_AdjDiff.cpp
+++ b/pennylane_lightning/src/tests/Test_AdjDiff.cpp
@@ -50,7 +50,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RX, Obs=Z",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RX"}, {{p}}, {{0}}, {false});
 
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            std::vector<std::complex<double>> cdata(1U << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
 
             StateVectorRaw<double> psi(cdata.data(), cdata.size());
@@ -82,7 +82,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RY, Obs=X",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RY"}, {{p}}, {{0}}, {false});
 
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            std::vector<std::complex<double>> cdata(1U << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
 
             StateVectorRaw<double> psi(cdata.data(), cdata.size());
@@ -109,7 +109,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RX, Obs=[Z,Z]",
         const size_t num_obs = 2;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRaw<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -140,7 +140,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[Z,Z,Z]",
         const size_t num_obs = 3;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRaw<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -179,7 +179,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[Z,Z,Z], "
         std::vector<double> jacobian(num_obs * num_params, 0);
         std::vector<size_t> t_params{0, 2};
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRaw<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -214,7 +214,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[ZZZ]",
         const size_t num_obs = 1;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRaw<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -249,7 +249,7 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=Mixed, Obs=[XXX]",
         const size_t num_obs = 1;
         std::vector<double> jacobian(num_obs * num_params, 0);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRaw<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -388,4 +388,4 @@ TEST_CASE("AdjointJacobian::adjointJacobian Mixed Ops, Obs and TParams",
         CHECK(expected[1] == Approx(jacobian[1]));
         CHECK(expected[2] == Approx(jacobian[2]));
     }
-}
\ No newline at end of file
+}
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
index 377c45bd5f..fd045742e9 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
@@ -35,8 +35,6 @@ constexpr std::string_view remove_prefix(const std::string_view &str,
     return {str.data() + len, str.length() - len};
 }
 
-constexpr auto gate_name_to_ops = Util::reverse_pairs(Constant::gate_names);
-
 template <GeneratorOperation gntr_op>
 constexpr auto findGateOpForGenerator() -> GateOperation {
     constexpr auto gntr_name =
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
index 86894a47f7..49eb353529 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
@@ -21,10 +21,6 @@
  */
 using namespace Pennylane;
 
-namespace {
-using std::vector;
-}
-
 /**
  * @brief Run test suit only when the gate is defined
  */
@@ -77,7 +73,7 @@ void testApplyPauliX() {
 
         GateImplementation::applyPauliX(st.data(), num_qubits, {index}, false);
         CHECK(st[0] == Util::ZERO<PrecisionT>());
-        CHECK(st[0b1 << (num_qubits - index - 1)] == Util::ONE<PrecisionT>());
+        CHECK(st[1U << (num_qubits - index - 1)] == Util::ONE<PrecisionT>());
     }
 }
 PENNYLANE_RUN_TEST(PauliX);
@@ -147,9 +143,9 @@ void testApplyHadamard() {
         CHECK(expected.imag() == Approx(st[0].imag()));
 
         CHECK(expected.real() ==
-              Approx(st[0b1 << (num_qubits - index - 1)].real()));
+              Approx(st[1U << (num_qubits - index - 1)].real()));
         CHECK(expected.imag() ==
-              Approx(st[0b1 << (num_qubits - index - 1)].imag()));
+              Approx(st[1U << (num_qubits - index - 1)].imag()));
     }
 }
 PENNYLANE_RUN_TEST(Hadamard);
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
index cc6f687e11..894038c514 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
@@ -237,15 +237,15 @@ void testApplyRot() {
         std::vector<PrecisionT>{2.3, 0.1, 0.4}};
 
     std::vector<std::vector<ComplexPrecisionT>> expected_results{
-        std::vector<ComplexPrecisionT>(0b1 << num_qubits),
-        std::vector<ComplexPrecisionT>(0b1 << num_qubits),
-        std::vector<ComplexPrecisionT>(0b1 << num_qubits)};
+        std::vector<ComplexPrecisionT>(1U << num_qubits),
+        std::vector<ComplexPrecisionT>(1U << num_qubits),
+        std::vector<ComplexPrecisionT>(1U << num_qubits)};
 
     for (size_t i = 0; i < angles.size(); i++) {
         const auto rot_mat =
             Gates::getRot<PrecisionT>(angles[i][0], angles[i][1], angles[i][2]);
         expected_results[i][0] = rot_mat[0];
-        expected_results[i][0b1 << (num_qubits - i - 1)] = rot_mat[2];
+        expected_results[i][1U << (num_qubits - i - 1)] = rot_mat[2];
     }
 
     for (size_t index = 0; index < num_qubits; index++) {
@@ -1225,8 +1225,8 @@ void testApplyCRot() {
     std::vector<ComplexPrecisionT> expected_results(8);
     const auto rot_mat =
         Gates::getRot<PrecisionT>(angles[0], angles[1], angles[2]);
-    expected_results[0b1 << (num_qubits - 1)] = rot_mat[0];
-    expected_results[(0b1 << num_qubits) - 2] = rot_mat[2];
+    expected_results[1U << (num_qubits - 1)] = rot_mat[0];
+    expected_results[(1U << num_qubits) - 2] = rot_mat[2];
 
     DYNAMIC_SECTION(GateImplementation::name
                     << ", CRot0,1 |000> -> |000> - "
diff --git a/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp b/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
index a46a7387f6..81f85038e4 100644
--- a/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
+++ b/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
@@ -104,6 +104,7 @@ class DummyImplementation {
         static_cast<void>(arr);
         static_cast<void>(num_qubits);
         static_cast<void>(matrix);
+        static_cast<void>(wires);
         static_cast<void>(inverse);
     }
 
diff --git a/pennylane_lightning/src/tests/Test_StateVectorRaw.cpp b/pennylane_lightning/src/tests/Test_StateVectorRaw.cpp
index 4700c74881..e80c076583 100644
--- a/pennylane_lightning/src/tests/Test_StateVectorRaw.cpp
+++ b/pennylane_lightning/src/tests/Test_StateVectorRaw.cpp
@@ -43,5 +43,5 @@ TEMPLATE_TEST_CASE("StateVectorRaw::setData", "[StateVectorRaw]", float,
 
     REQUIRE(sv.getNumQubits() == 8);
     REQUIRE(sv.getData() == st_data2.data());
-    REQUIRE(sv.getLength() == (1U << 8));
+    REQUIRE(sv.getLength() == (1U << 8U));
 }
diff --git a/pennylane_lightning/src/tests/Test_Util.cpp b/pennylane_lightning/src/tests/Test_Util.cpp
index c07da227c9..18c0a7c4b9 100644
--- a/pennylane_lightning/src/tests/Test_Util.cpp
+++ b/pennylane_lightning/src/tests/Test_Util.cpp
@@ -469,7 +469,7 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
  */
 size_t popcount_slow(uint64_t x) {
     size_t c = 0;
-    for (; x != 0; x >>= 1) {
+    for (; x != 0U; x >>= 1U) {
         if ((x & 1U) != 0U) {
             c++;
         }
@@ -484,8 +484,8 @@ size_t popcount_slow(uint64_t x) {
  */
 size_t ctz_slow(uint64_t x) {
     size_t c = 0;
-    while ((x & 1) == 0) {
-        x >>= 1;
+    while ((x & 1U) == 0) {
+        x >>= 1U;
         c++;
     }
     return c;
diff --git a/pennylane_lightning/src/tests/Test_VectorJacobianProduct.cpp b/pennylane_lightning/src/tests/Test_VectorJacobianProduct.cpp
index babee6b726..8636feb1d9 100644
--- a/pennylane_lightning/src/tests/Test_VectorJacobianProduct.cpp
+++ b/pennylane_lightning/src/tests/Test_VectorJacobianProduct.cpp
@@ -53,7 +53,7 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=RX, Obs=Z dy={0}",
 
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RX"}, {{p}}, {{0}}, {false});
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            std::vector<std::complex<double>> cdata(1U << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
 
             StateVectorRaw<double> psi(cdata.data(), cdata.size());
@@ -91,7 +91,7 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=RX, Obs=Z dy={1}",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RX"}, {{p}}, {{0}}, {false});
 
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            std::vector<std::complex<double>> cdata(1U << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
 
             StateVectorRaw<double> psi(cdata.data(), cdata.size());
@@ -129,7 +129,7 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=RX, Obs=Z dy={0.4}",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RX"}, {{p}}, {{0}}, {false});
 
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            std::vector<std::complex<double>> cdata(1U << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
 
             StateVectorRaw<double> psi(cdata.data(), cdata.size());
@@ -168,7 +168,7 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=RY, Obs=X dy={0.4}",
         for (const auto &p : param) {
             auto ops = OpsData<double>({"RY"}, {{p}}, {{0}}, {false});
 
-            std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+            std::vector<std::complex<double>> cdata(1U << num_qubits);
             cdata[0] = std::complex<double>{1, 0};
 
             StateVectorRaw<double> psi(cdata.data(), cdata.size());
@@ -203,7 +203,7 @@ TEST_CASE(
         std::vector<double> vjp_res(num_params);
         std::vector<double> dy(num_obs, 1);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRaw<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -239,7 +239,7 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=[RX,RX,RX], "
         std::vector<double> vjp_res(num_params);
         std::vector<double> dy(num_obs, 0.4);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRaw<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -282,7 +282,7 @@ TEST_CASE(
         std::vector<double> vjp_res(num_params);
         std::vector<double> dy(num_obs, 1);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRaw<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -322,7 +322,7 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=[RX,RX,RX], "
         std::vector<double> vjp_res(num_params);
         std::vector<double> dy(num_obs, 0.4);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRaw<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -361,7 +361,7 @@ TEST_CASE(
         std::vector<double> vjp_res(num_params);
         std::vector<double> dy(num_obs, 1);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRaw<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -412,7 +412,7 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Op=Mixed, Obs=[XXX], "
         std::vector<double> vjp_res(num_params);
         std::vector<double> dy(num_obs, -0.2);
 
-        std::vector<std::complex<double>> cdata(0b1 << num_qubits);
+        std::vector<std::complex<double>> cdata(1U << num_qubits);
         StateVectorRaw<double> psi(cdata.data(), cdata.size());
         cdata[0] = std::complex<double>{1, 0};
 
@@ -622,4 +622,4 @@ TEST_CASE("VectorJacobianProduct::vectorJacobianProduct Mixed Ops, Obs and "
         CHECK(-0.5 * expected[1] == Approx(vjp_res[1]).margin(1e-7));
         CHECK(-0.5 * expected[2] == Approx(vjp_res[2]).margin(1e-7));
     }
-}
\ No newline at end of file
+}
diff --git a/pennylane_lightning/src/util/BitUtil.hpp b/pennylane_lightning/src/util/BitUtil.hpp
index 8b7251ddc3..48fe1ddfcf 100644
--- a/pennylane_lightning/src/util/BitUtil.hpp
+++ b/pennylane_lightning/src/util/BitUtil.hpp
@@ -196,8 +196,9 @@ inline auto constexpr fillLeadingOnes(size_t pos) -> size_t {
 /**
  * @brief Swap bits in i-th and j-th position in place
  */
-inline void constexpr bitswap(size_t bits, const size_t i, const size_t j) {
+inline auto constexpr bitswap(size_t bits, const size_t i, const size_t j)
+    -> size_t {
     size_t x = ((bits >> i) ^ (bits >> j)) & 1U;
-    bits ^= ((x << i) | (x << j));
+    return bits ^ ((x << i) | (x << j));
 }
 } // namespace Pennylane::Util

From 16df1251c6f79c3e31f7f49f241fdb42c16f9ff4 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 20:54:48 -0500
Subject: [PATCH 61/94] Trigger CI


From 6908f2086f235630d0681a28476a57dc1a1386a3 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 9 Mar 2022 21:28:20 -0500
Subject: [PATCH 62/94] Further update in tests

---
 pennylane_lightning/src/tests/TestHelpers.hpp | 52 ----------
 .../tests/Test_GateImplementations_Matrix.cpp |  1 +
 .../src/tests/Test_Internal.cpp               | 98 ++++++++++++++-----
 pennylane_lightning/src/tests/Test_Util.cpp   | 32 ++++++
 4 files changed, 104 insertions(+), 79 deletions(-)

diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp
index 655a43079d..6831a97b76 100644
--- a/pennylane_lightning/src/tests/TestHelpers.hpp
+++ b/pennylane_lightning/src/tests/TestHelpers.hpp
@@ -294,58 +294,6 @@ auto createParams(Gates::GateOperation op) -> std::vector<PrecisionT> {
     }
     return {};
 }
-/**
- * @brief Generate random unitary matrix
- *
- * @return Generated unitary matrix in row-major format
- */
-template <typename PrecisionT, class RandomEngine>
-auto randomUnitary(RandomEngine &re, size_t num_qubits)
-    -> std::vector<std::complex<PrecisionT>> {
-    using ComplexPrecisionT = std::complex<PrecisionT>;
-    const size_t dim = (1U << num_qubits);
-    std::vector<ComplexPrecisionT> res(dim * dim, ComplexPrecisionT{});
-
-    std::normal_distribution<PrecisionT> dist;
-
-    auto generator = [&dist, &re]() -> ComplexPrecisionT {
-        return ComplexPrecisionT{dist(re), dist(re)};
-    };
-
-    std::generate(res.begin(), res.end(), generator);
-
-    // Simple algorithm to make rows orthogonal with Gram-Schmidt
-    // This algorithm is unstable but works for a small matrix.
-    // Use QR decomposition when we have LAPACK support.
-
-    for (size_t row2 = 0; row2 < dim; row2++) {
-        ComplexPrecisionT *row2_p = res.data() + row2 * dim;
-        for (size_t row1 = 0; row1 < row2; row1++) {
-            const ComplexPrecisionT *row1_p = res.data() + row1 * dim;
-            ComplexPrecisionT dot12 = Util::innerProdC(row1_p, row2_p, dim);
-            ComplexPrecisionT dot11 = squaredNorm(row1_p, dim);
-
-            // orthogonalize row2
-            std::transform(
-                row2_p, row2_p + dim, row1_p, row2_p,
-                [scale = dot12 / dot11](auto &elt2, const auto &elt1) {
-                    return elt2 - scale * elt1;
-                });
-        }
-    }
-
-    // Normalize each row
-    for (size_t row = 0; row < dim; row++) {
-        ComplexPrecisionT *row_p = res.data() + row * dim;
-        PrecisionT norm2 = std::sqrt(squaredNorm(row_p, dim));
-
-        // noramlize row2
-        std::transform(row_p, row_p + dim, row_p, [norm2](const auto c) {
-            return (static_cast<PrecisionT>(1.0) / norm2) * c;
-        });
-    }
-    return res;
-}
 
 template <class PrecisionT> struct PrecisionToName;
 
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp
index dfda96073f..a49440aa20 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp
@@ -6,6 +6,7 @@
 #include <catch2/catch.hpp>
 
 using namespace Pennylane;
+using Pennylane::Util::randomUnitary;
 
 template <typename PrecisionT>
 using ApplyMatrixType = void (*)(std::complex<PrecisionT> *, size_t,
diff --git a/pennylane_lightning/src/tests/Test_Internal.cpp b/pennylane_lightning/src/tests/Test_Internal.cpp
index 00ead21271..33b5fef81a 100644
--- a/pennylane_lightning/src/tests/Test_Internal.cpp
+++ b/pennylane_lightning/src/tests/Test_Internal.cpp
@@ -1,8 +1,10 @@
+#include "CreateAllWires.hpp"
 #include "GateImplementationsPI.hpp"
 #include "TestHelpers.hpp"
 
 #include <catch2/catch.hpp>
 
+#include <algorithm>
 #include <random>
 
 /**
@@ -83,35 +85,77 @@ TEMPLATE_TEST_CASE("createProductState", "[Test_Internal]", float, double) {
     }
 }
 
-/**
- * @brief Test randomUnitary is correct
- */
-TEMPLATE_TEST_CASE("randomUnitary", "[Test_Internal]", float, double) {
-    using PrecisionT = TestType;
-
-    std::mt19937 re{1337};
-
-    for (size_t num_qubits = 1; num_qubits <= 5; num_qubits++) {
-        const size_t dim = (1U << num_qubits);
-        const auto unitary = randomUnitary<PrecisionT>(re, num_qubits);
-
-        std::vector<std::complex<PrecisionT>> unitary_dagger =
-            Util::Transpose(unitary, dim, dim);
-        std::transform(
-            unitary_dagger.begin(), unitary_dagger.end(),
-            unitary_dagger.begin(),
-            [](const std::complex<PrecisionT> &v) { return std::conj(v); });
+size_t binomialCeff(size_t n, size_t r) {
+    size_t num = 1;
+    size_t dem = 1;
+    for (size_t k = 0; k < r; k++) {
+        num *= (n - k);
+    }
+    for (size_t k = 1; k <= r; k++) {
+        dem *= k;
+    }
+    return num / dem;
+}
 
-        std::vector<std::complex<PrecisionT>> mat(dim * dim);
-        Util::matrixMatProd(unitary.data(), unitary_dagger.data(), mat.data(),
-                            dim, dim, dim);
+size_t permSize(size_t n, size_t r) {
+    size_t res = 1;
+    for (size_t k = 0; k < r; k++) {
+        res *= (n - k);
+    }
+    return res;
+}
 
-        std::vector<std::complex<PrecisionT>> identity(
-            dim * dim, std::complex<PrecisionT>{});
-        for (size_t i = 0; i < dim; i++) {
-            identity[i * dim + i] = std::complex<PrecisionT>{1.0, 0.0};
+/**
+ * @brief Test create all wires
+ */
+TEST_CASE("createAllWires", "[Test_Internal]") {
+    SECTION("order = false") {
+        const std::vector<std::pair<size_t, size_t>> test_pairs{
+            {4, 2},  {8, 3},  {12, 1}, {12, 2}, {12, 3},  {12, 4},  {12, 5},
+            {12, 6}, {12, 7}, {12, 8}, {12, 9}, {12, 10}, {12, 11}, {12, 12}};
+
+        for (const auto &[n, r] : test_pairs) {
+            std::vector<std::set<size_t>> vec;
+            auto v = CombinationGenerator(n, r).all_perms();
+
+            REQUIRE(v.size() == binomialCeff(n, r));
+            for (const auto &perm : v) {
+                REQUIRE(perm.size() == r);
+                vec.emplace_back(perm.begin(), perm.end());
+            }
+
+            std::sort(v.begin(), v.end(),
+                      [](const std::vector<size_t> &v1,
+                         const std::vector<size_t> &v2) {
+                          return std::lexicographical_compare(
+                              v1.begin(), v1.end(), v2.begin(), v2.end());
+                      }); // sort lexicographically
+            for (size_t i = 0; i < v.size() - 1; i++) {
+                REQUIRE(v[i] != v[i + 1]); // all combinations must be different
+            }
+        }
+    }
+    SECTION("order = true") {
+        const std::vector<std::pair<size_t, size_t>> test_pairs{
+            {4, 2}, {8, 3}, {12, 1}, {12, 2}, {12, 3}, {12, 4}, {12, 5}};
+
+        for (const auto &[n, r] : test_pairs) {
+            auto v = PermutationGenerator(n, r).all_perms();
+
+            REQUIRE(v.size() == permSize(n, r));
+            for (const auto &perm : v) {
+                REQUIRE(perm.size() == r);
+            }
+
+            std::sort(v.begin(), v.end(),
+                      [](const std::vector<size_t> &v1,
+                         const std::vector<size_t> &v2) {
+                          return std::lexicographical_compare(
+                              v1.begin(), v1.end(), v2.begin(), v2.end());
+                      }); // sort lexicographically
+            for (size_t i = 0; i < v.size() - 1; i++) {
+                REQUIRE(v[i] != v[i + 1]); // all permutations must be different
+            }
         }
-
-        REQUIRE(mat == PLApprox(identity).margin(1e-5));
     }
 }
diff --git a/pennylane_lightning/src/tests/Test_Util.cpp b/pennylane_lightning/src/tests/Test_Util.cpp
index 18c0a7c4b9..7986ee9ef5 100644
--- a/pennylane_lightning/src/tests/Test_Util.cpp
+++ b/pennylane_lightning/src/tests/Test_Util.cpp
@@ -594,3 +594,35 @@ TEST_CASE("Utility array and tuples", "[Util]") {
                 std::pair<std::string_view, int>("Four", 4),
             });
 }
+
+/**
+ * @brief Test randomUnitary is correct
+ */
+TEMPLATE_TEST_CASE("randomUnitary", "[Test_Internal]", float, double) {
+    using PrecisionT = TestType;
+
+    std::mt19937 re{1337};
+
+    for (size_t num_qubits = 1; num_qubits <= 5; num_qubits++) {
+        const size_t dim = (1U << num_qubits);
+        const auto unitary = Util::randomUnitary<PrecisionT>(re, num_qubits);
+
+        auto unitary_dagger = Util::Transpose(unitary, dim, dim);
+        std::transform(
+            unitary_dagger.begin(), unitary_dagger.end(),
+            unitary_dagger.begin(),
+            [](const std::complex<PrecisionT> &v) { return std::conj(v); });
+
+        std::vector<std::complex<PrecisionT>> mat(dim * dim);
+        Util::matrixMatProd(unitary.data(), unitary_dagger.data(), mat.data(),
+                            dim, dim, dim);
+
+        std::vector<std::complex<PrecisionT>> identity(
+            dim * dim, std::complex<PrecisionT>{});
+        for (size_t i = 0; i < dim; i++) {
+            identity[i * dim + i] = std::complex<PrecisionT>{1.0, 0.0};
+        }
+
+        REQUIRE(mat == PLApprox(identity).margin(1e-5));
+    }
+}

From fd2c3f6d0bdf9a17c47312c08ce689ac107a4f4a Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@xanadu.ai>
Date: Mon, 14 Mar 2022 16:09:25 -0400
Subject: [PATCH 63/94] Apply suggestions from code review

Co-authored-by: Ali Asadi <ali@xanadu.ai>
---
 doc/conf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/conf.py b/doc/conf.py
index 37a6be4452..b94e1c5983 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -72,7 +72,7 @@ def __getattr__(cls, name):
 # -- General configuration ------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
-needs_sphinx = "1.6"
+needs_sphinx = "3.3"
 
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
@@ -229,7 +229,7 @@ def __getattr__(cls, name):
     "download_button": "#19b37b",
 }
 
-edit_on_github_project = "XanaduAI/pennylane-lightning"
+edit_on_github_project = "PennyLaneAI/pennylane-lightning"
 edit_on_github_branch = "master/doc"
 
 # ============================================================

From de62ece3e8b1030f19558265e6fef4c497d2603b Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@xanadu.ai>
Date: Mon, 14 Mar 2022 16:10:34 -0400
Subject: [PATCH 64/94] Update pennylane_lightning/src/bindings/Bindings.cpp

Co-authored-by: Ali Asadi <ali@xanadu.ai>
---
 pennylane_lightning/src/bindings/Bindings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/src/bindings/Bindings.cpp b/pennylane_lightning/src/bindings/Bindings.cpp
index 73b98f8c55..8cdabbf2ed 100644
--- a/pennylane_lightning/src/bindings/Bindings.cpp
+++ b/pennylane_lightning/src/bindings/Bindings.cpp
@@ -367,7 +367,7 @@ PYBIND11_MODULE(lightning_qubit_ops, // NOLINT: No control over Pybind internals
     /* Add compile info */
     m.def("compile_info", &getCompileInfo, "Compiled binary information.");
 
-    /* Add compile info */
+    /* Add runtime info */
     m.def("runtime_info", &getRuntimeInfo, "Runtime information.");
 
     /* Add EXPORTED_KERNELS */

From 6507b59c4f181515cf9a4e038d0ee0d0adb4c031 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Mon, 14 Mar 2022 17:15:47 -0400
Subject: [PATCH 65/94] Fix PLApprox

---
 pennylane_lightning/src/tests/TestHelpers.hpp | 15 +++-
 ...est_GateImplementations_CompareKernels.cpp |  4 +-
 .../Test_GateImplementations_Generator.cpp    |  2 +-
 .../Test_GateImplementations_Inverse.cpp      |  2 +-
 .../tests/Test_GateImplementations_Matrix.cpp | 32 ++++----
 .../Test_GateImplementations_Nonparam.cpp     |  8 +-
 .../tests/Test_GateImplementations_Param.cpp  | 78 +++++++++----------
 .../src/tests/Test_Internal.cpp               | 10 +--
 pennylane_lightning/src/tests/Test_Util.cpp   | 24 +++---
 9 files changed, 93 insertions(+), 82 deletions(-)

diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp
index 6831a97b76..3b2fca4eba 100644
--- a/pennylane_lightning/src/tests/TestHelpers.hpp
+++ b/pennylane_lightning/src/tests/TestHelpers.hpp
@@ -1,5 +1,6 @@
 #include <algorithm>
 #include <complex>
+#include <memory>
 #include <random>
 #include <string>
 #include <type_traits>
@@ -27,7 +28,7 @@ template <typename T> struct is_complex<std::complex<T>> : std::true_type {};
 
 template <typename T> constexpr bool is_complex_v = is_complex<T>::value;
 
-template <class T, class Alloc> struct PLApprox {
+template <class T, class Alloc = std::allocator<T>> struct PLApprox {
     const std::vector<T, Alloc> &comp_;
 
     explicit PLApprox(const std::vector<T, Alloc> &comp) : comp_{comp} {}
@@ -78,6 +79,16 @@ template <class T, class Alloc> struct PLApprox {
         return *this;
     }
 };
+
+/**
+ * @brief Simple helper for PLApprox for the cases when class template
+ * argument deduction (CTAD) does not work well.
+ */
+template <typename T, class Alloc>
+PLApprox<T, Alloc> approx(const std::vector<T, Alloc> &vec) {
+    return PLApprox<T, Alloc>(vec);
+}
+
 template <typename T, class Alloc>
 std::ostream &operator<<(std::ostream &os, const PLApprox<T, Alloc> &approx) {
     os << approx.describe();
@@ -110,7 +121,7 @@ isApproxEqual(const std::vector<Data_t, AllocA> &data1,
               const typename Data_t::value_type eps =
                   std::numeric_limits<typename Data_t::value_type>::epsilon() *
                   100) {
-    return data1 == PLApprox(data2).epsilon(eps);
+    return data1 == PLApprox<Data_t, AllocB>(data2).epsilon(eps);
 }
 
 /**
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
index c66f07e522..98bd3c0870 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
@@ -131,7 +131,7 @@ void testApplyGate(RandomEngine &re, size_t num_qubits) {
 
             for (size_t i = 0; i < results.size() - 1; i++) {
                 REQUIRE(results[i] ==
-                        PLApprox(results[i + 1])
+                        approx(results[i + 1])
                             .margin(static_cast<PrecisionT>(1e-5)));
             }
         }
@@ -147,7 +147,7 @@ void testApplyGate(RandomEngine &re, size_t num_qubits) {
 
             for (size_t i = 0; i < results.size() - 1; i++) {
                 REQUIRE(results[i] ==
-                        PLApprox(results[i + 1])
+                        approx(results[i + 1])
                             .margin(static_cast<PrecisionT>(1e-5)));
             }
         }
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
index fd045742e9..d2ecd00a4a 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
@@ -114,7 +114,7 @@ void testGeneratorForGate(RandomEngine &re, size_t num_qubits) {
 
         scaleVector(gate_der_st, static_cast<PrecisionT>(0.5) / eps);
 
-        REQUIRE(gntr_st == PLApprox(gate_der_st).margin(1e-3));
+        REQUIRE(gntr_st == approx(gate_der_st).margin(1e-3));
     }
 }
 template <typename PrecisionT, typename ParamT, class GateImplementation,
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
index fb172dafdb..6e33d781a5 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
@@ -44,7 +44,7 @@ void testInverseKernelGate(RandomEngine &re, size_t num_qubits) {
             callGateOps(func_ptr, st.data(), num_qubits, wires, false, params);
             callGateOps(func_ptr, st.data(), num_qubits, wires, true, params);
 
-            REQUIRE(st == PLApprox(ini_st).margin(1e-7));
+            REQUIRE(st == approx(ini_st).margin(1e-7));
         }
     } else {
         static_cast<void>(re);
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp
index a49440aa20..90ede20986 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Matrix.cpp
@@ -83,7 +83,7 @@ void testApplyMatrix() {
         auto st = ini_st;
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, false);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -139,7 +139,7 @@ void testApplyMatrix() {
         auto st = ini_st;
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, false);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -195,7 +195,7 @@ void testApplyMatrix() {
         auto st = ini_st;
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, false);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -263,7 +263,7 @@ void testApplyMatrix() {
         auto st = ini_st;
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, false);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -331,7 +331,7 @@ void testApplyMatrix() {
         auto st = ini_st;
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, false);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -448,7 +448,7 @@ void testApplyMatrix() {
         auto st = ini_st;
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, false);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -757,7 +757,7 @@ void testApplyMatrix() {
         auto st = ini_st;
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, false);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 }
 
@@ -803,7 +803,7 @@ void testApplyMatrixInverse() {
                                         wires, false);
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, true);
-        REQUIRE(st == PLApprox(ini_st).margin(1e-5));
+        REQUIRE(st == approx(ini_st).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -821,7 +821,7 @@ void testApplyMatrixInverse() {
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, true);
 
-        REQUIRE(st == PLApprox(ini_st).margin(1e-5));
+        REQUIRE(st == approx(ini_st).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -839,7 +839,7 @@ void testApplyMatrixInverse() {
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, true);
 
-        REQUIRE(st == PLApprox(ini_st).margin(1e-5));
+        REQUIRE(st == approx(ini_st).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -857,7 +857,7 @@ void testApplyMatrixInverse() {
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, true);
 
-        REQUIRE(st == PLApprox(ini_st).margin(1e-5));
+        REQUIRE(st == approx(ini_st).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -875,7 +875,7 @@ void testApplyMatrixInverse() {
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, true);
 
-        REQUIRE(st == PLApprox(ini_st).margin(1e-5));
+        REQUIRE(st == approx(ini_st).margin(1e-5));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", wires = {1,2} - "
@@ -891,7 +891,7 @@ void testApplyMatrixInverse() {
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, true);
 
-        REQUIRE(st == PLApprox(ini_st).margin(1e-5));
+        REQUIRE(st == approx(ini_st).margin(1e-5));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", wires = {1,3} - "
@@ -907,7 +907,7 @@ void testApplyMatrixInverse() {
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, true);
 
-        REQUIRE(st == PLApprox(ini_st).margin(1e-5));
+        REQUIRE(st == approx(ini_st).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -924,7 +924,7 @@ void testApplyMatrixInverse() {
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, true);
 
-        REQUIRE(st == PLApprox(ini_st).margin(1e-5));
+        REQUIRE(st == approx(ini_st).margin(1e-5));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", wires = {0,1,2,3} - "
@@ -940,7 +940,7 @@ void testApplyMatrixInverse() {
         GateImplementation::applyMatrix(st.data(), num_qubits, matrix.data(),
                                         wires, true);
 
-        REQUIRE(st == PLApprox(ini_st).margin(1e-5));
+        REQUIRE(st == approx(ini_st).margin(1e-5));
     }
 }
 
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
index 49eb353529..c5b3e01227 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Nonparam.cpp
@@ -99,7 +99,7 @@ void testApplyPauliY() {
 
         GateImplementation::applyPauliY(st.data(), num_qubits, {index}, false);
 
-        CHECK(st == PLApprox(expected_results[index]));
+        CHECK(st == approx(expected_results[index]));
     }
 }
 PENNYLANE_RUN_TEST(PauliY);
@@ -122,7 +122,7 @@ void testApplyPauliZ() {
         auto st = createPlusState<PrecisionT>(num_qubits);
         GateImplementation::applyPauliZ(st.data(), num_qubits, {index}, false);
 
-        CHECK(st == PLApprox(expected_results[index]));
+        CHECK(st == approx(expected_results[index]));
     }
 }
 PENNYLANE_RUN_TEST(PauliZ);
@@ -168,7 +168,7 @@ template <typename PrecisionT, class GateImplementation> void testApplyS() {
 
         GateImplementation::applyS(st.data(), num_qubits, {index}, false);
 
-        CHECK(st == PLApprox(expected_results[index]));
+        CHECK(st == approx(expected_results[index]));
     }
 }
 PENNYLANE_RUN_TEST(S);
@@ -191,7 +191,7 @@ template <typename PrecisionT, class GateImplementation> void testApplyT() {
 
         GateImplementation::applyT(st.data(), num_qubits, {index}, false);
 
-        CHECK(st == PLApprox(expected_results[index]));
+        CHECK(st == approx(expected_results[index]));
     }
 }
 PENNYLANE_RUN_TEST(T);
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
index 894038c514..33ec8656a7 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Param.cpp
@@ -107,7 +107,7 @@ void testApplyPhaseShift() {
         GateImplementation::applyPhaseShift(st.data(), num_qubits, {index},
                                             false, {angles[index]});
 
-        CHECK(st == PLApprox(expected_results[index]));
+        CHECK(st == approx(expected_results[index]));
     }
 }
 PENNYLANE_RUN_TEST(PhaseShift);
@@ -132,7 +132,7 @@ void testApplyRX() {
         GateImplementation::applyRX(st.data(), num_qubits, {0}, false,
                                     {angles[index]});
 
-        CHECK(st == PLApprox(expected_results[index]).epsilon(1e-7));
+        CHECK(st == approx(expected_results[index]).epsilon(1e-7));
     }
 }
 PENNYLANE_RUN_TEST(RX);
@@ -172,7 +172,7 @@ void testApplyRY() {
             auto st = init_state;
             GateImplementation::applyRY(st.data(), num_qubits, {0}, false,
                                         {angles[index]});
-            CHECK(st == PLApprox(expected_results[index]).epsilon(1e-5));
+            CHECK(st == approx(expected_results[index]).epsilon(1e-5));
         }
     }
 }
@@ -220,7 +220,7 @@ void testApplyRZ() {
         GateImplementation::applyRZ(st.data(), num_qubits, {index}, false,
                                     {angles[index]});
 
-        CHECK(st == PLApprox(expected_results[index]));
+        CHECK(st == approx(expected_results[index]));
     }
 }
 PENNYLANE_RUN_TEST(RZ);
@@ -254,7 +254,7 @@ void testApplyRot() {
                                      angles[index][0], angles[index][1],
                                      angles[index][2]);
 
-        CHECK(st == PLApprox(expected_results[index]));
+        CHECK(st == approx(expected_results[index]));
     }
 }
 PENNYLANE_RUN_TEST(Rot);
@@ -289,7 +289,7 @@ void testApplyIsingXX() {
         auto st = ini_st;
         GateImplementation::applyIsingXX(st.data(), num_qubits, {0, 1}, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected_results).margin(1e-7));
+        REQUIRE(st == approx(expected_results).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", IsingXX0,1 |100> -> a|100> + b|010> - "
@@ -312,7 +312,7 @@ void testApplyIsingXX() {
         auto st = ini_st;
         GateImplementation::applyIsingXX(st.data(), num_qubits, {0, 1}, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected_results).margin(1e-7));
+        REQUIRE(st == approx(expected_results).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", IsingXX0,1 |010> -> a|010> + b|100> - "
@@ -335,7 +335,7 @@ void testApplyIsingXX() {
         auto st = ini_st;
         GateImplementation::applyIsingXX(st.data(), num_qubits, {0, 1}, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected_results).margin(1e-7));
+        REQUIRE(st == approx(expected_results).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", IsingXX0,1 |110> -> a|110> + b|000> - "
@@ -358,7 +358,7 @@ void testApplyIsingXX() {
         auto st = ini_st;
         GateImplementation::applyIsingXX(st.data(), num_qubits, {0, 1}, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected_results).margin(1e-7));
+        REQUIRE(st == approx(expected_results).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", IsingXX0,2 - "
@@ -390,7 +390,7 @@ void testApplyIsingXX() {
         auto st = ini_st;
         GateImplementation::applyIsingXX(st.data(), num_qubits, wires, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 }
 PENNYLANE_RUN_TEST(IsingXX);
@@ -422,7 +422,7 @@ void testApplyIsingYY() {
         auto st = ini_st;
         GateImplementation::applyIsingYY(st.data(), num_qubits, {0, 1}, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected_results).margin(1e-7));
+        REQUIRE(st == approx(expected_results).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", IsingYY0,1 |100> -> a|100> + b|010> - "
@@ -445,7 +445,7 @@ void testApplyIsingYY() {
         auto st = ini_st;
         GateImplementation::applyIsingYY(st.data(), num_qubits, {0, 1}, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected_results).margin(1e-7));
+        REQUIRE(st == approx(expected_results).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", IsingYY0,1 |010> -> a|010> + b|100> - "
@@ -468,7 +468,7 @@ void testApplyIsingYY() {
         auto st = ini_st;
         GateImplementation::applyIsingYY(st.data(), num_qubits, {0, 1}, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected_results).margin(1e-7));
+        REQUIRE(st == approx(expected_results).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", IsingYY0,1 |110> -> a|110> + b|000> - "
@@ -491,7 +491,7 @@ void testApplyIsingYY() {
         auto st = ini_st;
         GateImplementation::applyIsingYY(st.data(), num_qubits, {0, 1}, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected_results).margin(1e-7));
+        REQUIRE(st == approx(expected_results).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", IsingYY0,1 - "
@@ -542,7 +542,7 @@ void testApplyIsingYY() {
         auto st = ini_st;
         GateImplementation::applyIsingYY(st.data(), num_qubits, wires, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 }
 PENNYLANE_RUN_TEST(IsingYY);
@@ -574,7 +574,7 @@ void testApplyIsingZZ() {
         auto st = ini_st;
         GateImplementation::applyIsingZZ(st.data(), num_qubits, {0, 1}, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected_results).margin(1e-7));
+        REQUIRE(st == approx(expected_results).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", IsingZZ0,1 |100> -> |100> - "
@@ -597,7 +597,7 @@ void testApplyIsingZZ() {
         auto st = ini_st;
         GateImplementation::applyIsingZZ(st.data(), num_qubits, {0, 1}, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected_results).margin(1e-7));
+        REQUIRE(st == approx(expected_results).margin(1e-7));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -621,7 +621,7 @@ void testApplyIsingZZ() {
         auto st = ini_st;
         GateImplementation::applyIsingZZ(st.data(), num_qubits, {0, 1}, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected_results).margin(1e-7));
+        REQUIRE(st == approx(expected_results).margin(1e-7));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -645,7 +645,7 @@ void testApplyIsingZZ() {
         auto st = ini_st;
         GateImplementation::applyIsingZZ(st.data(), num_qubits, {0, 1}, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected_results).margin(1e-7));
+        REQUIRE(st == approx(expected_results).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", IsingZZ0,1 - "
@@ -696,7 +696,7 @@ void testApplyIsingZZ() {
         auto st = ini_st;
         GateImplementation::applyIsingZZ(st.data(), num_qubits, wires, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 }
 PENNYLANE_RUN_TEST(IsingZZ);
@@ -734,7 +734,7 @@ void testApplyControlledPhaseShift() {
     GateImplementation::applyControlledPhaseShift(st.data(), num_qubits, {0, 1},
                                                   false, angles[0]);
     CAPTURE(st);
-    CHECK(st == PLApprox(expected_results[0]));
+    CHECK(st == approx(expected_results[0]));
 }
 PENNYLANE_RUN_TEST(ControlledPhaseShift);
 
@@ -789,7 +789,7 @@ void testApplyCRX() {
         auto st = ini_st;
         GateImplementation::applyCRX(st.data(), num_qubits, wires, false,
                                      angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", CRX0,2 - " << PrecisionToName<PrecisionT>::value) {
@@ -839,7 +839,7 @@ void testApplyCRX() {
         auto st = ini_st;
         GateImplementation::applyCRX(st.data(), num_qubits, wires, false,
                                      angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", CRX1,3 - " << PrecisionToName<PrecisionT>::value) {
@@ -889,7 +889,7 @@ void testApplyCRX() {
         auto st = ini_st;
         GateImplementation::applyCRX(st.data(), num_qubits, wires, false,
                                      angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 }
 PENNYLANE_RUN_TEST(CRX);
@@ -946,7 +946,7 @@ void testApplyCRY() {
         auto st = ini_st;
         GateImplementation::applyCRY(st.data(), num_qubits, wires, false,
                                      angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -997,7 +997,7 @@ void testApplyCRY() {
         auto st = ini_st;
         GateImplementation::applyCRY(st.data(), num_qubits, wires, false,
                                      angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -1048,7 +1048,7 @@ void testApplyCRY() {
         auto st = ini_st;
         GateImplementation::applyCRY(st.data(), num_qubits, wires, false,
                                      angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 }
 
@@ -1106,7 +1106,7 @@ void testApplyCRZ() {
         auto st = ini_st;
         GateImplementation::applyCRZ(st.data(), num_qubits, wires, false,
                                      angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -1157,7 +1157,7 @@ void testApplyCRZ() {
         auto st = ini_st;
         GateImplementation::applyCRZ(st.data(), num_qubits, wires, false,
                                      angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -1208,7 +1208,7 @@ void testApplyCRZ() {
         auto st = ini_st;
         GateImplementation::applyCRZ(st.data(), num_qubits, wires, false,
                                      angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 }
 PENNYLANE_RUN_TEST(CRZ);
@@ -1235,7 +1235,7 @@ void testApplyCRot() {
         GateImplementation::applyCRot(st.data(), num_qubits, {0, 1}, false,
                                       angles[0], angles[1], angles[2]);
 
-        CHECK(st == PLApprox(ini_st));
+        CHECK(st == approx(ini_st));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", CRot0,1 |100> -> |1>(a|0>+b|1>)|0> - "
@@ -1246,7 +1246,7 @@ void testApplyCRot() {
         GateImplementation::applyCRot(st.data(), num_qubits, {0, 1}, false,
                                       angles[0], angles[1], angles[2]);
 
-        CHECK(st == PLApprox(expected_results));
+        CHECK(st == approx(expected_results));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -1299,7 +1299,7 @@ void testApplyCRot() {
         auto st = ini_st;
         GateImplementation::applyCRot(st.data(), num_qubits, wires, false, phi,
                                       theta, omega);
-        REQUIRE(st == PLApprox(expected).margin(1e-5));
+        REQUIRE(st == approx(expected).margin(1e-5));
     }
 }
 PENNYLANE_RUN_TEST(CRot);
@@ -1332,7 +1332,7 @@ void testApplyMultiRZ() {
         GateImplementation::applyMultiRZ(st.data(), num_qubits, {0}, false,
                                          angle);
 
-        REQUIRE(st == PLApprox(expected).margin(1e-7));
+        REQUIRE(st == approx(expected).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", MultiRZ0 |++++> - "
@@ -1355,7 +1355,7 @@ void testApplyMultiRZ() {
         GateImplementation::applyMultiRZ(st.data(), num_qubits, {0}, false,
                                          angle);
 
-        REQUIRE(st == PLApprox(expected).margin(1e-7));
+        REQUIRE(st == approx(expected).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", MultiRZ01 |++++> - "
@@ -1378,7 +1378,7 @@ void testApplyMultiRZ() {
         GateImplementation::applyMultiRZ(st.data(), num_qubits, {0, 1}, false,
                                          angle);
 
-        REQUIRE(st == PLApprox(expected).margin(1e-7));
+        REQUIRE(st == approx(expected).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", MultiRZ012 |++++> - "
@@ -1401,7 +1401,7 @@ void testApplyMultiRZ() {
         GateImplementation::applyMultiRZ(st.data(), num_qubits, {0, 1, 2},
                                          false, angle);
 
-        REQUIRE(st == PLApprox(expected).margin(1e-7));
+        REQUIRE(st == approx(expected).margin(1e-7));
     }
     DYNAMIC_SECTION(GateImplementation::name
                     << ", MultiRZ0123 |++++> - "
@@ -1424,7 +1424,7 @@ void testApplyMultiRZ() {
         GateImplementation::applyMultiRZ(st.data(), num_qubits, {0, 1, 2, 3},
                                          false, angle);
 
-        REQUIRE(st == PLApprox(expected).margin(1e-7));
+        REQUIRE(st == approx(expected).margin(1e-7));
     }
 
     DYNAMIC_SECTION(GateImplementation::name
@@ -1474,7 +1474,7 @@ void testApplyMultiRZ() {
 
         GateImplementation::applyMultiRZ(st.data(), num_qubits, wires, false,
                                          angle);
-        REQUIRE(st == PLApprox(expected).margin(1e-7));
+        REQUIRE(st == approx(expected).margin(1e-7));
     }
 }
 PENNYLANE_RUN_TEST(MultiRZ);
diff --git a/pennylane_lightning/src/tests/Test_Internal.cpp b/pennylane_lightning/src/tests/Test_Internal.cpp
index 33b5fef81a..d5fadfe14b 100644
--- a/pennylane_lightning/src/tests/Test_Internal.cpp
+++ b/pennylane_lightning/src/tests/Test_Internal.cpp
@@ -28,7 +28,7 @@ TEMPLATE_TEST_CASE("Approx", "[Test_Internal]", float, double) {
             ComplexPrecisionT{1.0001, 0.0},
             ComplexPrecisionT{0.0, 0.9999},
         };
-        REQUIRE(test1 == PLApprox(test2).margin(0.00015));
+        REQUIRE(test1 == approx(test2).margin(0.00015));
     }
     SECTION("vector{1.0, 1.0*I} does not approx vector{1.0002, 0.9998*I} with "
             "margin 0.00015") {
@@ -40,7 +40,7 @@ TEMPLATE_TEST_CASE("Approx", "[Test_Internal]", float, double) {
             ComplexPrecisionT{1.0002, 0.0},
             ComplexPrecisionT{0.0, 0.9998},
         };
-        REQUIRE(test1 != PLApprox(test2).margin(0.00015));
+        REQUIRE(test1 != approx(test2).margin(0.00015));
     }
     SECTION("vector{1.0, 1.0*I} does not approx vector{1.0I, 1.0} with margin "
             "0.00015") {
@@ -52,7 +52,7 @@ TEMPLATE_TEST_CASE("Approx", "[Test_Internal]", float, double) {
             ComplexPrecisionT{0.0, 1.0},
             ComplexPrecisionT{1.0, 0.0},
         };
-        REQUIRE(test1 != PLApprox(test2).margin(0.00015));
+        REQUIRE(test1 != approx(test2).margin(0.00015));
     }
 }
 
@@ -68,7 +68,7 @@ TEMPLATE_TEST_CASE("createProductState", "[Test_Internal]", float, double) {
         GateImplementationsPI::applyPauliX(expected.data(), 3, {1}, false);
         GateImplementationsPI::applyHadamard(expected.data(), 3, {1}, false);
 
-        REQUIRE(st == PLApprox(expected).margin(1e-7));
+        REQUIRE(st == approx(expected).margin(1e-7));
     }
     SECTION("createProductState(\"+-0\") == |+-1> ") {
         const auto st = createProductState<PrecisionT>("+-0");
@@ -81,7 +81,7 @@ TEMPLATE_TEST_CASE("createProductState", "[Test_Internal]", float, double) {
 
         GateImplementationsPI::applyPauliX(expected.data(), 3, {2}, false);
 
-        REQUIRE(st != PLApprox(expected).margin(1e-7));
+        REQUIRE(st != approx(expected).margin(1e-7));
     }
 }
 
diff --git a/pennylane_lightning/src/tests/Test_Util.cpp b/pennylane_lightning/src/tests/Test_Util.cpp
index 7986ee9ef5..a286d7bba6 100644
--- a/pennylane_lightning/src/tests/Test_Util.cpp
+++ b/pennylane_lightning/src/tests/Test_Util.cpp
@@ -159,7 +159,7 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
                 CAPTURE(v_out);
                 CAPTURE(v_expected);
 
-                CHECK(v_out == PLApprox(v_expected).margin(1e-7));
+                CHECK(v_out == approx(v_expected).margin(1e-7));
             }
         }
         SECTION("Random Complex") {
@@ -185,7 +185,7 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
                 Util::matrixVecProd(mat, v_in, 4, 4);
             CAPTURE(v_out);
 
-            CHECK(v_out == PLApprox(v_expected).margin(1e-7));
+            CHECK(v_out == approx(v_expected).margin(1e-7));
         }
         SECTION("Invalid Arguments") {
             using namespace Catch::Matchers;
@@ -215,7 +215,7 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
                 CAPTURE(v_out);
                 CAPTURE(v_expected);
 
-                CHECK(v_out == PLApprox(v_expected).margin(1e-7));
+                CHECK(v_out == approx(v_expected).margin(1e-7));
             }
         }
         SECTION("Zero Vector") {
@@ -229,7 +229,7 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
                 CAPTURE(v_out);
                 CAPTURE(v_expected);
 
-                CHECK(v_out == PLApprox(v_expected).margin(1e-7));
+                CHECK(v_out == approx(v_expected).margin(1e-7));
             }
         }
         SECTION("Random Matrix") {
@@ -242,7 +242,7 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
             CAPTURE(v_out);
             CAPTURE(v_expected);
 
-            CHECK(v_out == PLApprox(v_expected).margin(1e-7));
+            CHECK(v_out == approx(v_expected).margin(1e-7));
         }
     }
     SECTION("Transpose") {
@@ -258,7 +258,7 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
                 CAPTURE(mat_t);
                 CAPTURE(mat);
 
-                CHECK(mat_t == PLApprox(mat).margin(1e-7));
+                CHECK(mat_t == approx(mat).margin(1e-7));
             }
         }
         SECTION("Random Complex") {
@@ -286,7 +286,7 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
             CAPTURE(mat_t);
             CAPTURE(mat_t_exp);
 
-            CHECK(mat_t == PLApprox(mat_t_exp));
+            CHECK(mat_t == approx(mat_t_exp));
         }
         SECTION("Invalid Arguments") {
             using namespace Catch::Matchers;
@@ -311,7 +311,7 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
                 CAPTURE(m_out);
                 CAPTURE(m_out_exp);
 
-                CHECK(m_out == PLApprox(m_out_exp));
+                CHECK(m_out == approx(m_out_exp));
             }
         }
         SECTION("Random Complex") {
@@ -368,8 +368,8 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
             CAPTURE(m_out_2);
             CAPTURE(m_out_exp);
 
-            CHECK(m_out_1 == PLApprox(m_out_2));
-            CHECK(m_out_1 == PLApprox(m_out_exp));
+            CHECK(m_out_1 == approx(m_out_2));
+            CHECK(m_out_1 == approx(m_out_exp));
         }
         SECTION("Random complex non-square") {
             const size_t m = 4;
@@ -442,7 +442,7 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
 
             const auto m_out = Util::matrixMatProd(mat1, mat2, m, n, k);
 
-            CHECK(m_out == PLApprox(expected));
+            CHECK(m_out == approx(expected));
         }
         SECTION("Invalid Arguments") {
             using namespace Catch::Matchers;
@@ -623,6 +623,6 @@ TEMPLATE_TEST_CASE("randomUnitary", "[Test_Internal]", float, double) {
             identity[i * dim + i] = std::complex<PrecisionT>{1.0, 0.0};
         }
 
-        REQUIRE(mat == PLApprox(identity).margin(1e-5));
+        REQUIRE(mat == approx(identity).margin(1e-5));
     }
 }

From 1c450d2c4fd8f67f9dea5fac54205f3b57a0c83a Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@xanadu.ai>
Date: Mon, 14 Mar 2022 17:16:13 -0400
Subject: [PATCH 66/94] Update pennylane_lightning/src/tests/CreateAllWires.cpp

Co-authored-by: Ali Asadi <ali@xanadu.ai>
---
 pennylane_lightning/src/tests/CreateAllWires.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pennylane_lightning/src/tests/CreateAllWires.cpp b/pennylane_lightning/src/tests/CreateAllWires.cpp
index dd0194a625..65b3a10ffa 100644
--- a/pennylane_lightning/src/tests/CreateAllWires.cpp
+++ b/pennylane_lightning/src/tests/CreateAllWires.cpp
@@ -6,7 +6,6 @@ auto crateAllWires(size_t n_qubits, Gates::GateOperation gate_op, bool order)
         // make all possible 2^N permutations
         std::vector<std::vector<size_t>> res;
         res.reserve((1U << n_qubits) - 1);
-        ;
         for (size_t k = 1; k < (static_cast<size_t>(1U) << n_qubits); k++) {
             std::vector<size_t> wires;
             wires.reserve(Util::popcount(k));

From 131c626a77ca38c0076f70805a3480e17ca18649 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Mon, 14 Mar 2022 17:59:27 -0400
Subject: [PATCH 67/94] Fix createAllWires; Add NVCC/NVHPC compiler info

---
 .../src/tests/CreateAllWires.cpp              |  2 +-
 .../src/tests/CreateAllWires.hpp              |  2 +-
 ...est_GateImplementations_CompareKernels.cpp |  2 +-
 pennylane_lightning/src/util/Macros.hpp       | 76 ++++++++++++++++++-
 pennylane_lightning/src/util/RuntimeInfo.cpp  |  3 +-
 5 files changed, 77 insertions(+), 8 deletions(-)

diff --git a/pennylane_lightning/src/tests/CreateAllWires.cpp b/pennylane_lightning/src/tests/CreateAllWires.cpp
index 65b3a10ffa..6bea13f39a 100644
--- a/pennylane_lightning/src/tests/CreateAllWires.cpp
+++ b/pennylane_lightning/src/tests/CreateAllWires.cpp
@@ -1,6 +1,6 @@
 #include "CreateAllWires.hpp"
 namespace Pennylane {
-auto crateAllWires(size_t n_qubits, Gates::GateOperation gate_op, bool order)
+auto createAllWires(size_t n_qubits, Gates::GateOperation gate_op, bool order)
     -> std::vector<std::vector<size_t>> {
     if (Util::array_has_elt(Gates::Constant::multi_qubit_gates, gate_op)) {
         // make all possible 2^N permutations
diff --git a/pennylane_lightning/src/tests/CreateAllWires.hpp b/pennylane_lightning/src/tests/CreateAllWires.hpp
index 54d3cd9e9a..f462b4ae20 100644
--- a/pennylane_lightning/src/tests/CreateAllWires.hpp
+++ b/pennylane_lightning/src/tests/CreateAllWires.hpp
@@ -87,6 +87,6 @@ class PermutationGenerator : public WiresGenerator {
  * @param gate_op Gate operation
  * @param order Whether the ordering matters (if true, permutation is used)
  */
-auto crateAllWires(size_t n_qubits, Gates::GateOperation gate_op, bool order)
+auto createAllWires(size_t n_qubits, Gates::GateOperation gate_op, bool order)
     -> std::vector<std::vector<size_t>>;
 } // namespace Pennylane
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
index 98bd3c0870..0c87e07154 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
@@ -115,7 +115,7 @@ void testApplyGate(RandomEngine &re, size_t num_qubits) {
     INFO("PrecisionT, ParamT = " << PrecisionToName<PrecisionT>::value << ", "
                                  << PrecisionToName<ParamT>::value);
 
-    const auto all_wires = crateAllWires(num_qubits, gate_op, true);
+    const auto all_wires = createAllWires(num_qubits, gate_op, true);
     for (const auto &wires : all_wires) {
         const auto params = createParams<ParamT>(gate_op);
         const auto gate_name = lookup(gate_names, gate_op);
diff --git a/pennylane_lightning/src/util/Macros.hpp b/pennylane_lightning/src/util/Macros.hpp
index a8cb8c1d7d..43d44daaab 100644
--- a/pennylane_lightning/src/util/Macros.hpp
+++ b/pennylane_lightning/src/util/Macros.hpp
@@ -70,7 +70,7 @@
 #endif
 
 #if (_OPENMP >= 202011)
-#define PL_UNROLL_LOOP __Pragma("omp unroll(8)")
+#define PL_UNROLL_LOOP _Pragma("omp unroll(8)")
 #elif defined(__GNUC__)
 #define PL_UNROLL_LOOP _Pragma("GCC unroll 8")
 #elif defined(__clang__)
@@ -135,27 +135,97 @@ constexpr auto getCPUArchMSVC() {
 [[maybe_unused]] constexpr static auto cpu_arch = CPUArch::Unknown;
 #endif
 
-enum class Compiler { GCC, Clang, MSVC, Unknown };
+enum class Compiler { GCC, Clang, MSVC, NVCC, NVHPC, Unknown };
 
+/**
+ * @brief When none of the specialized functions is called.
+ */
 template <Compiler compiler>
 constexpr auto getCompilerVersion() -> std::string_view {
     return "Unknown version";
 }
+/**
+ * @brief Create version string for GCC.
+ *
+ * This function raises an error when instantiated (invoked) if a compiler
+ * does not define macros (i.e. other than GCC compatible compilers).
+ */
 template <>
 constexpr auto getCompilerVersion<Compiler::GCC>() -> std::string_view {
     return PL_TO_STR(__GNUC__) "." PL_TO_STR(__GNUC_MINOR__) "." PL_TO_STR(
         __GNUC_PATCHLEVEL__);
 }
+
+/**
+ * @brief Create version string for Clang.
+ *
+ * This function raises an error when instantiated (invoked) if a compiler
+ * does not define macros (i.e. other than Clang).
+ */
 template <>
 constexpr auto getCompilerVersion<Compiler::Clang>() -> std::string_view {
     return PL_TO_STR(__clang_major__) "." PL_TO_STR(
         __clang_minor__) "." PL_TO_STR(__clang_patchlevel__);
 }
+
+/**
+ * @brief Create version string for MSVC.
+ *
+ * This function raises an error when instantiated (invoked) if a compiler
+ * does not define macros (i.e. other than MSVC).
+ */
 template <>
 constexpr auto getCompilerVersion<Compiler::MSVC>() -> std::string_view {
     return PL_TO_STR(_MSC_FULL_VER);
 }
-#if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+
+/**
+ * @brief Create version string for NVCC.
+ *
+ * This function raises an error when instantiated (invoked) if a compiler
+ * does not define macros (i.e. other than NVCC).
+ */
+template <>
+constexpr auto getCompilerVersion<Compiler::NVCC>() -> std::string_view {
+    return PL_TO_STR(__CUDACC_VER_MAJOR__) "." PL_TO_STR(
+        __CUDACC_VER_MINOR__) "." PL_TO_STR(__CUDACC_VER_BUILD__);
+}
+
+/**
+ * @brief Create version string for NVCC.
+ *
+ * This function raises an error when instantiated (invoked) if a compiler
+ * does not define macros (i.e. other than NVCC).
+ *
+ * See
+ * https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-identification-macro
+ * for related information
+ */
+template <>
+constexpr auto getCompilerVersion<Compiler::NVCC>() -> std::string_view {
+    return PL_TO_STR(__CUDACC_VER_MAJOR__) "." PL_TO_STR(
+        __CUDACC_VER_MINOR__) "." PL_TO_STR(__CUDACC_VER_BUILD__);
+}
+
+/**
+ * @brief Create version string for NVHPC (C/C++ compilers without CUDA from
+ * NVIDIA).
+ *
+ * This function raises an error when instantiated (invoked) if a compiler
+ * does not define macros (i.e. other than NVHPC).
+ */
+template <>
+constexpr auto getCompilerVersion<Compiler::NVHPC>() -> std::string_view {
+    return PL_TO_STR(__NVCOMPILER_MAJOR__) "." PL_TO_STR(
+        __NVCOMPILER_MINOR__) "." PL_TO_STR(__NVCOMPILER_PATCHLEVEL__);
+}
+
+#if defined(__NVCC__)
+[[maybe_unused]] constexpr static auto compiler = Compiler::NVCC;
+#elif defined(__NVCOMPILER)
+[[maybe_unused]] constexpr static auto compiler = Compiler::NVHPC;
+#elif defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+// All GCC compatible compilers define __GNUC__.
 [[maybe_unused]] constexpr static auto compiler = Compiler::GCC;
 #elif defined(__clang__)
 [[maybe_unused]] constexpr static auto compiler = Compiler::Clang;
diff --git a/pennylane_lightning/src/util/RuntimeInfo.cpp b/pennylane_lightning/src/util/RuntimeInfo.cpp
index 5a208cb540..6d89794615 100644
--- a/pennylane_lightning/src/util/RuntimeInfo.cpp
+++ b/pennylane_lightning/src/util/RuntimeInfo.cpp
@@ -13,11 +13,10 @@
 // limitations under the License.
 #include "RuntimeInfo.hpp"
 
-#include <array>
-
 #if defined(__GNUC__) || defined(__clang__)
 #include <cpuid.h>
 #elif defined(_MSC_VER)
+#include <array>
 #include <intrin.h>
 #endif
 

From 02b5e332600f713e1772421217e1ce8c9138ada1 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Mon, 14 Mar 2022 18:06:41 -0400
Subject: [PATCH 68/94] Fix

---
 pennylane_lightning/src/util/Macros.hpp | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/pennylane_lightning/src/util/Macros.hpp b/pennylane_lightning/src/util/Macros.hpp
index 43d44daaab..c4c20d07bb 100644
--- a/pennylane_lightning/src/util/Macros.hpp
+++ b/pennylane_lightning/src/util/Macros.hpp
@@ -179,18 +179,6 @@ constexpr auto getCompilerVersion<Compiler::MSVC>() -> std::string_view {
     return PL_TO_STR(_MSC_FULL_VER);
 }
 
-/**
- * @brief Create version string for NVCC.
- *
- * This function raises an error when instantiated (invoked) if a compiler
- * does not define macros (i.e. other than NVCC).
- */
-template <>
-constexpr auto getCompilerVersion<Compiler::NVCC>() -> std::string_view {
-    return PL_TO_STR(__CUDACC_VER_MAJOR__) "." PL_TO_STR(
-        __CUDACC_VER_MINOR__) "." PL_TO_STR(__CUDACC_VER_BUILD__);
-}
-
 /**
  * @brief Create version string for NVCC.
  *

From 351841cad464bfb8c119280ce9667f61c4b7be8f Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Mon, 14 Mar 2022 19:58:36 -0400
Subject: [PATCH 69/94] Add test for squaredNorm

---
 pennylane_lightning/src/tests/TestHelpers.hpp | 39 ++++-------------
 pennylane_lightning/src/tests/Test_Util.cpp   | 22 ++++++++++
 .../src/util/LinearAlgebra.hpp                | 42 +++++++++++++++----
 pennylane_lightning/src/util/Util.hpp         | 12 ++++++
 4 files changed, 77 insertions(+), 38 deletions(-)

diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp
index 3b2fca4eba..a5c87328f5 100644
--- a/pennylane_lightning/src/tests/TestHelpers.hpp
+++ b/pennylane_lightning/src/tests/TestHelpers.hpp
@@ -16,25 +16,14 @@
 #include <catch2/catch.hpp>
 
 namespace Pennylane {
-template <typename T> struct remove_complex { using type = T; };
-template <typename T> struct remove_complex<std::complex<T>> {
-    using type = T;
-};
-template <typename T> using remove_complex_t = typename remove_complex<T>::type;
-
-template <typename T> struct is_complex : std::false_type {};
-
-template <typename T> struct is_complex<std::complex<T>> : std::true_type {};
-
-template <typename T> constexpr bool is_complex_v = is_complex<T>::value;
-
 template <class T, class Alloc = std::allocator<T>> struct PLApprox {
     const std::vector<T, Alloc> &comp_;
 
     explicit PLApprox(const std::vector<T, Alloc> &comp) : comp_{comp} {}
 
-    remove_complex_t<T> margin_{};
-    remove_complex_t<T> epsilon_ = std::numeric_limits<float>::epsilon() * 100;
+    Util::remove_complex_t<T> margin_{};
+    Util::remove_complex_t<T> epsilon_ =
+        std::numeric_limits<float>::epsilon() * 100;
 
     template <class AllocA>
     [[nodiscard]] bool compare(const std::vector<T, AllocA> &lhs) const {
@@ -43,7 +32,7 @@ template <class T, class Alloc = std::allocator<T>> struct PLApprox {
         }
 
         for (size_t i = 0; i < lhs.size(); i++) {
-            if constexpr (is_complex_v<T>) {
+            if constexpr (Util::is_complex_v<T>) {
                 if (lhs[i].real() != Approx(comp_[i].real())
                                          .epsilon(epsilon_)
                                          .margin(margin_) ||
@@ -61,6 +50,7 @@ template <class T, class Alloc = std::allocator<T>> struct PLApprox {
         }
         return true;
     }
+
     [[nodiscard]] std::string describe() const {
         std::ostringstream ss;
         ss << "is Approx to {";
@@ -70,11 +60,12 @@ template <class T, class Alloc = std::allocator<T>> struct PLApprox {
         ss << "}" << std::endl;
         return ss.str();
     }
-    PLApprox &epsilon(remove_complex_t<T> eps) {
+
+    PLApprox &epsilon(Util::remove_complex_t<T> eps) {
         epsilon_ = eps;
         return *this;
     }
-    PLApprox &margin(remove_complex_t<T> m) {
+    PLApprox &margin(Util::remove_complex_t<T> m) {
         margin_ = m;
         return *this;
     }
@@ -198,18 +189,6 @@ auto createPlusState(size_t num_qubits)
     return res;
 }
 
-/**
- * @brief Calculate the squared norm of a vector
- */
-template <typename PrecisionT>
-auto squaredNorm(const std::complex<PrecisionT> *data, size_t data_size)
-    -> PrecisionT {
-    return std::transform_reduce(
-        data, data + data_size, PrecisionT{}, std::plus<PrecisionT>(),
-        static_cast<PrecisionT (*)(const std::complex<PrecisionT> &)>(
-            &std::norm<PrecisionT>));
-}
-
 /**
  * @brief create a random state
  */
@@ -223,7 +202,7 @@ auto createRandomState(RandomEngine &re, size_t num_qubits)
     }
 
     scaleVector(res, std::complex<PrecisionT>{1.0, 0.0} /
-                         std::sqrt(squaredNorm(res.data(), res.size())));
+                         std::sqrt(Util::squaredNorm(res.data(), res.size())));
     return res;
 }
 
diff --git a/pennylane_lightning/src/tests/Test_Util.cpp b/pennylane_lightning/src/tests/Test_Util.cpp
index a286d7bba6..0b4d4f7e9d 100644
--- a/pennylane_lightning/src/tests/Test_Util.cpp
+++ b/pennylane_lightning/src/tests/Test_Util.cpp
@@ -568,6 +568,28 @@ TEST_CASE("Utility bit operations", "[Util][BitUtil]") {
             }
         }
     }
+
+    SECTION("SquaredNorm") {
+        { // for float
+            std::vector<float> vec{0.0, 1.0, 3.0, 10.0};
+            CHECK(Util::squaredNorm(vec) == Approx(110.0));
+        }
+
+        { // for double
+            std::vector<double> vec{0.0, 1.0, 3.0, 10.0};
+            CHECK(Util::squaredNorm(vec) == Approx(110.0));
+        }
+
+        { // for complex<float>
+            std::vector<std::complex<float>> vec{{0.0, 1.0}, {3.0, 10.0}};
+            CHECK(Util::squaredNorm(vec) == Approx(110.0));
+        }
+
+        { // for complex<double>
+            std::vector<std::complex<double>> vec{{0.0, 1.0}, {3.0, 10.0}};
+            CHECK(Util::squaredNorm(vec) == Approx(110.0));
+        }
+    }
 }
 
 TEST_CASE("Utility array and tuples", "[Util]") {
diff --git a/pennylane_lightning/src/util/LinearAlgebra.hpp b/pennylane_lightning/src/util/LinearAlgebra.hpp
index 1cf36e7be9..26a57a4e2f 100644
--- a/pennylane_lightning/src/util/LinearAlgebra.hpp
+++ b/pennylane_lightning/src/util/LinearAlgebra.hpp
@@ -750,15 +750,41 @@ inline auto matrixMatProd(const std::vector<std::complex<T>> m_left,
 }
 
 /**
- * @brief Calculate the squared norm of a vector
+ * @brief @rst
+ * Compute the squared norm of a real/complex vector :math:`\sum_k |v_k|^2`
+ * @endrst
+ *
+ * @param data Data pointer
+ * @param data_size Size of the data
+ */
+template <class T>
+auto squaredNorm(const T *data, size_t data_size) -> remove_complex_t<T> {
+    if constexpr (is_complex_v<T>) {
+        // complex type
+        using PrecisionT = remove_complex_t<T>;
+        return std::transform_reduce(
+            data, data + data_size, PrecisionT{}, std::plus<PrecisionT>(),
+            static_cast<PrecisionT (*)(const std::complex<PrecisionT> &)>(
+                &std::norm<PrecisionT>));
+    } else {
+        using PrecisionT = T;
+        return std::transform_reduce(
+            data, data + data_size, PrecisionT{}, std::plus<PrecisionT>(),
+            static_cast<PrecisionT (*)(PrecisionT)>(std::norm));
+    }
+}
+
+/**
+ * @brief @rst
+ * Compute the squared norm of a real/complex vector :math:`\sum_k |v_k|^2`
+ * @endrst
+ *
+ * @param data Data pointer
+ * @param data_size Size of the data
  */
-template <typename PrecisionT>
-auto squaredNorm(const std::complex<PrecisionT> *data, size_t data_size)
-    -> PrecisionT {
-    return std::transform_reduce(
-        data, data + data_size, PrecisionT{}, std::plus<PrecisionT>(),
-        static_cast<PrecisionT (*)(const std::complex<PrecisionT> &)>(
-            &std::norm<PrecisionT>));
+template <class T, class Alloc>
+auto squaredNorm(const std::vector<T, Alloc> &vec) -> remove_complex_t<T> {
+    return squaredNorm(vec.data(), vec.size());
 }
 
 /**
diff --git a/pennylane_lightning/src/util/Util.hpp b/pennylane_lightning/src/util/Util.hpp
index adbe6d9f42..b101dc9e35 100644
--- a/pennylane_lightning/src/util/Util.hpp
+++ b/pennylane_lightning/src/util/Util.hpp
@@ -405,4 +405,16 @@ auto chunkData(const Container<T> &data, std::size_t num_chunks)
 // type alias
 template <class T> using remove_cvref_t = typename remove_cvref<T>::type;
 
+template <typename T> struct remove_complex { using type = T; };
+template <typename T> struct remove_complex<std::complex<T>> {
+    using type = T;
+};
+template <typename T> using remove_complex_t = typename remove_complex<T>::type;
+
+template <typename T> struct is_complex : std::false_type {};
+
+template <typename T> struct is_complex<std::complex<T>> : std::true_type {};
+
+template <typename T> constexpr bool is_complex_v = is_complex<T>::value;
+
 } // namespace Pennylane::Util

From 92f0ee65fffcd152722fabe13a5dd646953de2f5 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Mon, 14 Mar 2022 20:22:30 -0400
Subject: [PATCH 70/94] Add correct arg

---
 bin/utils.py                                  | 10 ++++++----
 pennylane_lightning/src/bindings/Bindings.hpp |  4 ++++
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/bin/utils.py b/bin/utils.py
index 6d9dab9420..6834078976 100644
--- a/bin/utils.py
+++ b/bin/utils.py
@@ -1,13 +1,13 @@
 from pathlib import Path
-import re
-import fnmatch
+from re import compile as re_compile
+from fnmatch import fnmatch
 
 SRCFILE_EXT = ["c", "cc", "cpp", "cxx", "cu"]
 HEADERFILE_EXT = ["h", "hh", "hpp", "hxx", "cuh"]
 
 LIGHTNING_SOURCE_DIR = Path(__file__).resolve().parent.parent
 
-rgx_gitignore_comment = re.compile("#.*$")
+rgx_gitignore_comment = re_compile("#.*$")
 
 def get_cpp_files_from_path(path, ignore_patterns = None, use_gitignore = True, header_only = False):
     """return set of C++ source files from a path
@@ -16,6 +16,7 @@ def get_cpp_files_from_path(path, ignore_patterns = None, use_gitignore = True,
         paths (pathlib.Path or str): a path to process 
         ignore_patterns: patterns to ignore
         use_gitignore: find ignore patterns from .gitignore
+        header_only: find only header files when true
     """
     path = Path(path)
     files_rel = set() # file paths relative to path
@@ -44,7 +45,7 @@ def get_cpp_files_from_path(path, ignore_patterns = None, use_gitignore = True,
     files_to_remove = set()
     for ignore_pattern in ignore_patterns:
         for f in files_rel:
-            if fnmatch.fnmatch(str(f), ignore_pattern):
+            if fnmatch(str(f), ignore_pattern):
                 files_to_remove.add(f)
 
     files_rel -= files_to_remove
@@ -58,6 +59,7 @@ def get_cpp_files(paths, ignore_patterns = None, use_gitignore = True, header_on
         paths (list): list of all paths to process
         ignore_patterns: patterns to ignore
         use_gitignore: find ignore patterns from .gitignore
+        header_only: find only header files when true
     """
     files = set()
     for path in paths:
diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index d247144a8a..3f03d4a0d6 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -258,6 +258,10 @@ auto getCompileInfo() -> pybind11::dict {
             return "Clang";
         case Compiler::MSVC:
             return "MSVC";
+        case Compiler::NVCC:
+            return "NVCC";
+        case Compiler::NVHPC:
+            return "NVHPC";
         default:
             return "Unknown";
         }

From cbe36ace3c949551a90004bf2cd134721dd7556b Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Mon, 14 Mar 2022 20:56:12 -0400
Subject: [PATCH 71/94] Slightly refactor static_lookup

---
 .../src/gates/SelectKernel.hpp                | 32 -------------------
 .../src/simulator/DynamicDispatcher.cpp       | 11 +++----
 .../src/simulator/StateVectorBase.hpp         | 10 +++---
 .../src/tests/Test_DynamicDispatcher.cpp      |  2 +-
 .../Test_GateImplementations_Generator.cpp    |  6 ++--
 .../Test_GateImplementations_Inverse.cpp      |  2 +-
 .../src/tests/Test_OpToMemberFuncPtr.cpp      |  2 +-
 .../src/util/LinearAlgebra.hpp                |  3 +-
 pennylane_lightning/src/util/Util.hpp         | 19 +++++++++++
 9 files changed, 35 insertions(+), 52 deletions(-)

diff --git a/pennylane_lightning/src/gates/SelectKernel.hpp b/pennylane_lightning/src/gates/SelectKernel.hpp
index 5057ed9b42..54056db379 100644
--- a/pennylane_lightning/src/gates/SelectKernel.hpp
+++ b/pennylane_lightning/src/gates/SelectKernel.hpp
@@ -28,38 +28,6 @@
 #include <variant>
 
 namespace Pennylane::Gates {
-/**
- * @brief For lookup from any array of pair whose first elements are
- * GateOperation.
- *
- * As Util::lookup can be used in constexpr context, this function is redundant
- * (by the standard). But GCC 9 still does not accept Util::lookup in constexpr
- * some cases.
- */
-///@{
-template <GateOperation op, class T, size_t size>
-constexpr auto
-static_lookup(const std::array<std::pair<GateOperation, T>, size> &arr) -> T {
-    for (size_t idx = 0; idx < size; idx++) {
-        if (std::get<0>(arr[idx]) == op) {
-            return std::get<1>(arr[idx]);
-        }
-    }
-    return T{};
-}
-
-template <GeneratorOperation op, class T, size_t size>
-constexpr auto
-static_lookup(const std::array<std::pair<GeneratorOperation, T>, size> &arr)
-    -> T {
-    for (size_t idx = 0; idx < size; idx++) {
-        if (std::get<0>(arr[idx]) == op) {
-            return std::get<1>(arr[idx]);
-        }
-    }
-    return T{};
-}
-///@}
 
 /// @cond DEV
 namespace Internal {
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.cpp b/pennylane_lightning/src/simulator/DynamicDispatcher.cpp
index 315b7a102e..034612d573 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.cpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.cpp
@@ -24,7 +24,6 @@
 #include "SelectKernel.hpp"
 
 using namespace Pennylane;
-using namespace Pennylane::Util;
 
 /// @cond DEV
 namespace {
@@ -50,7 +49,7 @@ constexpr auto gateOpToFunctor() {
             Gates::GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplementation,
                                          gate_op>::value;
         assert(params.size() ==
-               Gates::static_lookup<gate_op>(Gates::Constant::gate_num_params));
+               Util::static_lookup<gate_op>(Gates::Constant::gate_num_params));
         Gates::callGateOps(func_ptr, data, num_qubits, wires, inverse, params);
     };
 }
@@ -77,7 +76,7 @@ constexpr auto constructGateOpsFunctorTupleIter() {
             return constructGateOpsFunctorTupleIter<
                 PrecisionT, ParamT, GateImplementation, gate_idx + 1>();
         } else {
-            return prepend_to_tuple(
+            return Util::prepend_to_tuple(
                 std::pair{gate_op,
                           gateOpToFunctor<PrecisionT, ParamT,
                                           GateImplementation, gate_op>()},
@@ -97,7 +96,7 @@ constexpr auto constructGeneratorOpsFunctorTupleIter() {
     } else if (gntr_idx < GateImplementation::implemented_generators.size()) {
         constexpr auto gntr_op =
             GateImplementation::implemented_generators[gntr_idx];
-        return prepend_to_tuple(
+        return Util::prepend_to_tuple(
             std::pair{gntr_op,
                       Gates::GeneratorOpToMemberFuncPtr<
                           PrecisionT, GateImplementation, gntr_op>::value},
@@ -144,7 +143,7 @@ void registerAllImplementedGateOps() {
                                         const auto &gate_op_func_pair) {
         const auto &[gate_op, func] = gate_op_func_pair;
         std::string op_name =
-            std::string(lookup(Gates::Constant::gate_names, gate_op));
+            std::string(Util::lookup(Gates::Constant::gate_names, gate_op));
         dispatcher.registerGateOperation(op_name, GateImplementation::kernel_id,
                                          func);
         return gate_op;
@@ -170,7 +169,7 @@ void registerAllImplementedGeneratorOps() {
         [&dispatcher](const auto &gntr_op_func_pair) {
             const auto &[gntr_op, func] = gntr_op_func_pair;
             std::string op_name =
-                std::string(lookup(Gates::Constant::generator_names, gntr_op));
+                std::string(Util::lookup(Gates::Constant::generator_names, gntr_op));
             dispatcher.registerGeneratorOperation(
                 op_name, GateImplementation::kernel_id, func);
             return gntr_op;
diff --git a/pennylane_lightning/src/simulator/StateVectorBase.hpp b/pennylane_lightning/src/simulator/StateVectorBase.hpp
index dec223408c..82d528745d 100644
--- a/pennylane_lightning/src/simulator/StateVectorBase.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorBase.hpp
@@ -48,7 +48,7 @@
     inline void apply##GATE_NAME##_(const std::vector<size_t> &wires,          \
                                     bool inverse, Ts &&...args) {              \
         auto *arr = getData();                                                 \
-        static_assert(Gates::static_lookup<Gates::GateOperation::GATE_NAME>(   \
+        static_assert(Util::static_lookup<Gates::GateOperation::GATE_NAME>(   \
                           Gates::Constant::gate_num_params) == sizeof...(Ts),  \
                       "The provided number of parameters for gate " #GATE_NAME \
                       " is wrong.");                                           \
@@ -65,7 +65,7 @@
     inline void apply##GATE_NAME(const std::vector<size_t> &wires,             \
                                  bool inverse, Ts &&...args) {                 \
         constexpr auto kernel =                                                \
-            Gates::static_lookup<Gates::GateOperation::GATE_NAME>(             \
+            Util::static_lookup<Gates::GateOperation::GATE_NAME>(             \
                 Gates::Constant::default_kernel_for_gates);                    \
         apply##GATE_NAME##_<kernel>(wires, inverse,                            \
                                     std::forward<Ts>(args)...);                \
@@ -299,9 +299,8 @@ template <class PrecisionT, class Derived> class StateVectorBase {
         namespace Constant = Gates::Constant;
         using Gates::GateOperation;
         using Gates::SelectKernel;
-        using Gates::static_lookup;
 
-        constexpr auto kernel = static_lookup<GateOperation::Matrix>(
+        constexpr auto kernel = Util::static_lookup<GateOperation::Matrix>(
             Constant::default_kernel_for_gates);
         static_assert(
             Util::array_has_elt(SelectKernel<kernel>::implemented_gates,
@@ -315,9 +314,8 @@ template <class PrecisionT, class Derived> class StateVectorBase {
         namespace Constant = Gates::Constant;
         using Gates::GateOperation;
         using Gates::SelectKernel;
-        using Gates::static_lookup;
 
-        constexpr auto kernel = static_lookup<GateOperation::Matrix>(
+        constexpr auto kernel = Util::static_lookup<GateOperation::Matrix>(
             Constant::default_kernel_for_gates);
         static_assert(
             Util::array_has_elt(SelectKernel<kernel>::implemented_gates,
diff --git a/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp b/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
index f4dcf3b4c2..e429b676a6 100644
--- a/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
+++ b/pennylane_lightning/src/tests/Test_DynamicDispatcher.cpp
@@ -55,7 +55,7 @@ struct testDispatchForKernel {
         // and compare it to the dynamic dispatcher
         auto test_st = ini_st;
         const auto gate_name =
-            std::string(static_lookup<gate_op>(Constant::gate_names));
+            std::string(Util::static_lookup<gate_op>(Constant::gate_names));
         DynamicDispatcher<PrecisionT>::getInstance().applyOperation(
             GateImplementation::kernel_id, test_st.data(), num_qubits,
             gate_name, wires, false, params);
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
index d2ecd00a4a..d2e957a745 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
@@ -38,7 +38,7 @@ constexpr std::string_view remove_prefix(const std::string_view &str,
 template <GeneratorOperation gntr_op>
 constexpr auto findGateOpForGenerator() -> GateOperation {
     constexpr auto gntr_name =
-        remove_prefix(static_lookup<gntr_op>(Constant::generator_names), 9);
+        remove_prefix(Util::static_lookup<gntr_op>(Constant::generator_names), 9);
 
     for (const auto &[gate_op, gate_name] : Constant::gate_names) {
         if (gate_name == gntr_name) {
@@ -76,8 +76,8 @@ void testGeneratorForGate(RandomEngine &re, size_t num_qubits) {
 
     constexpr ParamT eps = 1e-4; // For finite difference
 
-    constexpr auto gate_op = static_lookup<gntr_op>(generator_gate_pairs);
-    constexpr auto gate_name = static_lookup<gate_op>(Constant::gate_names);
+    constexpr auto gate_op = Util::static_lookup<gntr_op>(generator_gate_pairs);
+    constexpr auto gate_name = Util::static_lookup<gate_op>(Constant::gate_names);
 
     DYNAMIC_SECTION("Test generator of " << gate_name << " for kernel "
                                          << GateImplementation::name) {
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
index 6e33d781a5..d05f3444c1 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
@@ -27,7 +27,7 @@ template <typename PrecisionT, typename ParamT, class GateImplementation,
           GateOperation gate_op, class RandomEngine>
 void testInverseKernelGate(RandomEngine &re, size_t num_qubits) {
     if constexpr (gate_op != GateOperation::Matrix) {
-        constexpr auto gate_name = static_lookup<gate_op>(Constant::gate_names);
+        constexpr auto gate_name = Util::static_lookup<gate_op>(Constant::gate_names);
         DYNAMIC_SECTION("Test inverse of " << gate_name << " for kernel "
                                            << GateImplementation::name) {
             const auto ini_st = createRandomState<PrecisionT>(re, num_qubits);
diff --git a/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp b/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
index 81f85038e4..77193c9c3b 100644
--- a/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
+++ b/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
@@ -210,7 +210,7 @@ constexpr auto gateOpFuncPtrPairsWithNumParamsIter() {
                       decltype(gate_op_func_ptr_pairs<PrecisionT, ParamT>)>) {
         constexpr auto elt =
             std::get<tuple_idx>(gate_op_func_ptr_pairs<PrecisionT, ParamT>);
-        if constexpr (static_lookup<elt.first>(Constant::gate_num_params) ==
+        if constexpr (Util::static_lookup<elt.first>(Constant::gate_num_params) ==
                       num_params) {
             return Util::prepend_to_tuple(
                 elt, gateOpFuncPtrPairsWithNumParamsIter<
diff --git a/pennylane_lightning/src/util/LinearAlgebra.hpp b/pennylane_lightning/src/util/LinearAlgebra.hpp
index 26a57a4e2f..f27546d673 100644
--- a/pennylane_lightning/src/util/LinearAlgebra.hpp
+++ b/pennylane_lightning/src/util/LinearAlgebra.hpp
@@ -779,8 +779,7 @@ auto squaredNorm(const T *data, size_t data_size) -> remove_complex_t<T> {
  * Compute the squared norm of a real/complex vector :math:`\sum_k |v_k|^2`
  * @endrst
  *
- * @param data Data pointer
- * @param data_size Size of the data
+ * @param vec std::vector containing data
  */
 template <class T, class Alloc>
 auto squaredNorm(const std::vector<T, Alloc> &vec) -> remove_complex_t<T> {
diff --git a/pennylane_lightning/src/util/Util.hpp b/pennylane_lightning/src/util/Util.hpp
index b101dc9e35..274bc27f26 100644
--- a/pennylane_lightning/src/util/Util.hpp
+++ b/pennylane_lightning/src/util/Util.hpp
@@ -402,6 +402,25 @@ auto chunkData(const Container<T> &data, std::size_t num_chunks)
     return chunkDataSize(data, div);
 }
 
+/**
+ * @brief Compile-time lookup of the key `op` in an array of (key, value)
+ * pairs. Returns a default-constructed `T` if no pair has `op` as its key.
+ *
+ * As Util::lookup can be used in constexpr context, this function is redundant
+ * (by the standard). But GCC 9 still does not accept Util::lookup in constexpr
+ * context in some cases.
+ */
+template <auto op, class T, size_t size>
+constexpr auto
+static_lookup(const std::array<std::pair<decltype(op), T>, size> &arr) -> T {
+    for (size_t idx = 0; idx < size; idx++) {
+        if (std::get<0>(arr[idx]) == op) {
+            return std::get<1>(arr[idx]);
+        }
+    }
+    return T{};
+}
+
 // type alias
 template <class T> using remove_cvref_t = typename remove_cvref<T>::type;
 

From 26a508dc0e0972ba3974ac7ab834e13b50952ce6 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Mon, 14 Mar 2022 23:05:34 -0400
Subject: [PATCH 72/94] Rename AMD64 to x86_64

---
 pennylane_lightning/src/bindings/Bindings.hpp              | 4 ++--
 pennylane_lightning/src/simulator/DynamicDispatcher.cpp    | 4 ++--
 pennylane_lightning/src/simulator/StateVectorBase.hpp      | 4 ++--
 .../src/tests/Test_GateImplementations_Generator.cpp       | 7 ++++---
 .../src/tests/Test_GateImplementations_Inverse.cpp         | 3 ++-
 pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp   | 4 ++--
 pennylane_lightning/src/util/Macros.hpp                    | 6 +++---
 pennylane_lightning/src/util/RuntimeInfo.hpp               | 2 +-
 8 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index 3f03d4a0d6..142f11223b 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -239,8 +239,8 @@ auto getCompileInfo() -> pybind11::dict {
 
     const std::string_view cpu_arch_str = [] {
         switch (cpu_arch) {
-        case CPUArch::AMD64:
-            return "AMD64";
+        case CPUArch::X86_64:
+            return "x86_64";
         case CPUArch::PPC64:
             return "PPC64";
         case CPUArch::ARM:
diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.cpp b/pennylane_lightning/src/simulator/DynamicDispatcher.cpp
index 034612d573..5caaf99e55 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.cpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.cpp
@@ -168,8 +168,8 @@ void registerAllImplementedGeneratorOps() {
     auto registerGeneratorToDispatcher =
         [&dispatcher](const auto &gntr_op_func_pair) {
             const auto &[gntr_op, func] = gntr_op_func_pair;
-            std::string op_name =
-                std::string(Util::lookup(Gates::Constant::generator_names, gntr_op));
+            std::string op_name = std::string(
+                Util::lookup(Gates::Constant::generator_names, gntr_op));
             dispatcher.registerGeneratorOperation(
                 op_name, GateImplementation::kernel_id, func);
             return gntr_op;
diff --git a/pennylane_lightning/src/simulator/StateVectorBase.hpp b/pennylane_lightning/src/simulator/StateVectorBase.hpp
index 82d528745d..a157aacd10 100644
--- a/pennylane_lightning/src/simulator/StateVectorBase.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorBase.hpp
@@ -48,7 +48,7 @@
     inline void apply##GATE_NAME##_(const std::vector<size_t> &wires,          \
                                     bool inverse, Ts &&...args) {              \
         auto *arr = getData();                                                 \
-        static_assert(Util::static_lookup<Gates::GateOperation::GATE_NAME>(   \
+        static_assert(Util::static_lookup<Gates::GateOperation::GATE_NAME>(    \
                           Gates::Constant::gate_num_params) == sizeof...(Ts),  \
                       "The provided number of parameters for gate " #GATE_NAME \
                       " is wrong.");                                           \
@@ -65,7 +65,7 @@
     inline void apply##GATE_NAME(const std::vector<size_t> &wires,             \
                                  bool inverse, Ts &&...args) {                 \
         constexpr auto kernel =                                                \
-            Util::static_lookup<Gates::GateOperation::GATE_NAME>(             \
+            Util::static_lookup<Gates::GateOperation::GATE_NAME>(              \
                 Gates::Constant::default_kernel_for_gates);                    \
         apply##GATE_NAME##_<kernel>(wires, inverse,                            \
                                     std::forward<Ts>(args)...);                \
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
index d2e957a745..1440fcda1a 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Generator.cpp
@@ -37,8 +37,8 @@ constexpr std::string_view remove_prefix(const std::string_view &str,
 
 template <GeneratorOperation gntr_op>
 constexpr auto findGateOpForGenerator() -> GateOperation {
-    constexpr auto gntr_name =
-        remove_prefix(Util::static_lookup<gntr_op>(Constant::generator_names), 9);
+    constexpr auto gntr_name = remove_prefix(
+        Util::static_lookup<gntr_op>(Constant::generator_names), 9);
 
     for (const auto &[gate_op, gate_name] : Constant::gate_names) {
         if (gate_name == gntr_name) {
@@ -77,7 +77,8 @@ void testGeneratorForGate(RandomEngine &re, size_t num_qubits) {
     constexpr ParamT eps = 1e-4; // For finite difference
 
     constexpr auto gate_op = Util::static_lookup<gntr_op>(generator_gate_pairs);
-    constexpr auto gate_name = Util::static_lookup<gate_op>(Constant::gate_names);
+    constexpr auto gate_name =
+        Util::static_lookup<gate_op>(Constant::gate_names);
 
     DYNAMIC_SECTION("Test generator of " << gate_name << " for kernel "
                                          << GateImplementation::name) {
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
index d05f3444c1..fd73b7edfe 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
@@ -27,7 +27,8 @@ template <typename PrecisionT, typename ParamT, class GateImplementation,
           GateOperation gate_op, class RandomEngine>
 void testInverseKernelGate(RandomEngine &re, size_t num_qubits) {
     if constexpr (gate_op != GateOperation::Matrix) {
-        constexpr auto gate_name = Util::static_lookup<gate_op>(Constant::gate_names);
+        constexpr auto gate_name =
+            Util::static_lookup<gate_op>(Constant::gate_names);
         DYNAMIC_SECTION("Test inverse of " << gate_name << " for kernel "
                                            << GateImplementation::name) {
             const auto ini_st = createRandomState<PrecisionT>(re, num_qubits);
diff --git a/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp b/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
index 77193c9c3b..558e558f1a 100644
--- a/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
+++ b/pennylane_lightning/src/tests/Test_OpToMemberFuncPtr.cpp
@@ -210,8 +210,8 @@ constexpr auto gateOpFuncPtrPairsWithNumParamsIter() {
                       decltype(gate_op_func_ptr_pairs<PrecisionT, ParamT>)>) {
         constexpr auto elt =
             std::get<tuple_idx>(gate_op_func_ptr_pairs<PrecisionT, ParamT>);
-        if constexpr (Util::static_lookup<elt.first>(Constant::gate_num_params) ==
-                      num_params) {
+        if constexpr (Util::static_lookup<elt.first>(
+                          Constant::gate_num_params) == num_params) {
             return Util::prepend_to_tuple(
                 elt, gateOpFuncPtrPairsWithNumParamsIter<
                          PrecisionT, ParamT, num_params, tuple_idx + 1>());
diff --git a/pennylane_lightning/src/util/Macros.hpp b/pennylane_lightning/src/util/Macros.hpp
index c4c20d07bb..09f2d7b32f 100644
--- a/pennylane_lightning/src/util/Macros.hpp
+++ b/pennylane_lightning/src/util/Macros.hpp
@@ -101,11 +101,11 @@
 #endif
 
 namespace Pennylane::Util::Constant {
-enum class CPUArch { AMD64, PPC64, ARM, Unknown };
+enum class CPUArch { X86_64, PPC64, ARM, Unknown };
 
 constexpr auto getCPUArchClangGCC() {
 #if defined(__x86_64__)
-    return CPUArch::AMD64;
+    return CPUArch::X86_64;
 #elif defined(__powerpc64__)
     return CPUArch::PPC64;
 #elif defined(__arm__)
@@ -117,7 +117,7 @@ constexpr auto getCPUArchClangGCC() {
 
 constexpr auto getCPUArchMSVC() {
 #if defined(_M_AMD64)
-    return CPUArch::AMD64;
+    return CPUArch::X86_64;
 #elif defined(_M_PPC)
     return CPUArch::PPC64;
 #elif defined(_M_ARM)
diff --git a/pennylane_lightning/src/util/RuntimeInfo.hpp b/pennylane_lightning/src/util/RuntimeInfo.hpp
index 416422bd45..2286009349 100644
--- a/pennylane_lightning/src/util/RuntimeInfo.hpp
+++ b/pennylane_lightning/src/util/RuntimeInfo.hpp
@@ -20,7 +20,7 @@
 
 namespace Pennylane::Util {
 /**
- * @brief This class is only usable in x86 or AMD64 architecture.
+ * @brief This class is only usable in x86 or x86_64 architecture.
  */
 class RuntimeInfo {
   private:

From 8cbef2675e5946516ca10e031128040b9e03e603 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Mon, 14 Mar 2022 23:52:20 -0400
Subject: [PATCH 73/94] Add docstring

---
 .../src/tests/CreateAllWires.hpp              | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/pennylane_lightning/src/tests/CreateAllWires.hpp b/pennylane_lightning/src/tests/CreateAllWires.hpp
index f462b4ae20..d923d5538c 100644
--- a/pennylane_lightning/src/tests/CreateAllWires.hpp
+++ b/pennylane_lightning/src/tests/CreateAllWires.hpp
@@ -14,6 +14,16 @@ class WiresGenerator {
     [[nodiscard]] virtual auto all_perms() const
         -> const std::vector<std::vector<size_t>> & = 0;
 };
+
+/**
+ * @brief 
+ * @rst Generating all permutation of wires without ordering (often called
+ * as combination). The size of all combination is given as :math:`n \choose r`.
+ *
+ * We use the recursion formula 
+ * :math:`{n \choose r} = {n \choose r-1} + {n-1 \choose r}`
+ * @endrst
+ */
 class CombinationGenerator : public WiresGenerator {
   private:
     std::vector<size_t> v_;
@@ -45,6 +55,16 @@ class CombinationGenerator : public WiresGenerator {
         return all_perms_;
     }
 };
+
+/**
+ * @brief 
+ * @rst Generating all permutation of wires with ordering. The size of all 
+ * permutation is given as :math:`{}_{n}P_r=n!/(n-r)!r!`.
+ * @endrst
+ *
+ * We use the recursion formula 
+ * :math:`{}_n P_r = n {}_{n-1} P_{r-1}`
+ */
 class PermutationGenerator : public WiresGenerator {
   private:
     std::vector<std::vector<size_t>> all_perms_;

From 3ea7b4987ddbe6e98a5eec74600772baa32f79ca Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Tue, 15 Mar 2022 11:13:40 -0400
Subject: [PATCH 74/94] add docstring

---
 .../tests/Test_GateImplementations_CompareKernels.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
index 0c87e07154..0412dfc5d8 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
@@ -31,6 +31,9 @@ using namespace Pennylane::Gates::Constant;
 
 using std::vector;
 
+/**
+ * @brief Change the given type list of kernels to string
+ */
 template <typename TypeList> std::string kernelsToString() {
     if constexpr (!std::is_same_v<TypeList, void>) {
         return std::string(TypeList::Type::name) + ", " +
@@ -56,6 +59,10 @@ struct KernelsImplementingGateHelper<gate_op, void> {
     using Type = void;
 };
 
+
+/**
+ * @brief Type list of kernels implementing the given gate operation.
+ */
 template <Gates::GateOperation gate_op> struct KernelsImplementingGate {
     using Type =
         typename KernelsImplementingGateHelper<gate_op, TestKernels>::Type;
@@ -102,6 +109,10 @@ auto applyGateForImplemetingKernels(
             ini, num_qubits, wires, inverse, params)...);
 }
 
+/**
+ * @brief Apply the given gate using all implementing kernels and compare
+ * the results.
+ */
 template <Gates::GateOperation gate_op, typename PrecisionT, typename ParamT,
           class RandomEngine>
 void testApplyGate(RandomEngine &re, size_t num_qubits) {

From cb3256e17b7c7a5b32b01d825bad3cff9c38bbcb Mon Sep 17 00:00:00 2001
From: Dev version update bot <chae-yeun@xanadu.ai>
Date: Tue, 15 Mar 2022 18:22:44 +0000
Subject: [PATCH 75/94] Auto update version

---
 pennylane_lightning/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index ac926443a4..3bbb514373 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.23.0-dev0"
+__version__ = "0.23.0-dev1"

From 4be5ac6aeb924c367417705ca9f512f0ef5b46fe Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Tue, 15 Mar 2022 14:23:56 -0400
Subject: [PATCH 76/94] Format

---
 pennylane_lightning/src/tests/CreateAllWires.hpp       | 10 +++++-----
 .../tests/Test_GateImplementations_CompareKernels.cpp  |  1 -
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/pennylane_lightning/src/tests/CreateAllWires.hpp b/pennylane_lightning/src/tests/CreateAllWires.hpp
index d923d5538c..a05a7f8e9b 100644
--- a/pennylane_lightning/src/tests/CreateAllWires.hpp
+++ b/pennylane_lightning/src/tests/CreateAllWires.hpp
@@ -16,11 +16,11 @@ class WiresGenerator {
 };
 
 /**
- * @brief 
+ * @brief
  * @rst Generating all permutation of wires without ordering (often called
  * as combination). The size of all combination is given as :math:`n \choose r`.
  *
- * We use the recursion formula 
+ * We use the recursion formula
- * :math:`{n \choose r} = {n \choose r-1} + {n-1 \choose r}`
+ * :math:`{n \choose r} = {n-1 \choose r-1} + {n-1 \choose r}`
  * @endrst
  */
@@ -57,12 +57,12 @@ class CombinationGenerator : public WiresGenerator {
 };
 
 /**
- * @brief 
- * @rst Generating all permutation of wires with ordering. The size of all 
+ * @brief
+ * @rst Generating all permutation of wires with ordering. The size of all
- * permutation is given as :math:`{}_{n}P_r=n!/(n-r)!r!`.
+ * permutation is given as :math:`{}_{n}P_r=n!/(n-r)!`.
  * @endrst
  *
- * We use the recursion formula 
+ * We use the recursion formula
  * :math:`{}_n P_r = n {}_{n-1} P_{r-1}`
  */
 class PermutationGenerator : public WiresGenerator {
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
index 0412dfc5d8..82a62b81b1 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
@@ -59,7 +59,6 @@ struct KernelsImplementingGateHelper<gate_op, void> {
     using Type = void;
 };
 
-
 /**
  * @brief Type list of kernels implementing the given gate operation.
  */

From ebbcf688efa0d209ffa9b02420b7a062c7dd3305 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 16 Mar 2022 09:48:44 -0400
Subject: [PATCH 77/94] Small fix

---
 pennylane_lightning/src/tests/Test_Util.cpp   | 43 +++++++++----------
 pennylane_lightning/src/util/ConstantUtil.hpp | 10 +++++
 2 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/pennylane_lightning/src/tests/Test_Util.cpp b/pennylane_lightning/src/tests/Test_Util.cpp
index 0b4d4f7e9d..23b386b3e6 100644
--- a/pennylane_lightning/src/tests/Test_Util.cpp
+++ b/pennylane_lightning/src/tests/Test_Util.cpp
@@ -460,6 +460,27 @@ TEMPLATE_TEST_CASE("Utility math functions", "[Util][LinearAlgebra]", float,
                                        "the input right matrix"));
         }
     }
+    SECTION("SquaredNorm") {
+        { // for float
+            std::vector<float> vec{0.0, 1.0, 3.0, 10.0};
+            CHECK(Util::squaredNorm(vec) == Approx(110.0));
+        }
+
+        { // for double
+            std::vector<double> vec{0.0, 1.0, 3.0, 10.0};
+            CHECK(Util::squaredNorm(vec) == Approx(110.0));
+        }
+
+        { // for complex<float>
+            std::vector<std::complex<float>> vec{{0.0, 1.0}, {3.0, 10.0}};
+            CHECK(Util::squaredNorm(vec) == Approx(110.0));
+        }
+
+        { // for complex<double>
+            std::vector<std::complex<double>> vec{{0.0, 1.0}, {3.0, 10.0}};
+            CHECK(Util::squaredNorm(vec) == Approx(110.0));
+        }
+    }
 }
 
 /**
@@ -568,28 +589,6 @@ TEST_CASE("Utility bit operations", "[Util][BitUtil]") {
             }
         }
     }
-
-    SECTION("SquaredNorm") {
-        { // for float
-            std::vector<float> vec{0.0, 1.0, 3.0, 10.0};
-            CHECK(Util::squaredNorm(vec) == Approx(110.0));
-        }
-
-        { // for double
-            std::vector<double> vec{0.0, 1.0, 3.0, 10.0};
-            CHECK(Util::squaredNorm(vec) == Approx(110.0));
-        }
-
-        { // for complex<float>
-            std::vector<std::complex<float>> vec{{0.0, 1.0}, {3.0, 10.0}};
-            CHECK(Util::squaredNorm(vec) == Approx(110.0));
-        }
-
-        { // for complex<double>
-            std::vector<std::complex<double>> vec{{0.0, 1.0}, {3.0, 10.0}};
-            CHECK(Util::squaredNorm(vec) == Approx(110.0));
-        }
-    }
 }
 
 TEST_CASE("Utility array and tuples", "[Util]") {
diff --git a/pennylane_lightning/src/util/ConstantUtil.hpp b/pennylane_lightning/src/util/ConstantUtil.hpp
index d3995e7642..208ab30a28 100644
--- a/pennylane_lightning/src/util/ConstantUtil.hpp
+++ b/pennylane_lightning/src/util/ConstantUtil.hpp
@@ -200,6 +200,8 @@ reverse_pairs_helper(const std::array<std::pair<T, U>, size> &arr,
  * @tparam T Type of first elements
  * @tparam U Type of second elements
  * @tparam size Size of the array
+ * @param arr Array to reverse
+ * @return reversed array
  */
 template <class T, class U, size_t size>
 constexpr auto reverse_pairs(const std::array<std::pair<T, U>, size> &arr)
@@ -208,6 +210,14 @@ constexpr auto reverse_pairs(const std::array<std::pair<T, U>, size> &arr)
                                           std::make_index_sequence<size>{});
 }
 
+/**
+ * @brief Constexpr function that checks if the given value is a power of 2.
+ *
+ * Can be merged with isPerfectPowerOf2 in C++20 using constexpr std::popcount.
+ *
+ * @param value Value to check
+ * @return True when the given value is a power of 2
+ */
 constexpr auto constIsPerfectPowerOf2(size_t value) -> bool {
     while ((value & 1U) == 0) {
         value >>= 1U;

From 97bc9b90f5d0d6b445451a1deea95d2b2c99a4cb Mon Sep 17 00:00:00 2001
From: Dev version update bot <chae-yeun@xanadu.ai>
Date: Wed, 16 Mar 2022 13:52:24 +0000
Subject: [PATCH 78/94] Auto update version

---
 pennylane_lightning/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index 3bbb514373..8bad23fea1 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.23.0-dev1"
+__version__ = "0.23.0-dev2"

From f9ec2af2ad327b57d7bfe4f887362d786c6f5a05 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 16 Mar 2022 09:55:52 -0400
Subject: [PATCH 79/94] Some more fix

---
 pennylane_lightning/src/util/RuntimeInfo.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pennylane_lightning/src/util/RuntimeInfo.hpp b/pennylane_lightning/src/util/RuntimeInfo.hpp
index 2286009349..933bcf0642 100644
--- a/pennylane_lightning/src/util/RuntimeInfo.hpp
+++ b/pennylane_lightning/src/util/RuntimeInfo.hpp
@@ -27,10 +27,10 @@ class RuntimeInfo {
     struct InternalRuntimeInfo {
         InternalRuntimeInfo();
 
-        std::bitset<32> f_1_ecx;
-        std::bitset<32> f_1_edx;
-        std::bitset<32> f_7_ebx;
-        std::bitset<32> f_7_ecx;
+        std::bitset<32> f_1_ecx{};
+        std::bitset<32> f_1_edx{};
+        std::bitset<32> f_7_ebx{};
+        std::bitset<32> f_7_ecx{};
     };
 
     static const inline InternalRuntimeInfo internal_runtime_info_;

From 865557e7c8293f8bb9e09869a4a1fac418795174 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 16 Mar 2022 09:58:00 -0400
Subject: [PATCH 80/94] Trigger CI


From 86531d5e2f1d60785fcfa99727f2fe6394896981 Mon Sep 17 00:00:00 2001
From: Dev version update bot <chae-yeun@xanadu.ai>
Date: Thu, 17 Mar 2022 17:18:14 +0000
Subject: [PATCH 81/94] Auto update version

---
 pennylane_lightning/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index 8bad23fea1..97154f70b9 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.23.0-dev2"
+__version__ = "0.23.0-dev3"

From cf654a79710c120afbf34485c09a48b0797cd374 Mon Sep 17 00:00:00 2001
From: Dev version update bot <chae-yeun@xanadu.ai>
Date: Thu, 17 Mar 2022 20:39:45 +0000
Subject: [PATCH 82/94] Auto update version

---
 pennylane_lightning/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index e4d2b0b628..b1866b428a 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.23.0-dev4"
+__version__ = "0.23.0-dev5"

From b341f8cfa4af8a703fb56556d3c6308388dbb8e2 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Thu, 17 Mar 2022 16:41:32 -0400
Subject: [PATCH 83/94] Trigger CI


From ac985c6fd3192cb5bfeb564bee8c063672fc115d Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Thu, 17 Mar 2022 17:26:24 -0400
Subject: [PATCH 84/94] Trigger CI


From 00f4dc7d739ac201b1c4ceb1d65da9d0707ee033 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Thu, 17 Mar 2022 20:36:39 -0400
Subject: [PATCH 85/94] Trigger CI


From a46a41fb9455b8750b319d694f38e81a9c97f69e Mon Sep 17 00:00:00 2001
From: Dev version update bot <chae-yeun@xanadu.ai>
Date: Fri, 18 Mar 2022 00:36:57 +0000
Subject: [PATCH 86/94] Auto update version

---
 pennylane_lightning/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index b1866b428a..01c4a6e800 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.23.0-dev5"
+__version__ = "0.23.0-dev6"

From 3bcba89b518f6be45682369274f9d2f1369eead7 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sat, 19 Mar 2022 19:33:28 -0400
Subject: [PATCH 87/94] Fix doc

---
 pennylane_lightning/src/simulator/DynamicDispatcher.hpp | 2 ++
 pennylane_lightning/src/util/TypeList.hpp               | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
index 469873751f..4622676387 100644
--- a/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
+++ b/pennylane_lightning/src/simulator/DynamicDispatcher.hpp
@@ -59,6 +59,7 @@ namespace Pennylane {
  */
 template <class PrecisionT, class ParamT> struct registerBeforeMain;
 
+/// @cond DEV
 template <> struct registerBeforeMain<float, float> {
     const static inline int dummy =
         Internal::registerAllAvailableKernels<float, float>();
@@ -68,6 +69,7 @@ template <> struct registerBeforeMain<double, double> {
     const static inline int dummy =
         Internal::registerAllAvailableKernels<double, double>();
 };
+/// @endcond
 
 /**
  * @brief DynamicDispatcher class
diff --git a/pennylane_lightning/src/util/TypeList.hpp b/pennylane_lightning/src/util/TypeList.hpp
index a53c3cbd5d..d87c3c540a 100644
--- a/pennylane_lightning/src/util/TypeList.hpp
+++ b/pennylane_lightning/src/util/TypeList.hpp
@@ -27,6 +27,7 @@ template <typename T, typename... Ts> struct TypeNode {
     using Type = T;
     using Next = TypeNode<Ts...>;
 };
+///@cond DEV
 template <typename T> struct TypeNode<T, void> {
     using Type = T;
     using Next = void;
@@ -35,6 +36,7 @@ template <typename T> struct TypeNode<T> {
     using Type = T;
     using Next = void;
 };
+///@endcond
 
 /**
  * @brief Define type list

From d358a3c6ea30c6b8d44e271a6be718f476917ab1 Mon Sep 17 00:00:00 2001
From: Dev version update bot <chae-yeun@xanadu.ai>
Date: Tue, 22 Mar 2022 23:33:35 +0000
Subject: [PATCH 88/94] Auto update version

---
 pennylane_lightning/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index 4091fc7c62..adb0d6145d 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.23.0-dev11"
+__version__ = "0.23.0-dev12"

From 80243581545caa80824e9b4e502b7d9c072fa185 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Tue, 22 Mar 2022 21:54:11 -0400
Subject: [PATCH 89/94] Format

---
 .../src/algorithms/AdjointDiff.hpp               | 16 +++++++++-------
 .../Test_GateImplementations_CompareKernels.cpp  |  6 +++---
 .../tests/Test_GateImplementations_Inverse.cpp   |  3 ++-
 pennylane_lightning/src/util/ConstantUtil.hpp    |  2 +-
 pennylane_lightning/src/util/LinearAlgebra.hpp   |  1 -
 pennylane_lightning/src/util/TypeTraits.hpp      |  2 +-
 6 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/pennylane_lightning/src/algorithms/AdjointDiff.hpp b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
index 92c8a8d78c..62e5946f4d 100644
--- a/pennylane_lightning/src/algorithms/AdjointDiff.hpp
+++ b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
@@ -157,9 +157,10 @@ template <class T = double> class AdjointJacobian {
      * @param reference_state Reference statevector
      * @param observables Vector of observables to apply to each statevector.
      */
-    inline void applyObservables(std::vector<StateVectorManagedCPU<T>> &states,
-                                 const StateVectorManagedCPU<T> &reference_state,
-                                 const std::vector<ObsDatum<T>> &observables) {
+    inline void
+    applyObservables(std::vector<StateVectorManagedCPU<T>> &states,
+                     const StateVectorManagedCPU<T> &reference_state,
+                     const std::vector<ObsDatum<T>> &observables) {
         // clang-format off
         // Globally scoped exception value to be captured within OpenMP block.
         // See the following for OpenMP design decisions:
@@ -207,9 +208,9 @@ template <class T = double> class AdjointJacobian {
      * @param op_idx Index of given operation within operations list to take
      * adjoint of.
      */
-    inline void applyOperationsAdj(std::vector<StateVectorManagedCPU<T>> &states,
-                                   const OpsData<T> &operations,
-                                   size_t op_idx) {
+    inline void
+    applyOperationsAdj(std::vector<StateVectorManagedCPU<T>> &states,
+                       const OpsData<T> &operations, size_t op_idx) {
         // clang-format off
         // Globally scoped exception value to be captured within OpenMP block.
         // See the following for OpenMP design decisions:
@@ -333,7 +334,8 @@ template <class T = double> class AdjointJacobian {
             num_param_ops - 1; // total number of parametric ops
 
         // Create $U_{1:p}\vert \lambda \rangle$
-        StateVectorManagedCPU<T> lambda(jd.getPtrStateVec(), jd.getSizeStateVec());
+        StateVectorManagedCPU<T> lambda(jd.getPtrStateVec(),
+                                        jd.getSizeStateVec());
 
         // Apply given operations to statevector if requested
         if (apply_operations) {
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
index 55ac7f975e..1470393e18 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_CompareKernels.cpp
@@ -79,9 +79,9 @@ template <Gates::GateOperation gate_op> struct KernelsImplementingGate {
  */
 template <Gates::GateOperation gate_op, typename PrecisionT, typename ParamT,
           typename GateImplementation, class Alloc>
-auto applyGate(std::vector<std::complex<PrecisionT>, Alloc> ini, size_t num_qubits,
-               const std::vector<size_t> &wires, bool inverse,
-               const std::vector<ParamT> &params)
+auto applyGate(std::vector<std::complex<PrecisionT>, Alloc> ini,
+               size_t num_qubits, const std::vector<size_t> &wires,
+               bool inverse, const std::vector<ParamT> &params)
     -> std::vector<std::complex<PrecisionT>, Alloc> {
     callGateOps(GateOpToMemberFuncPtr<PrecisionT, ParamT, GateImplementation,
                                       gate_op>::value,
diff --git a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
index 20bc9784ce..876b67f56e 100644
--- a/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
+++ b/pennylane_lightning/src/tests/Test_GateImplementations_Inverse.cpp
@@ -26,7 +26,8 @@ using namespace Pennylane::Gates;
 template <typename PrecisionT, typename ParamT, class GateImplementation,
           GateOperation gate_op, class RandomEngine>
 void testInverseKernelGate(RandomEngine &re, size_t num_qubits) {
-    constexpr auto gate_name = Util::static_lookup<gate_op>(Constant::gate_names);
+    constexpr auto gate_name =
+        Util::static_lookup<gate_op>(Constant::gate_names);
     DYNAMIC_SECTION("Test inverse of " << gate_name << " for kernel "
                                        << GateImplementation::name) {
         const auto ini_st = createRandomState<PrecisionT>(re, num_qubits);
diff --git a/pennylane_lightning/src/util/ConstantUtil.hpp b/pennylane_lightning/src/util/ConstantUtil.hpp
index 6a7e1490c2..8de2972ea8 100644
--- a/pennylane_lightning/src/util/ConstantUtil.hpp
+++ b/pennylane_lightning/src/util/ConstantUtil.hpp
@@ -17,8 +17,8 @@
  */
 #pragma once
 
-#include "Util.hpp"
 #include "TypeTraits.hpp"
+#include "Util.hpp"
 
 #include <array>
 #include <cstdlib>
diff --git a/pennylane_lightning/src/util/LinearAlgebra.hpp b/pennylane_lightning/src/util/LinearAlgebra.hpp
index 4af7701100..b85114caab 100644
--- a/pennylane_lightning/src/util/LinearAlgebra.hpp
+++ b/pennylane_lightning/src/util/LinearAlgebra.hpp
@@ -27,7 +27,6 @@
 #include <random>
 #include <vector>
 
-
 /// @cond DEV
 #if __has_include(<cblas.h>) && defined _ENABLE_BLAS
 #include <cblas.h>
diff --git a/pennylane_lightning/src/util/TypeTraits.hpp b/pennylane_lightning/src/util/TypeTraits.hpp
index b979ab6742..cc10f3ef11 100644
--- a/pennylane_lightning/src/util/TypeTraits.hpp
+++ b/pennylane_lightning/src/util/TypeTraits.hpp
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 /**
- * @file 
+ * @file
  * Contains type traits
  */
 #pragma once

From 04897a0139b6955c1fa458bd5ad839342b762603 Mon Sep 17 00:00:00 2001
From: Dev version update bot <chae-yeun@xanadu.ai>
Date: Wed, 23 Mar 2022 14:12:25 +0000
Subject: [PATCH 90/94] Auto update version

---
 pennylane_lightning/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index adb0d6145d..fc586eba69 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.23.0-dev12"
+__version__ = "0.23.0-dev13"

From 95284810107d08f437be0f1f9a090ac707a8b71a Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Wed, 23 Mar 2022 14:54:57 -0400
Subject: [PATCH 91/94] Trigger CI


From 6e35950027d81b45c2b351f89ea26ad544b6e14a Mon Sep 17 00:00:00 2001
From: Dev version update bot <chae-yeun@xanadu.ai>
Date: Sat, 7 May 2022 18:52:26 +0000
Subject: [PATCH 92/94] Auto update version

---
 pennylane_lightning/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index d1a7ed1d64..018b2a730b 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.24.0-dev5"
+__version__ = "0.24.0-dev6"

From d20339cbe2a2c00412e0875e8506906f20737cb8 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sat, 7 May 2022 14:56:38 -0400
Subject: [PATCH 93/94] Remove files

---
 doc/conf.py                                   |   1 +
 .../src/examples/benchmark_gate.cpp           | 205 ------------------
 .../src/examples/benchmark_generator.cpp      | 202 -----------------
 .../src/examples/benchmark_matrix.cpp         | 143 ------------
 .../src/examples/run_benchmark.py             | 153 -------------
 5 files changed, 1 insertion(+), 703 deletions(-)
 delete mode 100644 pennylane_lightning/src/examples/benchmark_gate.cpp
 delete mode 100644 pennylane_lightning/src/examples/benchmark_generator.cpp
 delete mode 100644 pennylane_lightning/src/examples/benchmark_matrix.cpp
 delete mode 100755 pennylane_lightning/src/examples/run_benchmark.py

diff --git a/doc/conf.py b/doc/conf.py
index 2da0d0f85d..cc2249239e 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -31,6 +31,7 @@
 CPP_SOURCE_DIR = PROJECT_SOURCE_DIR.joinpath("pennylane_lightning/src")
 CPP_EXCLUDE_DIRS = ["examples", "tests", "benchmarks"]  # relative to CPP_SOURCE_DIR
 
+
 def obtain_cpp_files():
     script_path = PROJECT_SOURCE_DIR.joinpath("bin/cpp-files")
 
diff --git a/pennylane_lightning/src/examples/benchmark_gate.cpp b/pennylane_lightning/src/examples/benchmark_gate.cpp
deleted file mode 100644
index a20b92d4b9..0000000000
--- a/pennylane_lightning/src/examples/benchmark_gate.cpp
+++ /dev/null
@@ -1,205 +0,0 @@
-#include <algorithm>
-#include <chrono>
-#include <cstdlib>
-#include <iostream>
-#include <map>
-#include <random>
-#include <stdexcept>
-#include <string>
-
-#include "Constant.hpp"
-#include "ExampleUtil.hpp"
-#include "StateVectorManagedCPU.hpp"
-
-#ifdef USE_SINGLE_PRECISION
-using PrecisionT = float;
-#pragma message "Using single precision"
-#else
-using PrecisionT = double;
-#endif
-
-using namespace Pennylane;
-using Util::operator<<;
-
-struct GateDesc {
-    std::string name;
-    std::vector<size_t> wires;
-    bool inverse;
-    std::vector<PrecisionT> params;
-
-    template <typename Arg0, typename Arg1, typename Arg2, typename Arg3>
-    GateDesc(Arg0 &&arg0, Arg1 &&arg1, Arg2 &&arg2, Arg3 &&arg3)
-        : name{std::forward<Arg0>(arg0)}, wires{std::forward<Arg1>(arg1)},
-          inverse{std::forward<Arg2>(arg2)}, params{std::forward<Arg3>(arg3)} {}
-};
-
-std::ostream &operator<<(std::ostream &os, GateDesc &desc) {
-    os << desc.name << ", " << desc.wires << "," << desc.inverse << ","
-       << desc.params << std::endl;
-    return os;
-}
-
-template <class RandomEngine>
-auto generateGateSequence(RandomEngine &re, const std::string &gate_name,
-                          const size_t num_reps, const size_t num_qubits,
-                          const size_t num_wires_for_multi_qubit)
-    -> std::vector<GateDesc> {
-    using namespace Gates::Constant;
-    using Gates::GateOperation;
-
-    const GateOperation gate_op = Util::lookup(Util::reverse_pairs(gate_names),
-                                               std::string_view(gate_name));
-    const size_t num_wires = [=]() {
-        if (Util::array_has_elt(multi_qubit_gates, gate_op)) {
-            // if multi qubit gate
-            return num_wires_for_multi_qubit;
-        }
-        return Util::lookup(gate_wires, gate_op);
-    }();
-    const size_t num_params = Util::lookup(gate_num_params, gate_op);
-
-    std::vector<GateDesc> gate_seq;
-    std::uniform_int_distribution<size_t> inverse_dist(0, 1);
-    std::uniform_real_distribution<PrecisionT> param_dist(0.0, 2 * M_PI);
-
-    for (uint32_t k = 0; k < num_reps; k++) {
-        std::vector<PrecisionT> params;
-        params.reserve(num_params);
-
-        bool inverse = static_cast<bool>(inverse_dist(re));
-        auto wires = generateNeighboringWires(re, num_qubits, num_wires);
-
-        for (size_t idx = 0; idx < num_params; idx++) {
-            params.emplace_back(param_dist(re));
-        }
-
-        gate_seq.emplace_back(gate_name, std::move(wires), inverse,
-                              std::move(params));
-    }
-    return gate_seq;
-}
-
-double benchmarkGate(Gates::KernelType kernel, const size_t num_qubits,
-                     const std::vector<GateDesc> &gate_seq) {
-    // Run benchmark. Total num_reps number of gates is used.
-    StateVectorManagedCPU<PrecisionT> svdat{num_qubits};
-
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_start =
-        std::chrono::high_resolution_clock::now();
-    for (const auto &gate : gate_seq) {
-        svdat.applyOperation(kernel, gate.name, gate.wires, gate.inverse,
-                             gate.params);
-    }
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_end =
-        std::chrono::high_resolution_clock::now();
-
-    return std::chrono::duration<double, std::milli>(t_end - t_start).count();
-}
-
-template <typename RandomEngine>
-double runBenchmarkGate(RandomEngine &re, Gates::KernelType kernel,
-                        const std::string &gate_name, size_t num_reps,
-                        size_t num_qubits, size_t num_wires_for_multi_qubit) {
-    auto gate_seq = generateGateSequence(re, gate_name, num_reps, num_qubits,
-                                         num_wires_for_multi_qubit);
-
-    // Log generated sequence if LOG is turned on
-    const char *env_p = std::getenv("LOG");
-    try {
-        if (env_p != nullptr && std::stoi(env_p) != 0) {
-            for (const auto &gate : gate_seq) {
-                std::cerr << gate.name << ", " << gate.wires << ","
-                          << gate.inverse << "," << gate.params << std::endl;
-            }
-        }
-    } catch (std::exception &e) {
-        // Just do not print log
-    }
-
-    return benchmarkGate(kernel, num_qubits, gate_seq);
-}
-
-/**
- * @brief Benchmark Pennylane-Lightning for a given generator
- *
- * @param argc Number of arguments
- * @param argv Command line arguments
- * @return Returns 0 is completed successfully
- */
-int main(int argc, char *argv[]) {
-    using namespace Pennylane::Gates;
-    // Handle input
-    if (argc != 5 && argc != 6) { // NOLINT(readability-magic-numbers)
-        std::cerr
-            << "Wrong number of inputs. User provided " << argc - 1
-            << " inputs. \n"
-            << "Usage: " + std::string(argv[0]) +
-                   " num_reps num_qubits kernel [generator|gate] [num_wires]\n"
-                   "Examples: \n"
-            << "\t" << argv[0] << " 1000 10 PI GeneratorCRX\n"
-            << "\t" << argv[0] << " 1000 10 LM CRX\n"
-            << "\t" << argv[0] << " 1000 10 LM MutliRZ 3\n";
-        return -1;
-    }
-
-    size_t num_reps;
-    size_t num_qubits;
-
-    try {
-        num_reps = std::stoi(argv[1]);
-        num_qubits = std::stoi(argv[2]);
-    } catch (std::exception &e) {
-        std::cerr << "Arguments num_reps and num_qubits must be integers."
-                  << std::endl;
-        return -1;
-    }
-
-    std::string_view kernel_name = argv[3];
-    KernelType kernel = string_to_kernel(kernel_name);
-    if (kernel == KernelType::None) {
-        std::cerr << "Kernel " << kernel_name << " is unknown." << std::endl;
-        return 1;
-    }
-
-    std::string_view gate_name = argv[4];
-    if (!Util::array_has_elt(Util::second_elts_of(Constant::gate_names),
-                             gate_name)) {
-        std::cerr << "Unknown gate name " << gate_name << " is provided"
-                  << std::endl;
-        return 1;
-    }
-
-    Gates::GateOperation gate_op =
-        Util::lookup(Util::reverse_pairs(Constant::gate_names), gate_name);
-
-    size_t num_wires_for_multi_qubit = 0;
-    if (Util::array_has_elt(Constant::multi_qubit_gates, gate_op)) {
-        // User provided a multi-qubit gates
-        if (argc != 6) { // NOLINT(readability-magic-numbers)
-            std::cerr << "One should provide the number of wires when using "
-                         "multi qubit gates."
-                      << std::endl;
-            return 1;
-        }
-
-        try {
-            // NOLINTNEXTLINE(readability-magic-numbers)
-            num_wires_for_multi_qubit = std::stoi(argv[5]);
-        } catch (std::exception &e) {
-            std::cerr << "Number of wires must be an integer" << std::endl;
-            return 1;
-        }
-    }
-
-    std::random_device rd;
-    std::mt19937 re(rd());
-
-    double walltime =
-        runBenchmarkGate(re, kernel, std::string(gate_name), num_reps,
-                         num_qubits, num_wires_for_multi_qubit);
-
-    // Output walltime in csv format (Num Qubits, Time (milliseconds))
-    std::cout << num_qubits << ", " << walltime / static_cast<double>(num_reps)
-              << std::endl;
-    return 0;
-}
diff --git a/pennylane_lightning/src/examples/benchmark_generator.cpp b/pennylane_lightning/src/examples/benchmark_generator.cpp
deleted file mode 100644
index c1ea726ec3..0000000000
--- a/pennylane_lightning/src/examples/benchmark_generator.cpp
+++ /dev/null
@@ -1,202 +0,0 @@
-#include <algorithm>
-#include <chrono>
-#include <cstdlib>
-#include <iostream>
-#include <map>
-#include <random>
-#include <stdexcept>
-#include <string>
-
-#include "Constant.hpp"
-#include "DynamicDispatcher.hpp"
-#include "ExampleUtil.hpp"
-#include "StateVectorManagedCPU.hpp"
-
-#ifdef USE_SINGLE_PRECISION
-using PrecisionT = float;
-#pragma message "Using single precision"
-#else
-using PrecisionT = double;
-#endif
-
-using namespace Pennylane;
-using namespace Pennylane::Gates;
-using namespace Pennylane::Util;
-
-auto generatorOp(const std::string_view &name) -> Gates::GeneratorOperation {
-    auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
-    return dispatcher.strToGeneratorOp(std::string(name));
-}
-
-struct GeneratorDesc {
-    std::string name;
-    std::vector<size_t> wires;
-    bool inverse;
-
-    template <typename Arg0, typename Arg1, typename Arg2>
-    GeneratorDesc(Arg0 &&arg0, Arg1 &&arg1, Arg2 &&arg2)
-        : name{std::forward<Arg0>(arg0)}, wires{std::forward<Arg1>(arg1)},
-          inverse{std::forward<Arg2>(arg2)} {}
-};
-
-std::ostream &operator<<(std::ostream &os, GeneratorDesc &desc) {
-    os << desc.name << ", " << desc.wires << "," << desc.inverse << std::endl;
-    return os;
-}
-
-template <class RandomEngine>
-auto generateGeneratorSequence(RandomEngine &re,
-                               const GeneratorOperation gntr_op,
-                               const size_t num_reps, const size_t num_qubits,
-                               const size_t num_wires_for_multi_qubit)
-    -> std::vector<GeneratorDesc> {
-    namespace Constant = Gates::Constant;
-    using Gates::GeneratorOperation;
-
-    const auto gntr_name =
-        Util::lookup(Constant::generator_names, gntr_op).substr(9);
-
-    const size_t num_wires = [=]() {
-        if (Util::array_has_elt(Constant::multi_qubit_generators, gntr_op)) {
-            // if multi qubit gate
-            return num_wires_for_multi_qubit;
-        }
-        return Util::lookup(Constant::generator_wires, gntr_op);
-    }();
-
-    std::vector<GeneratorDesc> gntr_seq;
-    std::uniform_int_distribution<size_t> inverse_dist(0, 1);
-
-    for (uint32_t k = 0; k < num_reps; k++) {
-        bool inverse = static_cast<bool>(inverse_dist(re));
-        auto wires = generateNeighboringWires(re, num_qubits, num_wires);
-
-        gntr_seq.emplace_back(gntr_name, std::move(wires), inverse);
-    }
-    return gntr_seq;
-}
-
-double benchmarkGenerator(KernelType kernel, const size_t num_qubits,
-                          const std::vector<GeneratorDesc> &gntr_seq) {
-    // Run benchmark. Total num_reps number of gates is used.
-    StateVectorManagedCPU<PrecisionT> svdat{num_qubits};
-
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_start =
-        std::chrono::high_resolution_clock::now();
-    for (const auto &gntr : gntr_seq) {
-        [[maybe_unused]] PrecisionT scale =
-            svdat.applyGenerator(kernel, gntr.name, gntr.wires, gntr.inverse);
-    }
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_end =
-        std::chrono::high_resolution_clock::now();
-
-    return std::chrono::duration<double, std::milli>(t_end - t_start).count();
-}
-
-template <typename RandomEngine>
-double runBenchmarkGenerator(RandomEngine &re, KernelType kernel,
-                             const GeneratorOperation gntr_op, size_t num_reps,
-                             size_t num_qubits,
-                             size_t num_wires_for_multi_qubit) {
-    auto gntr_seq = generateGeneratorSequence(re, gntr_op, num_reps, num_qubits,
-                                              num_wires_for_multi_qubit);
-
-    // Log generated sequence if LOG is turned on
-    const char *env_p = std::getenv("LOG");
-    try {
-        if (env_p != nullptr && std::stoi(env_p) != 0) {
-            for (const auto &gntr : gntr_seq) {
-                std::cerr << gntr.name << ", " << gntr.wires << ","
-                          << gntr.inverse << std::endl;
-            }
-        }
-    } catch (std::exception &e) {
-        // Just do not print log
-    }
-
-    return benchmarkGenerator(kernel, num_qubits, gntr_seq);
-}
-
-/**
- * @brief Benchmark Pennylane-Lightning for a given generator
- *
- * @param argc Number of arguments
- * @param argv Command line arguments
- * @return Returns 0 is completed successfully
- */
-int main(int argc, char *argv[]) {
-    namespace Constant = Gates::Constant;
-    // Handle input
-    if (argc != 5 && argc != 6) { // NOLINT(readability-magic-numbers)
-        std::cerr
-            << "Wrong number of inputs. User provided " << argc - 1
-            << " inputs. \n"
-            << "Usage: " + std::string(argv[0]) +
-                   " num_reps num_qubits kernel [generator|gate] [num_wires]\n"
-                   "Examples: \n"
-            << "\t" << argv[0] << " 1000 10 PI GeneratorCRX\n"
-            << "\t" << argv[0] << " 1000 10 LM CRX\n"
-            << "\t" << argv[0] << " 1000 10 LM MutliRZ 3\n";
-        return -1;
-    }
-
-    size_t num_reps;
-    size_t num_qubits;
-
-    try {
-        num_reps = std::stoi(argv[1]);
-        num_qubits = std::stoi(argv[2]);
-    } catch (std::exception &e) {
-        std::cerr << "Arguments num_reps and num_qubits must be integers."
-                  << std::endl;
-        return -1;
-    }
-
-    std::string_view kernel_name = argv[3];
-    KernelType kernel = string_to_kernel(kernel_name);
-    if (kernel == KernelType::None) {
-        std::cerr << "Kernel " << kernel_name << " is unknown." << std::endl;
-        return 1;
-    }
-
-    std::string_view gntr_name = argv[4];
-    Gates::GeneratorOperation gntr_op;
-
-    try {
-        gntr_op = generatorOp(gntr_name);
-    } catch (std::exception &e) {
-        std::cout << "Unknown generator " + std::string(gntr_name) + " provided"
-                  << std::endl;
-        return 1;
-    }
-
-    size_t num_wires_for_multi_qubit = 0;
-    if (Util::array_has_elt(Constant::multi_qubit_generators, gntr_op)) {
-        // User provided a multi-qubit gates
-        if (argc != 6) { // NOLINT(readability-magic-numbers)
-            std::cerr << "One should provide the number of wires when using "
-                         "multi qubit generators."
-                      << std::endl;
-            return 1;
-        }
-
-        try {
-            // NOLINTNEXTLINE(readability-magic-numbers)
-            num_wires_for_multi_qubit = std::stoi(argv[5]);
-        } catch (std::exception &e) {
-            std::cerr << "Number of wires must be an integer" << std::endl;
-            return 1;
-        }
-    }
-
-    std::random_device rd;
-    std::mt19937 re(rd());
-
-    double walltime = runBenchmarkGenerator(
-        re, kernel, gntr_op, num_reps, num_qubits, num_wires_for_multi_qubit);
-
-    // Output walltime in csv format (Num Qubits, Time (milliseconds))
-    std::cout << num_qubits << ", " << walltime / static_cast<double>(num_reps)
-              << std::endl;
-    return 0;
-}
diff --git a/pennylane_lightning/src/examples/benchmark_matrix.cpp b/pennylane_lightning/src/examples/benchmark_matrix.cpp
deleted file mode 100644
index 26d6ec45df..0000000000
--- a/pennylane_lightning/src/examples/benchmark_matrix.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-#include <algorithm>
-#include <chrono>
-#include <cstdlib>
-#include <iostream>
-#include <map>
-#include <random>
-#include <stdexcept>
-#include <string>
-
-#include "Constant.hpp"
-#include "ExampleUtil.hpp"
-#include "LinearAlgebra.hpp"
-#include "StateVectorManagedCPU.hpp"
-
-#ifdef USE_SINGLE_PRECISION
-using PrecisionT = float;
-#pragma message "Using single precision"
-#else
-using PrecisionT = double;
-#endif
-
-using namespace Pennylane;
-using namespace Pennylane::Gates;
-using namespace Pennylane::Util;
-
-struct MatOpDesc {
-    std::vector<size_t> wires;
-    bool inverse;
-    std::vector<std::complex<PrecisionT>> mat;
-
-    template <typename Arg0, typename Arg1, typename Arg2>
-    MatOpDesc(Arg0 &&arg0, Arg1 &&arg1, Arg2 &&arg2)
-        : wires{std::forward<Arg0>(arg0)}, inverse{std::forward<Arg1>(arg1)},
-          mat{std::forward<Arg2>(arg2)} {}
-};
-
-template <class RandomEngine>
-auto generateMatrixSequence(RandomEngine &re, const size_t num_reps,
-                            const size_t num_qubits, const size_t num_wires)
-    -> std::vector<MatOpDesc> {
-    std::vector<MatOpDesc> matrix_seq;
-    matrix_seq.reserve(num_reps);
-    std::uniform_int_distribution<size_t> inverse_dist(0, 1);
-    for (uint32_t k = 0; k < num_reps; k++) {
-        bool inverse = static_cast<bool>(inverse_dist(re));
-        auto wires = generateNeighboringWires(re, num_qubits, num_wires);
-
-        matrix_seq.emplace_back(std::move(wires), inverse,
-                                Util::randomUnitary<PrecisionT>(re, num_wires));
-    }
-    return matrix_seq;
-}
-
-double benchmarkMatrix(KernelType kernel, const size_t num_qubits,
-                       const std::vector<MatOpDesc> &mat_seq) {
-    // Run benchmark. Total num_reps number of gates is used.
-    StateVectorManagedCPU<PrecisionT> svdat{num_qubits};
-
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_start =
-        std::chrono::high_resolution_clock::now();
-    for (const auto &mat_desc : mat_seq) {
-        svdat.applyMatrix(kernel, mat_desc.mat.data(), mat_desc.wires,
-                          mat_desc.inverse);
-    }
-    std::chrono::time_point<std::chrono::high_resolution_clock> t_end =
-        std::chrono::high_resolution_clock::now();
-
-    return std::chrono::duration<double, std::milli>(t_end - t_start).count();
-}
-
-template <typename RandomEngine>
-double runBenchmarkMatrix(RandomEngine &re, KernelType kernel, size_t num_reps,
-                          size_t num_qubits, size_t num_wires) {
-    auto mat_seq = generateMatrixSequence(re, num_reps, num_qubits, num_wires);
-
-    // Log generated sequence if LOG is turned on
-    const char *env_p = std::getenv("LOG");
-    try {
-        if (env_p != nullptr && std::stoi(env_p) != 0) {
-            for (const auto &mat_desc : mat_seq) {
-                std::cerr << mat_desc.wires << ", " << mat_desc.inverse << ", "
-                          << mat_desc.mat << std::endl;
-            }
-        }
-    } catch (std::exception &e) {
-        // Just do not print log
-    }
-
-    return benchmarkMatrix(kernel, num_qubits, mat_seq);
-}
-
-/**
- * @brief Benchmark Pennylane-Lightning for a given generator
- *
- * @param argc Number of arguments
- * @param argv Command line arguments
- * @return Returns 0 is completed successfully
- */
-int main(int argc, char *argv[]) {
-    namespace Constant = Gates::Constant;
-    // Handle input
-    if (argc != 5) { // NOLINT(readability-magic-numbers)
-        std::cerr << "Wrong number of inputs. User provided " << argc - 1
-                  << " inputs. \n"
-                  << "Usage: " + std::string(argv[0]) +
-                         " num_reps num_qubits kernel num_wires\n"
-                         "Examples: \n"
-                  << "\t" << argv[0] << " 1000 10 PI 4\n";
-        return -1;
-    }
-
-    size_t num_reps;
-    size_t num_qubits;
-    size_t num_wires;
-
-    try {
-        num_reps = std::stoi(argv[1]);
-        num_qubits = std::stoi(argv[2]);
-        num_wires = std::stoi(argv[4]);
-    } catch (std::exception &e) {
-        std::cerr << "Arguments num_reps and num_qubits must be integers."
-                  << std::endl;
-        return -1;
-    }
-
-    std::string_view kernel_name = argv[3];
-    KernelType kernel = string_to_kernel(kernel_name);
-    if (kernel == KernelType::None) {
-        std::cerr << "Kernel " << kernel_name << " is unknown." << std::endl;
-        return 1;
-    }
-
-    std::random_device rd;
-    std::mt19937 re(rd());
-
-    double walltime =
-        runBenchmarkMatrix(re, kernel, num_reps, num_qubits, num_wires);
-
-    // Output walltime in csv format (Num Qubits, Time (milliseconds))
-    std::cout << num_qubits << ", " << walltime / static_cast<double>(num_reps)
-              << std::endl;
-    return 0;
-}
diff --git a/pennylane_lightning/src/examples/run_benchmark.py b/pennylane_lightning/src/examples/run_benchmark.py
deleted file mode 100755
index ae20d520b0..0000000000
--- a/pennylane_lightning/src/examples/run_benchmark.py
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/usr/bin/env python3
-import subprocess
-import argparse
-import json
-from pathlib import Path
-from typing import final
-import abc
-
-MIN_NUM_QUBITS = 8
-MAX_NUM_QUBITS = 24
-STEP_NUM_QUBITS = 2
-NUM_GATE_REPS = 1000
-
-
-class BenchmarkRunner:
-    def __init__(self, kernel, operation):
-        self.kernel = kernel
-        self.operation = operation
-
-    @final
-    def benchmark(self, res_path):
-        result = []
-        ext_info = self.external_info()
-        if ext_info:
-            result.append(ext_info)
-        try:
-            for num_qubit in range(MIN_NUM_QUBITS, MAX_NUM_QUBITS + 1, STEP_NUM_QUBITS):
-                cmd = self.command(num_qubit)
-                print(f"Run N={num_qubit}, {self.kernel}, {self.operation}")
-                output = subprocess.run([str(c) for c in cmd], capture_output=True, check=True)
-                time = output.stdout.decode("utf-8").strip().split(",")[1]
-                result.append({"N": num_qubit, "time": time})
-        except subprocess.CalledProcessError as err:
-            print("Error from subprocess call. Message:")
-            print(err.stderr.decode("utf-8"))
-        except KeyboardInterrupt:
-            pass
-
-        res_path = Path(res_path)
-        if not res_path.exists():
-            res_path.mkdir(parents=True)
-
-        with res_path.joinpath(self.filename()).open("w") as f:
-            json.dump(result, f, indent=4)
-
-    @abc.abstractmethod
-    def command(self, num_qubits):
-        pass
-
-    @abc.abstractmethod
-    def external_info(self):
-        pass
-
-    @abc.abstractmethod
-    def filename(self):
-        pass
-
-
-class MatrixBenchmarkRunner(BenchmarkRunner):
-    def __init__(self, kernel, operation, num_wires):
-        super().__init__(kernel, operation)
-        self.num_wires = num_wires
-
-    def command(self, num_qubits):
-        return ["./benchmark_matrix", NUM_GATE_REPS, num_qubits, self.kernel, self.num_wires]
-
-    def external_info(self):
-        return {"num_wires": self.num_wires}
-
-    def filename(self):
-        return f"Matrix_{self.kernel}_{self.num_wires}.json"
-
-
-class GateBenchmarkRunner(BenchmarkRunner):
-    def __init__(self, kernel, operation, num_wires=None):
-        super().__init__(kernel, operation)
-        self.num_wires = num_wires
-
-    def command(self, num_qubits):
-        cmd = ["./benchmark_gate", NUM_GATE_REPS, num_qubits, self.kernel, self.operation]
-        if self.num_wires:
-            cmd.append(self.num_wires)
-        return cmd
-
-    def external_info(self):
-        if self.num_wires:
-            return {"num_wires": self.num_wires}
-        return None
-
-    def filename(self):
-        if self.num_wires:
-            return f"{self.operation}_{self.kernel}_{self.num_wires}.json"
-        return f"{self.operation}_{self.kernel}.json"
-
-
-class GeneratorBenchmarkRunner(BenchmarkRunner):
-    def __init__(self, kernel, operation, num_wires=None):
-        super().__init__(kernel, operation)
-        self.num_wires = num_wires
-
-    def command(self, num_qubits):
-        cmd = ["./benchmark_generator", NUM_GATE_REPS, num_qubits, self.kernel, self.operation[9:]]
-        if self.num_wires is not None:
-            cmd.append(self.num_wires)
-        return cmd
-
-    def external_info(self):
-        if self.num_wires:
-            return {"num_wires": self.num_wires}
-        return None
-
-    def filename(self):
-        if self.num_wires:
-            return f"{self.operation}_{self.kernel}_{self.num_wires}.json"
-        return f"{self.operation}_{self.kernel}.json"
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run C++ benchmarks")
-    parser.add_argument("kernel", help="Kernel to benchmark")
-    parser.add_argument("operation", help="Operation to benchmark.")
-
-    parser.add_argument(
-        "num_wires",
-        help="Number of wires (optional for multi-qubit operations).",
-        nargs="?",
-        default=None,
-        type=int,
-    )
-
-    args = parser.parse_args()
-
-    compiler_info_file = "compiler_info.txt"
-
-    try:
-        with open(compiler_info_file, "r") as f:
-            res_path = "res_" + f.readline().strip()
-    except OSError:
-        print("Encountered an error while opening '{}'".format(compiler_info_file))
-        sys.exit(1)
-
-    if args.operation == "Matrix":
-        if args.num_wires == 0:
-            raise ValueError(
-                "Parameter num_wires must be provided and larger than 0 for matrix benchmark."
-            )
-        runner = MatrixBenchmarkRunner(args.kernel, args.operation, args.num_wires)
-    elif args.operation.startswith("Generator"):
-        runner = GeneratorBenchmarkRunner(args.kernel, args.operation, args.num_wires)
-    else:
-        runner = GateBenchmarkRunner(args.kernel, args.operation, args.num_wires)
-
-    runner.benchmark(res_path)

From 0eced01b9a75aaec285288a7a2bdf379dc6859a3 Mon Sep 17 00:00:00 2001
From: Chae-Yeun Park <chae-yeun@Xanadu.ai>
Date: Sat, 7 May 2022 16:55:17 -0400
Subject: [PATCH 94/94] Fix

---
 .../src/gates/OpToMemberFuncPtr.hpp           |   2 +-
 .../cpu_kernels/GateImplementationsLM.hpp     |   2 +-
 .../cpu_kernels/GateImplementationsPI.hpp     |   3 +-
 .../src/simulator/DefaultKernels.hpp          | 131 -------------
 .../src/simulator/Measures.cpp                |   3 +-
 .../src/simulator/StateVectorBase.hpp         |  14 +-
 .../src/simulator/StateVectorManagedCPU.hpp   |   2 +-
 .../src/simulator/StateVectorRawCPU.hpp       |   2 +-
 .../src/tests/TestAvailableKernels.hpp        |   1 -
 .../src/tests/TestConstant.hpp                |   1 -
 .../src/tests/Test_StateVectorManaged.cpp     | 174 ------------------
 11 files changed, 14 insertions(+), 321 deletions(-)
 delete mode 100644 pennylane_lightning/src/simulator/DefaultKernels.hpp
 delete mode 100644 pennylane_lightning/src/tests/Test_StateVectorManaged.cpp

diff --git a/pennylane_lightning/src/gates/OpToMemberFuncPtr.hpp b/pennylane_lightning/src/gates/OpToMemberFuncPtr.hpp
index d2ca7afc9b..04cf23248a 100644
--- a/pennylane_lightning/src/gates/OpToMemberFuncPtr.hpp
+++ b/pennylane_lightning/src/gates/OpToMemberFuncPtr.hpp
@@ -427,7 +427,7 @@ template <class PrecisionT>
 using GeneratorFuncPtrT = typename Internal::GeneratorFuncPtr<PrecisionT>::Type;
 
 /**
- * @brief Convinient type alias for MatrixfuncPtr.
+ * @brief Convenient type alias for MatrixFuncPtr.
  */
 template <class PrecisionT>
 using MatrixFuncPtrT = typename Internal::MatrixFuncPtr<PrecisionT>::Type;
diff --git a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
index c45e62e9c4..adb1f287bd 100644
--- a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
+++ b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsLM.hpp
@@ -22,8 +22,8 @@
 #include "GateOperation.hpp"
 #include "Gates.hpp"
 #include "KernelType.hpp"
-#include "PauliGenerator.hpp"
 #include "LinearAlgebra.hpp"
+#include "PauliGenerator.hpp"
 
 #include <complex>
 #include <vector>
diff --git a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp
index b144d90b21..f94d7b12a2 100644
--- a/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp
+++ b/pennylane_lightning/src/gates/cpu_kernels/GateImplementationsPI.hpp
@@ -24,14 +24,13 @@
 #endif
 /// @endcond
 
-#include "PauliGenerator.hpp"
-
 #include "BitUtil.hpp"
 #include "GateOperation.hpp"
 #include "GateUtil.hpp"
 #include "Gates.hpp"
 #include "KernelType.hpp"
 #include "LinearAlgebra.hpp"
+#include "PauliGenerator.hpp"
 
 #include <complex>
 #include <vector>
diff --git a/pennylane_lightning/src/simulator/DefaultKernels.hpp b/pennylane_lightning/src/simulator/DefaultKernels.hpp
deleted file mode 100644
index 10421ee933..0000000000
--- a/pennylane_lightning/src/simulator/DefaultKernels.hpp
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright 2022 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-/**
- * @file
- * Defines default kernels for operations
- */
-#pragma once
-
-#include "Constant.hpp"
-#include "ConstantUtil.hpp"
-#include "KernelType.hpp"
-
-namespace Pennylane {
-/**
- *
- * @brief Define which kernel to use for each gate operation.
- *
- * @rst
- * Check
- * `this repository
- * <https://github.com/PennyLaneAI/pennylane-lightning-compare-kernels>`_ to see
- * the benchmark results for each gate
- * @endrst
- *
- * This value is used for:
- * 1. StateVector `apply##GATE_NAME` methods. The kernel function is statically
- * binded to the given kernel and cannot be modified.
- * 2. Default kernel functions for DynamicDispatcher. The kernel function is
- * dynamically binded and can be changed using DynamicDispatcher singleton
- * class.
- * 3. For the Python binding.
- */
-[[maybe_unused]] constexpr std::array default_kernel_for_gates = {
-    std::pair{Gates::GateOperation::Identity, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::PauliX, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::PauliY, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::PauliZ, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::Hadamard, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::S, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::T, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::RX, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::RY, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::RZ, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::PhaseShift, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::Rot, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::ControlledPhaseShift,
-              Gates::KernelType::PI},
-    std::pair{Gates::GateOperation::CNOT, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::CY, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::CZ, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::SWAP, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::IsingXX, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::IsingYY, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::IsingZZ, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::CRX, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::CRY, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::CRZ, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::CRot, Gates::KernelType::LM},
-    std::pair{Gates::GateOperation::Toffoli, Gates::KernelType::PI},
-    std::pair{Gates::GateOperation::CSWAP, Gates::KernelType::PI},
-    std::pair{Gates::GateOperation::MultiRZ, Gates::KernelType::LM},
-};
-/**
- * @brief Define which kernel to use for each generator operation.
- */
-[[maybe_unused]] constexpr std::array default_kernel_for_generators = {
-    std::pair{Gates::GeneratorOperation::PhaseShift, Gates::KernelType::PI},
-    std::pair{Gates::GeneratorOperation::RX, Gates::KernelType::LM},
-    std::pair{Gates::GeneratorOperation::RY, Gates::KernelType::LM},
-    std::pair{Gates::GeneratorOperation::RZ, Gates::KernelType::LM},
-    std::pair{Gates::GeneratorOperation::IsingXX, Gates::KernelType::LM},
-    std::pair{Gates::GeneratorOperation::IsingYY, Gates::KernelType::LM},
-    std::pair{Gates::GeneratorOperation::IsingZZ, Gates::KernelType::LM},
-    std::pair{Gates::GeneratorOperation::CRX, Gates::KernelType::LM},
-    std::pair{Gates::GeneratorOperation::CRY, Gates::KernelType::LM},
-    std::pair{Gates::GeneratorOperation::CRZ, Gates::KernelType::LM},
-    std::pair{Gates::GeneratorOperation::ControlledPhaseShift,
-              Gates::KernelType::LM},
-    std::pair{Gates::GeneratorOperation::MultiRZ, Gates::KernelType::LM},
-};
-
-/**
- * @brief Define which kernel to use for each generator operation.
- */
-[[maybe_unused]] constexpr std::array default_kernel_for_matrices = {
-    std::pair{Gates::MatrixOperation::SingleQubitOp, Gates::KernelType::LM},
-    std::pair{Gates::MatrixOperation::TwoQubitOp, Gates::KernelType::LM},
-    std::pair{Gates::MatrixOperation::MultiQubitOp, Gates::KernelType::PI},
-};
-
-/**
- * @brief Return default kernel for gate operation
- *
- * @param gate_op Gate operation
- */
-constexpr auto getDefaultKernelForGate(Gates::GateOperation gate_op)
-    -> Gates::KernelType {
-    return Util::lookup(default_kernel_for_gates, gate_op);
-}
-
-/**
- * @brief Return default kernel for generator operation
- *
- * @param gntr_op Generator operation
- */
-constexpr auto getDefaultKernelForGenerator(Gates::GeneratorOperation gntr_op)
-    -> Gates::KernelType {
-    return Util::lookup(default_kernel_for_generators, gntr_op);
-}
-
-/**
- * @brief Return default kernel for matrix operation
- *
- * @param mat_op Matrix operation
- */
-constexpr auto getDefaultKernelForMatrix(Gates::MatrixOperation mat_op)
-    -> Gates::KernelType {
-    return Util::lookup(default_kernel_for_matrices, mat_op);
-}
-} // namespace Pennylane
diff --git a/pennylane_lightning/src/simulator/Measures.cpp b/pennylane_lightning/src/simulator/Measures.cpp
index 6e961f7fd5..ff04a033ab 100644
--- a/pennylane_lightning/src/simulator/Measures.cpp
+++ b/pennylane_lightning/src/simulator/Measures.cpp
@@ -16,4 +16,5 @@
 
 // explicit instantiation
 template class Pennylane::Measures<float, Pennylane::StateVectorRawCPU<float>>;
-template class Pennylane::Measures<double, Pennylane::StateVectorRawCPU<double>>;
+template class Pennylane::Measures<double,
+                                   Pennylane::StateVectorRawCPU<double>>;
diff --git a/pennylane_lightning/src/simulator/StateVectorBase.hpp b/pennylane_lightning/src/simulator/StateVectorBase.hpp
index 811174466d..6d2b0a3002 100644
--- a/pennylane_lightning/src/simulator/StateVectorBase.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorBase.hpp
@@ -233,16 +233,16 @@ template <class T, class Derived> class StateVectorBase {
      * inverted.
      */
     void applyOperations(const std::vector<std::string> &ops,
-                         const std::vector<std::vector<size_t>> &wires,
-                         const std::vector<bool> &inverse) {
-        const size_t numOperations = ops.size();
-        if (numOperations != wires.size()) {
+                         const std::vector<std::vector<size_t>> &ops_wires,
+                         const std::vector<bool> &ops_inverse) {
+        if (ops.size() != ops_wires.size() ||
+            ops.size() != ops_inverse.size()) {
             throw std::invalid_argument(
                 "Invalid arguments: number of operations, wires, and "
                 "parameters must all be equal");
         }
-        for (size_t i = 0; i < numOperations; i++) {
-            applyOperation(ops[i], wires[i], inverse[i], {});
+        for (size_t i = 0; i < ops.size(); i++) {
+            applyOperation(ops[i], ops_wires[i], ops_inverse[i], {});
         }
     }
     }
 
@@ -275,9 +275,9 @@ template <class T, class Derived> class StateVectorBase {
                                       bool adj = false) -> PrecisionT {
         auto *arr = getData();
         const auto &dispatcher = DynamicDispatcher<PrecisionT>::getInstance();
-        return dispatcher.applyGenerator(
-            getKernelForGenerator(dispatcher.strToGeneratorOp(opName)), arr,
-            num_qubits_, opName, wires, adj);
+        const auto gntr_op = dispatcher.strToGeneratorOp(opName);
+        return dispatcher.applyGenerator(getKernelForGenerator(gntr_op), arr,
+                                         num_qubits_, opName, wires, adj);
     }
 
     /**
diff --git a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
index 35e93478e1..f5cd98aad0 100644
--- a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
@@ -49,7 +49,7 @@ class StateVectorManagedCPU
      * @param memory_model Memory model the statevector will use
      */
     explicit StateVectorManagedCPU(
-        size_t num_qubits, Threading threading = bestThreading(),
+        size_t num_qubits, Threading threading = Threading::SingleThread,
         CPUMemoryModel memory_model = bestCPUMemoryModel())
         : BaseType{num_qubits, threading, memory_model},
           data_{Util::exp2(num_qubits), ComplexPrecisionT{0.0, 0.0},
diff --git a/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp b/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp
index 92de97be20..eca8d9c694 100644
--- a/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp
@@ -65,7 +65,7 @@ class StateVectorRawCPU
      * @param threading Threading option the statevector to use
      */
     StateVectorRawCPU(ComplexPrecisionT *data, size_t length,
-                      Threading threading = bestThreading())
+                      Threading threading = Threading::SingleThread)
         : BaseType{Util::log2PerfectPower(length), threading,
                    getMemoryModel(static_cast<void *>(data))},
           data_{data}, length_(length) {
diff --git a/pennylane_lightning/src/tests/TestAvailableKernels.hpp b/pennylane_lightning/src/tests/TestAvailableKernels.hpp
index b11a567d72..1139abb961 100644
--- a/pennylane_lightning/src/tests/TestAvailableKernels.hpp
+++ b/pennylane_lightning/src/tests/TestAvailableKernels.hpp
@@ -1,6 +1,5 @@
 #include "AvailableKernels.hpp"
 #include "Constant.hpp"
-#include "DefaultKernels.hpp"
 #include "KernelType.hpp"
 #include "SelectKernel.hpp"
 #include "Util.hpp"
diff --git a/pennylane_lightning/src/tests/TestConstant.hpp b/pennylane_lightning/src/tests/TestConstant.hpp
index 7f6d3b67f9..478041b32a 100644
--- a/pennylane_lightning/src/tests/TestConstant.hpp
+++ b/pennylane_lightning/src/tests/TestConstant.hpp
@@ -1,6 +1,5 @@
 #include "Constant.hpp"
 #include "ConstantUtil.hpp"
-#include "DefaultKernels.hpp"
 #include "GateOperation.hpp"
 #include "Util.hpp"
 
diff --git a/pennylane_lightning/src/tests/Test_StateVectorManaged.cpp b/pennylane_lightning/src/tests/Test_StateVectorManaged.cpp
deleted file mode 100644
index fdbb30af06..0000000000
--- a/pennylane_lightning/src/tests/Test_StateVectorManaged.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-#include "DefaultKernels.hpp"
-#include "LinearAlgebra.hpp"
-#include "StateVectorManaged.hpp"
-#include "StateVectorRaw.hpp"
-#include "Util.hpp"
-#include "cpu_kernels/GateImplementationsPI.hpp"
-
-#include "TestHelpers.hpp"
-#include <catch2/catch.hpp>
-
-#include <algorithm>
-#include <complex>
-#include <iostream>
-#include <limits>
-#include <random>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-using namespace Pennylane;
-
-TEMPLATE_TEST_CASE("StateVectorManaged::StateVectorManaged",
-                   "[StateVectorManaged]", float, double) {
-    using PrecisionT = TestType;
-
-    SECTION("StateVectorManaged") {
-        REQUIRE(!std::is_constructible_v<StateVectorManaged<>>);
-    }
-    SECTION("StateVectorManaged<TestType>") {
-        REQUIRE(!std::is_constructible_v<StateVectorManaged<TestType>>);
-    }
-    SECTION("StateVectorManaged<TestType> {size_t}") {
-        REQUIRE(std::is_constructible_v<StateVectorManaged<TestType>, size_t>);
-        const size_t num_qubits = 4;
-        StateVectorManaged<PrecisionT> sv(num_qubits);
-
-        REQUIRE(sv.getNumQubits() == 4);
-        REQUIRE(sv.getLength() == 16);
-        REQUIRE(sv.getDataVector().size() == 16);
-    }
-    SECTION("StateVectorManaged<TestType> {const StateVectorRaw<TestType>&}") {
-        REQUIRE(std::is_constructible_v<StateVectorManaged<TestType>,
-                                        const StateVectorRaw<TestType> &>);
-    }
-    SECTION(
-        "StateVectorManaged<TestType> {const StateVectorManaged<TestType>&}") {
-        REQUIRE(std::is_copy_constructible_v<StateVectorManaged<TestType>>);
-    }
-    SECTION("StateVectorManaged<TestType> {StateVectorManaged<TestType>&&}") {
-        REQUIRE(std::is_move_constructible_v<StateVectorManaged<TestType>>);
-    }
-}
-
-TEMPLATE_TEST_CASE("StateVectorManaged::applyMatrix with std::vector",
-                   "[StateVectorManaged]", float, double) {
-    using PrecisionT = TestType;
-    SECTION("Test wrong matrix size") {
-        std::vector<std::complex<TestType>> m(7, 0.0);
-        const size_t num_qubits = 4;
-        StateVectorManaged<PrecisionT> sv(num_qubits);
-        REQUIRE_THROWS_WITH(
-            sv.applyMatrix(m, {0, 1}),
-            Catch::Contains(
-                "The size of matrix does not match with the given"));
-    }
-
-    SECTION("Test wrong number of wires") {
-        std::vector<std::complex<TestType>> m(8, 0.0);
-        const size_t num_qubits = 4;
-        StateVectorManaged<PrecisionT> sv(num_qubits);
-        REQUIRE_THROWS_WITH(
-            sv.applyMatrix(m, {0}),
-            Catch::Contains(
-                "The size of matrix does not match with the given"));
-    }
-}
-
-TEMPLATE_TEST_CASE("StateVectorManaged::applyMatrix with a pointer",
-                   "[StateVectorManaged]", float, double) {
-    using PrecisionT = TestType;
-    SECTION("Test wrong matrix") {
-        std::vector<std::complex<TestType>> m(8, 0.0);
-        const size_t num_qubits = 4;
-        StateVectorManaged<PrecisionT> sv(num_qubits);
-        REQUIRE_THROWS_WITH(sv.applyMatrix(m.data(), {}),
-                            Catch::Contains("must be larger than 0"));
-    }
-
-    SECTION("Test with different number of wires") {
-        std::default_random_engine re{1337};
-        const size_t num_qubits = 5;
-        for (size_t num_wires = 1; num_wires < num_qubits; num_wires++) {
-            StateVectorManaged<PrecisionT> sv1(num_qubits);
-            StateVectorManaged<PrecisionT> sv2(num_qubits);
-
-            std::vector<size_t> wires(num_wires);
-            std::iota(wires.begin(), wires.end(), 0);
-
-            const auto m = Util::randomUnitary<PrecisionT>(re, num_wires);
-            sv1.applyMatrix(m, wires);
-            Gates::GateImplementationsPI::applyMultiQubitOp<PrecisionT>(
-                sv2.getData(), num_qubits, m.data(), wires, false);
-            REQUIRE(sv1.getDataVector() ==
-                    approx(sv2.getDataVector()).margin(PrecisionT{1e-5}));
-        }
-    }
-}
-
-TEMPLATE_TEST_CASE("StateVectorManaged::applyOperations",
-                   "[StateVectorManaged]", float, double) {
-    using PrecisionT = TestType;
-
-    std::mt19937 re{1337};
-
-    SECTION("Test invalid arguments without params") {
-        const size_t num_qubits = 4;
-        StateVectorManaged<PrecisionT> sv(num_qubits);
-        REQUIRE_THROWS_WITH(
-            sv.applyOperations({"PauliX", "PauliY"}, {{0}}, {false, false}),
-            Catch::Contains("must all be equal")); // invalid wires
-        REQUIRE_THROWS_WITH(
-            sv.applyOperations({"PauliX", "PauliY"}, {{0}, {1}}, {false}),
-            Catch::Contains("must all be equal")); // invalid inverse
-    }
-
-    SECTION("applyOperations without params works as expected") {
-        const size_t num_qubits = 3;
-        StateVectorManaged<PrecisionT> sv1(num_qubits);
-
-        sv1.updateData(createRandomState<PrecisionT>(re, num_qubits));
-        StateVectorManaged<PrecisionT> sv2 = sv1;
-
-        sv1.applyOperations({"PauliX", "PauliY"}, {{0}, {1}}, {false, false});
-
-        sv2.applyOperation("PauliX", {0}, false);
-        sv2.applyOperation("PauliY", {1}, false);
-
-        REQUIRE(sv1.getDataVector() == approx(sv2.getDataVector()));
-    }
-
-    SECTION("Test invalid arguments with params") {
-        const size_t num_qubits = 4;
-        StateVectorManaged<PrecisionT> sv(num_qubits);
-        REQUIRE_THROWS_WITH(
-            sv.applyOperations({"RX", "RY"}, {{0}}, {false, false},
-                               {{0.0}, {0.0}}),
-            Catch::Contains("must all be equal")); // invalid wires
-        REQUIRE_THROWS_WITH(
-            sv.applyOperations({"RX", "RY"}, {{0}, {1}}, {false},
-                               {{0.0}, {0.0}}),
-            Catch::Contains("must all be equal")); // invalid inverse
-
-        REQUIRE_THROWS_WITH(
-            sv.applyOperations({"RX", "RY"}, {{0}, {1}}, {false, false},
-                               {{0.0}}),
-            Catch::Contains("must all be equal")); // invalid params
-    }
-
-    SECTION("applyOperations with params works as expected") {
-        const size_t num_qubits = 3;
-        StateVectorManaged<PrecisionT> sv1(num_qubits);
-
-        sv1.updateData(createRandomState<PrecisionT>(re, num_qubits));
-        StateVectorManaged<PrecisionT> sv2 = sv1;
-
-        sv1.applyOperations({"RX", "RY"}, {{0}, {1}}, {false, false},
-                            {{0.1}, {0.2}});
-
-        sv2.applyOperation("RX", {0}, false, {0.1});
-        sv2.applyOperation("RY", {1}, false, {0.2});
-
-        REQUIRE(sv1.getDataVector() == approx(sv2.getDataVector()));
-    }
-}