Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize LQ::probs for large number of targets #807

Merged
merged 4 commits into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@

### Improvements

* Optimize the OpenMP parallelization of Lightning-Qubit's `probs` for any number of targets.
[(#807)](https://github.com/PennyLaneAI/pennylane-lightning/pull/807)

* Add GPU device compute capability check for Lightning-Tensor.
[(#803)](https://github.com/PennyLaneAI/pennylane-lightning/pull/803)

Expand Down
2 changes: 1 addition & 1 deletion pennylane_lightning/core/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
Version number (major.minor.patch[-label])
"""

__version__ = "0.38.0-dev11"
__version__ = "0.38.0-dev12"
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,12 @@ class Measurements final
[[maybe_unused]] const std::vector<std::size_t> &device_wires = {})
-> std::vector<PrecisionT> {
const std::size_t n_wires = wires.size();
if (n_wires == 0) {
return {1.0};
}
const std::size_t num_qubits = this->_statevector.getNumQubits();
// is_equal_to_all_wires is true if `wires` includes all wires in order
// and false otherwise.
bool is_equal_to_all_wires = n_wires == num_qubits;
for (std::size_t k = 0; k < n_wires; k++) {
if (!is_equal_to_all_wires) {
Expand All @@ -119,7 +124,7 @@ class Measurements final

const ComplexT *arr_data = this->_statevector.getData();

// Templated 1-4 qubit cases
// Templated 1-4 wire cases; return probs
PROBS_SPECIAL_CASE(1);
PROBS_SPECIAL_CASE(2);
PROBS_SPECIAL_CASE(3);
Expand All @@ -132,15 +137,18 @@ class Measurements final
const std::size_t n_probs = PUtil::exp2(n_wires);
std::vector<PrecisionT> probabilities(n_probs, 0);
auto *probs = probabilities.data();
std::size_t ind_probs = 0;
for (auto index : all_indices) {
// For 5 wires and more, there are at least 32 probs entries to
// parallelize over. This scheme was found most favorable in terms of
// memory accesses and it prevents the stack overflow caused by
// `reduction(+ : probs[ : n_probs])` when n_probs approaches 2**20.
#if defined PL_LQ_KERNEL_OMP && defined _OPENMP
#pragma omp parallel for reduction(+ : probs[ : n_probs])
#pragma omp parallel for collapse(1)
#endif
for (std::size_t ind_probs = 0; ind_probs < n_probs; ind_probs++) {
for (auto offset : all_offsets) {
probs[ind_probs] += std::norm(arr_data[index + offset]);
probs[ind_probs] +=
std::norm(arr_data[all_indices[ind_probs] + offset]);
}
ind_probs++;
}
return probabilities;
}
Expand Down
Loading