From 58897a92e191b79a73e98182e8fc73faaaa371df Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Thu, 2 Mar 2023 20:58:21 -0500
Subject: [PATCH] Oxidize the internals of Optimize1qGatesDecomposition (#9578)

* Oxidize the internals of Optimize1qGatesDecomposition

This commit rewrites the internals of the Optimize1qGatesDecomposition
transpiler pass to leverage more Rust. As the size of circuits are
growing the amount of time the transpiler typically spends in
Optimize1qGatesDecomposition grows linearly with the number of
circuits. Since #9185 (which converted the angle calculation in the
synthesis routine to Rust) the time spent constructing intermediate
DAGCircuit objects for each possible decomposition has been dominating
the total runtime of the pass. To attempt to alleviate this bottleneck
this commit mvoes as much of the circuit construction to rust as
possible. The one qubit euler decomposition is now done in Rust and a
sequence of gate names along with their corresponding angles are
returned to the pass with the lowest error rate is returned. The pass
will then convert that sequence into a DAGCircuit object if the
decomposition will be substituted into the output dag. This has the
advantage of both speeding up the computation of the output circuit
and also deferring the creation of DAGCircuit and Gate objects until
they're actually needed.

* Move all error calculation to rust

This commit makes 2 changes to the split between python and rust in the
transpiler pass code. First the error mapping is converted to a rust
native pyclass that increases the efficiency of getting the error rates
for gates into the rust side. The second is any intermediate error
scoring is done in rust. This is primarily to simplify the code as we're
already doing the calculation with the new class in Rust.

* Remove parallel iteration over multiple target basis

This commit removes the usage of rayon for parallel iteration over the
multiple target basis. In local benchmarking the overhead of using rayon
to spawn a threadpool and divide the work over multiple threads hurts
performance. The execution of the decomposition is sufficiently fast
that iterating serially will be faster than spawning the threadpool
for basis from current backends. So it's better to just remove the
overhead. We can revisit parallelism in the future if it makes sense

* Fix small oversights in internal pass usage

This commit fixes the majority (if not all) the test failures that
occured in earlier test failures. The primary cause of the failures were
places that were calling private functions of the
Optimize1qGatesDecomposition pass internally and not accounting for the
new return types from some of those methods. This has been updated to
handle these edge cases correctly now. Additionally, there was a small
oversight in the porting of the numerics for the psx basis circuit
generator function which was causing incorrect decompositions in some
cases that has been fixed (the missing abs() call was added).

* Add release note

* Simplify logic to construct error map

Co-authored-by: John Lapeyre <jlapeyre@users.noreply.github.com>

* Update comments, docstrings, and variable names in optimize_1q_decomposition

* Make constant list of valid bases more local

* Remove clippy unwrap suppression and use match syntax

* Update releasenotes/notes/speedup-one-qubit-optimize-pass-483429af948a415e.yaml

Co-authored-by: Jake Lishman <jake@binhbar.com>

* Use to_object() instead of clone().into_py()

* Remove out of date comment

* Use FnOnce for X type in circuit_psx_gen

* Add rem_euclid comment

* Fix u3/u321 condition in _possible_decomposers

---------

Co-authored-by: John Lapeyre <jlapeyre@users.noreply.github.com>
Co-authored-by: Jake Lishman <jake@binhbar.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
---
 .../optimization/optimize_1q_commutation.py   |   4 +-
 .../optimization/optimize_1q_decomposition.py | 154 ++--
 .../passes/synthesis/unitary_synthesis.py     |   4 +-
 ...-qubit-optimize-pass-483429af948a415e.yaml |  10 +
 src/euler_one_qubit_decomposer.rs             | 678 +++++++++++++++++-
 .../test_optimize_1q_decomposition.py         |  32 -
 6 files changed, 786 insertions(+), 96 deletions(-)
 create mode 100644 releasenotes/notes/speedup-one-qubit-optimize-pass-483429af948a415e.yaml

diff --git a/qiskit/transpiler/passes/optimization/optimize_1q_commutation.py b/qiskit/transpiler/passes/optimization/optimize_1q_commutation.py
index 11513ef36126..98f000df5d34 100644
--- a/qiskit/transpiler/passes/optimization/optimize_1q_commutation.py
+++ b/qiskit/transpiler/passes/optimization/optimize_1q_commutation.py
@@ -165,7 +165,9 @@ def _resynthesize(self, run, qubit):
         operator = run[0].op.to_matrix()
         for gate in run[1:]:
             operator = gate.op.to_matrix().dot(operator)
-        return self._optimize1q._resynthesize_run(operator, qubit)
+        return self._optimize1q._gate_sequence_to_dag(
+            self._optimize1q._resynthesize_run(operator, qubit)
+        )
 
     @staticmethod
     def _replace_subdag(dag, old_run, new_dag):
diff --git a/qiskit/transpiler/passes/optimization/optimize_1q_decomposition.py b/qiskit/transpiler/passes/optimization/optimize_1q_decomposition.py
index 4822c78c42e9..28caccc19340 100644
--- a/qiskit/transpiler/passes/optimization/optimize_1q_decomposition.py
+++ b/qiskit/transpiler/passes/optimization/optimize_1q_decomposition.py
@@ -13,15 +13,48 @@
 """Optimize chains of single-qubit gates using Euler 1q decomposer"""
 
 import logging
-from functools import partial
-import numpy as np
+import math
 
 from qiskit.transpiler.basepasses import TransformationPass
 from qiskit.transpiler.passes.utils import control_flow
 from qiskit.quantum_info.synthesis import one_qubit_decompose
+from qiskit._accelerate import euler_one_qubit_decomposer
+from qiskit.circuit.library.standard_gates import (
+    UGate,
+    PhaseGate,
+    U3Gate,
+    U2Gate,
+    U1Gate,
+    RXGate,
+    RYGate,
+    RZGate,
+    RGate,
+    SXGate,
+    XGate,
+)
+from qiskit.circuit import Qubit
+from qiskit.dagcircuit.dagcircuit import DAGCircuit
+
 
 logger = logging.getLogger(__name__)
 
+# When expanding the list of supported gates this needs to updated in
+# lockstep with the VALID_BASES constant in src/euler_one_qubit_decomposer.rs
+# and the global variables in qiskit/quantum_info/synthesis/one_qubit_decompose.py
+NAME_MAP = {
+    "u": UGate,
+    "u1": U1Gate,
+    "u2": U2Gate,
+    "u3": U3Gate,
+    "p": PhaseGate,
+    "rx": RXGate,
+    "ry": RYGate,
+    "rz": RZGate,
+    "r": RGate,
+    "sx": SXGate,
+    "x": XGate,
+}
+
 
 class Optimize1qGatesDecomposition(TransformationPass):
     """Optimize chains of single-qubit gates by combining them into a single gate.
@@ -58,6 +91,23 @@ def __init__(self, basis=None, target=None):
             self._global_decomposers = _possible_decomposers(None)
             self._basis_gates = None
 
+        self.error_map = self._build_error_map()
+
+    def _build_error_map(self):
+        if self._target is not None:
+            error_map = euler_one_qubit_decomposer.OneQubitGateErrorMap(self._target.num_qubits)
+            for qubit in range(self._target.num_qubits):
+                gate_error = {}
+                for gate, gate_props in self._target.items():
+                    if gate_props is not None:
+                        props = gate_props.get((qubit,), None)
+                        if props is not None and props.error is not None:
+                            gate_error[gate] = props.error
+                error_map.add_qubit(gate_error)
+            return error_map
+        else:
+            return None
+
     def _resynthesize_run(self, matrix, qubit=None):
         """
         Resynthesizes one 2x2 `matrix`, typically extracted via `dag.collect_1q_runs`.
@@ -81,13 +131,23 @@ def _resynthesize_run(self, matrix, qubit=None):
                 self._local_decomposers_cache[qubits_tuple] = decomposers
         else:
             decomposers = self._global_decomposers
+        best_synth_circuit = euler_one_qubit_decomposer.unitary_to_gate_sequence(
+            matrix,
+            decomposers,
+            qubit,
+            self.error_map,
+        )
+        return best_synth_circuit
 
-        new_circs = [decomposer._decompose(matrix) for decomposer in decomposers]
+    def _gate_sequence_to_dag(self, best_synth_circuit):
+        qubits = [Qubit()]
+        out_dag = DAGCircuit()
+        out_dag.add_qubits(qubits)
+        out_dag.global_phase = best_synth_circuit.global_phase
 
-        if len(new_circs) == 0:
-            return None
-        else:
-            return min(new_circs, key=partial(_error, target=self._target, qubit=qubit))
+        for gate_name, angles in best_synth_circuit:
+            out_dag.apply_operation_back(NAME_MAP[gate_name](*angles), qubits)
+        return out_dag
 
     def _substitution_checks(self, dag, old_run, new_circ, basis, qubit):
         """
@@ -115,11 +175,8 @@ def _substitution_checks(self, dag, old_run, new_circ, basis, qubit):
         #    then we _try_ to decompose, using the results if we see improvement.
         return (
             uncalibrated_and_not_basis_p
-            or (
-                uncalibrated_p
-                and _error(new_circ, self._target, qubit) < _error(old_run, self._target, qubit)
-            )
-            or np.isclose(_error(new_circ, self._target, qubit), 0)
+            or (uncalibrated_p and self._error(new_circ, qubit) < self._error(old_run, qubit))
+            or math.isclose(self._error(new_circ, qubit)[0], 0)
         )
 
     @control_flow.trivial_recurse
@@ -139,14 +196,17 @@ def run(self, dag):
             operator = run[0].op.to_matrix()
             for node in run[1:]:
                 operator = node.op.to_matrix().dot(operator)
-            new_dag = self._resynthesize_run(operator, qubit)
+            best_circuit_sequence = self._resynthesize_run(operator, qubit)
 
             if self._target is None:
                 basis = self._basis_gates
             else:
                 basis = self._target.operation_names_for_qargs((qubit,))
 
-            if new_dag is not None and self._substitution_checks(dag, run, new_dag, basis, qubit):
+            if best_circuit_sequence is not None and self._substitution_checks(
+                dag, run, best_circuit_sequence, basis, qubit
+            ):
+                new_dag = self._gate_sequence_to_dag(best_circuit_sequence)
                 dag.substitute_node_with_dag(run[0], new_dag)
                 # Delete the other nodes in the run
                 for current_node in run[1:]:
@@ -154,55 +214,35 @@ def run(self, dag):
 
         return dag
 
+    def _error(self, circuit, qubit):
+        """
+        Calculate a rough error for a `circuit` that runs on a specific
+        `qubit` of `target` (`circuit` can either be an OneQubitGateSequence
+        from Rust or a list of DAGOPNodes).
+
+        Use basis errors from target if available, otherwise use length
+        of circuit as a weak proxy for error.
+        """
+        if isinstance(circuit, euler_one_qubit_decomposer.OneQubitGateSequence):
+            return euler_one_qubit_decomposer.compute_error_one_qubit_sequence(
+                circuit, qubit, self.error_map
+            )
+        else:
+            circuit_list = [(x.op.name, []) for x in circuit]
+            return euler_one_qubit_decomposer.compute_error_list(
+                circuit_list, qubit, self.error_map
+            )
+
 
 def _possible_decomposers(basis_set):
     decomposers = []
     if basis_set is None:
-        decomposers = [
-            one_qubit_decompose.OneQubitEulerDecomposer(basis, use_dag=True)
-            for basis in one_qubit_decompose.ONE_QUBIT_EULER_BASIS_GATES
-        ]
+        decomposers = list(one_qubit_decompose.ONE_QUBIT_EULER_BASIS_GATES)
     else:
         euler_basis_gates = one_qubit_decompose.ONE_QUBIT_EULER_BASIS_GATES
         for euler_basis_name, gates in euler_basis_gates.items():
             if set(gates).issubset(basis_set):
-                decomposer = one_qubit_decompose.OneQubitEulerDecomposer(
-                    euler_basis_name, use_dag=True
-                )
-                decomposers.append(decomposer)
+                decomposers.append(euler_basis_name)
+        if "U3" in decomposers and "U321" in decomposers:
+            decomposers.remove("U3")
     return decomposers
-
-
-def _error(circuit, target=None, qubit=None):
-    """
-    Calculate a rough error for a `circuit` that runs on a specific
-    `qubit` of `target` (circuit could also be a list of DAGNodes)
-
-    Use basis errors from target if available, otherwise use length
-    of circuit as a weak proxy for error.
-    """
-    if target is None:
-        if isinstance(circuit, list):
-            return len(circuit)
-        else:
-            return len(circuit._multi_graph) - 2
-    else:
-        if isinstance(circuit, list):
-            gate_fidelities = [
-                1 - getattr(target[node.name].get((qubit,)), "error", 0.0) for node in circuit
-            ]
-        else:
-            gate_fidelities = [
-                1 - getattr(target[inst.op.name].get((qubit,)), "error", 0.0)
-                for inst in circuit.op_nodes()
-            ]
-        gate_error = 1 - np.product(gate_fidelities)
-        if gate_error == 0.0:
-            if isinstance(circuit, list):
-                return -100 + len(circuit)
-            else:
-                return -100 + len(
-                    circuit._multi_graph
-                )  # prefer shorter circuits among those with zero error
-        else:
-            return gate_error
diff --git a/qiskit/transpiler/passes/synthesis/unitary_synthesis.py b/qiskit/transpiler/passes/synthesis/unitary_synthesis.py
index 53f60a3b15a7..a9c850b4f11c 100644
--- a/qiskit/transpiler/passes/synthesis/unitary_synthesis.py
+++ b/qiskit/transpiler/passes/synthesis/unitary_synthesis.py
@@ -725,7 +725,9 @@ def run(self, unitary, **options):
 
         if unitary.shape == (2, 2):
             _decomposer1q = Optimize1qGatesDecomposition(basis_gates, target)
-            return _decomposer1q._resynthesize_run(unitary, qubits[0])  # already in dag format
+            return _decomposer1q._gate_sequence_to_dag(
+                _decomposer1q._resynthesize_run(unitary, qubits[0])
+            )
         elif unitary.shape == (4, 4):
             # select synthesizers that can lower to the target
             if target is not None:
diff --git a/releasenotes/notes/speedup-one-qubit-optimize-pass-483429af948a415e.yaml b/releasenotes/notes/speedup-one-qubit-optimize-pass-483429af948a415e.yaml
new file mode 100644
index 000000000000..cd4cc7a8d749
--- /dev/null
+++ b/releasenotes/notes/speedup-one-qubit-optimize-pass-483429af948a415e.yaml
@@ -0,0 +1,10 @@
+---
+features:
+  - |
+    The runtime performance of the :class:`~.Optimize1qGatesDecomposition`
+    transpiler pass has been significantly improved. This was done by both
+    rewriting all the computation for the pass in Rust and also decreasing
+    the amount of intermediate objects created as part of the pass's
+    execution. This should also correspond to a similar improvement
+    in the runtime performance of :func:`~.transpile` with the
+    ``optimization_level`` keyword argument set to ``1``, ``2``, or ``3``.
diff --git a/src/euler_one_qubit_decomposer.rs b/src/euler_one_qubit_decomposer.rs
index 07a84c585959..2aec4d7b3428 100644
--- a/src/euler_one_qubit_decomposer.rs
+++ b/src/euler_one_qubit_decomposer.rs
@@ -10,15 +10,635 @@
 // copyright notice, and modified files need to carry a notice indicating
 // that they have been altered from the originals.
 
+#![allow(clippy::too_many_arguments)]
+
+use hashbrown::HashMap;
 use num_complex::{Complex64, ComplexFloat};
+use std::cmp::Ordering;
+use std::f64::consts::PI;
+
+use pyo3::exceptions::{PyIndexError, PyTypeError};
 use pyo3::prelude::*;
+use pyo3::types::PySlice;
 use pyo3::wrap_pyfunction;
 use pyo3::Python;
-use std::f64::consts::PI;
 
 use ndarray::prelude::*;
 use numpy::PyReadonlyArray2;
 
+const DEFAULT_ATOL: f64 = 1e-12;
+
+#[derive(FromPyObject)]
+enum SliceOrInt<'a> {
+    Slice(&'a PySlice),
+    Int(isize),
+}
+
+#[pyclass]
+pub struct OneQubitGateErrorMap {
+    error_map: Vec<HashMap<String, f64>>,
+}
+
+#[pymethods]
+impl OneQubitGateErrorMap {
+    #[new]
+    fn new(num_qubits: Option<usize>) -> Self {
+        OneQubitGateErrorMap {
+            error_map: match num_qubits {
+                Some(n) => Vec::with_capacity(n),
+                None => Vec::new(),
+            },
+        }
+    }
+    fn add_qubit(&mut self, error_map: HashMap<String, f64>) {
+        self.error_map.push(error_map);
+    }
+}
+
+#[pyclass(sequence)]
+pub struct OneQubitGateSequence {
+    gates: Vec<(String, Vec<f64>)>,
+    #[pyo3(get)]
+    global_phase: f64,
+}
+
+#[pymethods]
+impl OneQubitGateSequence {
+    #[new]
+    fn new() -> Self {
+        OneQubitGateSequence {
+            gates: Vec::new(),
+            global_phase: 0.,
+        }
+    }
+    fn __getstate__(&self) -> (Vec<(String, Vec<f64>)>, f64) {
+        (self.gates.clone(), self.global_phase)
+    }
+
+    fn __setstate__(&mut self, state: (Vec<(String, Vec<f64>)>, f64)) {
+        self.gates = state.0;
+        self.global_phase = state.1;
+    }
+
+    fn __len__(&self) -> PyResult<usize> {
+        Ok(self.gates.len())
+    }
+
+    fn __getitem__(&self, py: Python, idx: SliceOrInt) -> PyResult<PyObject> {
+        match idx {
+            SliceOrInt::Slice(slc) => {
+                let len = self.gates.len().try_into().unwrap();
+                let indices = slc.indices(len)?;
+                let mut out_vec: Vec<(String, Vec<f64>)> = Vec::new();
+                // Start and stop will always be positive the slice api converts
+                // negatives to the index for example:
+                // list(range(5))[-1:-3:-1]
+                // will return start=4, stop=2, and step=-1
+                let mut pos: isize = indices.start;
+                let mut cond = if indices.step < 0 {
+                    pos > indices.stop
+                } else {
+                    pos < indices.stop
+                };
+                while cond {
+                    if pos < len as isize {
+                        out_vec.push(self.gates[pos as usize].clone());
+                    }
+                    pos += indices.step;
+                    if indices.step < 0 {
+                        cond = pos > indices.stop;
+                    } else {
+                        cond = pos < indices.stop;
+                    }
+                }
+                Ok(out_vec.into_py(py))
+            }
+            SliceOrInt::Int(idx) => {
+                let len = self.gates.len() as isize;
+                if idx >= len || idx < -len {
+                    Err(PyIndexError::new_err(format!("Invalid index, {idx}")))
+                } else if idx < 0 {
+                    let len = self.gates.len();
+                    Ok(self.gates[len - idx.unsigned_abs()].to_object(py))
+                } else {
+                    Ok(self.gates[idx as usize].to_object(py))
+                }
+            }
+        }
+    }
+}
+
+fn circuit_kak(
+    theta: f64,
+    phi: f64,
+    lam: f64,
+    phase: f64,
+    k_gate: &str,
+    a_gate: &str,
+    simplify: bool,
+    atol: Option<f64>,
+) -> OneQubitGateSequence {
+    let mut lam = lam;
+    let mut theta = theta;
+    let mut phi = phi;
+    let mut circuit: Vec<(String, Vec<f64>)> = Vec::with_capacity(3);
+    let mut atol = match atol {
+        Some(atol) => atol,
+        None => DEFAULT_ATOL,
+    };
+    if !simplify {
+        atol = -1.0;
+    }
+    let mut global_phase = phase - (phi + lam) / 2.;
+    if theta.abs() < atol {
+        lam += phi;
+        // NOTE: The following normalization is safe, because the gphase correction below
+        // fixes a particular diagonal entry to 1, which prevents any potential phase
+        // slippage coming from _mod_2pi injecting multiples of 2pi.
+        lam = mod_2pi(lam);
+        if lam.abs() > atol {
+            circuit.push((String::from(k_gate), vec![lam]));
+            global_phase += lam / 2.;
+        }
+        return OneQubitGateSequence {
+            gates: circuit,
+            global_phase,
+        };
+    }
+    if (theta - PI).abs() < atol {
+        global_phase += phi;
+        lam -= phi;
+        phi = 0.;
+    }
+    if mod_2pi(lam + PI).abs() < atol || mod_2pi(phi + PI).abs() < atol {
+        lam += PI;
+        theta = -theta;
+        phi += PI;
+    }
+    lam = mod_2pi(lam);
+    if lam.abs() > atol {
+        global_phase += lam / 2.;
+        circuit.push((String::from(k_gate), vec![lam]));
+    }
+    circuit.push((String::from(a_gate), vec![theta]));
+    phi = mod_2pi(phi);
+    if phi.abs() > atol {
+        global_phase += phi / 2.;
+        circuit.push((String::from(k_gate), vec![phi]));
+    }
+    OneQubitGateSequence {
+        gates: circuit,
+        global_phase,
+    }
+}
+
+fn circuit_u3(
+    theta: f64,
+    phi: f64,
+    lam: f64,
+    phase: f64,
+    simplify: bool,
+    atol: Option<f64>,
+) -> OneQubitGateSequence {
+    let mut circuit = Vec::new();
+    let phi = mod_2pi(phi);
+    let lam = mod_2pi(lam);
+    let atol = match atol {
+        Some(atol) => atol,
+        None => DEFAULT_ATOL,
+    };
+    if !simplify || theta.abs() > atol || phi.abs() > atol || lam.abs() > atol {
+        circuit.push((String::from("u3"), vec![theta, phi, lam]));
+    }
+    OneQubitGateSequence {
+        gates: circuit,
+        global_phase: phase,
+    }
+}
+
+fn circuit_u321(
+    theta: f64,
+    phi: f64,
+    lam: f64,
+    phase: f64,
+    simplify: bool,
+    atol: Option<f64>,
+) -> OneQubitGateSequence {
+    let mut circuit = Vec::new();
+    let mut atol = match atol {
+        Some(atol) => atol,
+        None => DEFAULT_ATOL,
+    };
+    if !simplify {
+        atol = -1.0;
+    }
+    if theta.abs() < atol {
+        let tot = mod_2pi(phi + lam);
+        if tot.abs() > atol {
+            circuit.push((String::from("u1"), vec![tot]));
+        }
+    } else if (theta - PI / 2.).abs() < atol {
+        circuit.push((String::from("u2"), vec![mod_2pi(phi), mod_2pi(lam)]));
+    } else {
+        circuit.push((String::from("u3"), vec![theta, mod_2pi(phi), mod_2pi(lam)]));
+    }
+    OneQubitGateSequence {
+        gates: circuit,
+        global_phase: phase,
+    }
+}
+
+fn circuit_u(
+    theta: f64,
+    phi: f64,
+    lam: f64,
+    phase: f64,
+    simplify: bool,
+    atol: Option<f64>,
+) -> OneQubitGateSequence {
+    let mut circuit = Vec::new();
+    let mut atol = match atol {
+        Some(atol) => atol,
+        None => DEFAULT_ATOL,
+    };
+    if !simplify {
+        atol = -1.0;
+    }
+    let phi = mod_2pi(phi);
+    let lam = mod_2pi(lam);
+    if theta.abs() > atol || phi.abs() > atol || lam.abs() > atol {
+        circuit.push((String::from("u"), vec![theta, phi, lam]));
+    }
+    OneQubitGateSequence {
+        gates: circuit,
+        global_phase: phase,
+    }
+}
+
+fn circuit_psx_gen<F, P, X>(
+    theta: f64,
+    phi: f64,
+    lam: f64,
+    phase: f64,
+    simplify: bool,
+    atol: Option<f64>,
+    mut pfun: P,
+    mut xfun: F,
+    xpifun: Option<X>,
+) -> OneQubitGateSequence
+where
+    F: FnMut(&mut OneQubitGateSequence),
+    P: FnMut(&mut OneQubitGateSequence, f64),
+    X: FnOnce(&mut OneQubitGateSequence),
+{
+    let mut phi = phi;
+    let mut lam = lam;
+    let mut theta = theta;
+    let mut circuit = OneQubitGateSequence {
+        gates: Vec::new(),
+        global_phase: phase,
+    };
+    let mut atol = match atol {
+        Some(atol) => atol,
+        None => DEFAULT_ATOL,
+    };
+    if !simplify {
+        atol = -1.0;
+    }
+    // Early return for zero SX decomposition
+    if theta.abs() < atol {
+        pfun(&mut circuit, lam + phi);
+        return circuit;
+    }
+    // Early return for single SX decomposition
+    if (theta - PI / 2.).abs() < atol {
+        pfun(&mut circuit, lam - PI / 2.);
+        xfun(&mut circuit);
+        pfun(&mut circuit, phi + PI / 2.);
+        return circuit;
+    }
+    // General double SX decomposition
+    if (theta - PI).abs() < atol {
+        circuit.global_phase += lam;
+        phi -= lam;
+        lam = 0.;
+    }
+    if mod_2pi(lam + PI).abs() < atol || mod_2pi(phi).abs() < atol {
+        lam += PI;
+        theta = -theta;
+        phi += PI;
+        circuit.global_phase -= theta;
+    }
+    // Shift theta and phi to turn the decomposition from
+    // RZ(phi).RY(theta).RZ(lam) = RZ(phi).RX(-pi/2).RZ(theta).RX(pi/2).RZ(lam)
+    // into RZ(phi+pi).SX.RZ(theta+pi).SX.RZ(lam).
+    theta += PI;
+    phi += PI;
+    circuit.global_phase -= PI / 2.;
+    // emit circuit
+    pfun(&mut circuit, lam);
+    match xpifun {
+        Some(xpifun) if mod_2pi(theta).abs() < atol => xpifun(&mut circuit),
+        _ => {
+            xfun(&mut circuit);
+            pfun(&mut circuit, theta);
+            xfun(&mut circuit);
+        }
+    };
+    pfun(&mut circuit, phi);
+    circuit
+}
+
+fn circuit_rr(
+    theta: f64,
+    phi: f64,
+    lam: f64,
+    phase: f64,
+    simplify: bool,
+    atol: Option<f64>,
+) -> OneQubitGateSequence {
+    let mut circuit = Vec::new();
+    let mut atol = match atol {
+        Some(atol) => atol,
+        None => DEFAULT_ATOL,
+    };
+    if !simplify {
+        atol = -1.0;
+    }
+    if theta.abs() < atol && phi.abs() < atol && lam.abs() < atol {
+        return OneQubitGateSequence {
+            gates: circuit,
+            global_phase: phase,
+        };
+    }
+    if (theta - PI).abs() > atol {
+        circuit.push((String::from("r"), vec![theta - PI, mod_2pi(PI / 2. - lam)]));
+    }
+    circuit.push((String::from("r"), vec![PI, mod_2pi(0.5 * (phi - lam + PI))]));
+    OneQubitGateSequence {
+        gates: circuit,
+        global_phase: phase,
+    }
+}
+
+#[pyfunction]
+pub fn generate_circuit(
+    target_basis: &str,
+    theta: f64,
+    phi: f64,
+    lam: f64,
+    phase: f64,
+    simplify: bool,
+    atol: Option<f64>,
+) -> PyResult<OneQubitGateSequence> {
+    let res = match target_basis {
+        "ZYZ" => circuit_kak(theta, phi, lam, phase, "rz", "ry", simplify, atol),
+        "ZXZ" => circuit_kak(theta, phi, lam, phase, "rz", "rx", simplify, atol),
+        "XZX" => circuit_kak(theta, phi, lam, phase, "rx", "rz", simplify, atol),
+        "XYX" => circuit_kak(theta, phi, lam, phase, "rx", "ry", simplify, atol),
+        "U3" => circuit_u3(theta, phi, lam, phase, simplify, atol),
+        "U321" => circuit_u321(theta, phi, lam, phase, simplify, atol),
+        "U" => circuit_u(theta, phi, lam, phase, simplify, atol),
+        "PSX" => {
+            let mut inner_atol = match atol {
+                Some(atol) => atol,
+                None => DEFAULT_ATOL,
+            };
+            if !simplify {
+                inner_atol = -1.0;
+            }
+            let fnz = |circuit: &mut OneQubitGateSequence, phi: f64| {
+                let phi = mod_2pi(phi);
+                if phi.abs() > inner_atol {
+                    circuit.gates.push((String::from("p"), vec![phi]));
+                }
+            };
+            let fnx = |circuit: &mut OneQubitGateSequence| {
+                circuit.gates.push((String::from("sx"), Vec::new()));
+            };
+
+            circuit_psx_gen(
+                theta,
+                phi,
+                lam,
+                phase,
+                simplify,
+                atol,
+                fnz,
+                fnx,
+                None::<Box<dyn FnOnce(&mut OneQubitGateSequence)>>,
+            )
+        }
+        "ZSX" => {
+            let mut inner_atol = match atol {
+                Some(atol) => atol,
+                None => DEFAULT_ATOL,
+            };
+            if !simplify {
+                inner_atol = -1.0;
+            }
+            let fnz = |circuit: &mut OneQubitGateSequence, phi: f64| {
+                let phi = mod_2pi(phi);
+                if phi.abs() > inner_atol {
+                    circuit.gates.push((String::from("rz"), vec![phi]));
+                    circuit.global_phase += phi / 2.;
+                }
+            };
+            let fnx = |circuit: &mut OneQubitGateSequence| {
+                circuit.gates.push((String::from("sx"), Vec::new()));
+            };
+            circuit_psx_gen(
+                theta,
+                phi,
+                lam,
+                phase,
+                simplify,
+                atol,
+                fnz,
+                fnx,
+                None::<Box<dyn FnOnce(&mut OneQubitGateSequence)>>,
+            )
+        }
+        "U1X" => {
+            let mut inner_atol = match atol {
+                Some(atol) => atol,
+                None => DEFAULT_ATOL,
+            };
+            if !simplify {
+                inner_atol = -1.0;
+            }
+            let fnz = |circuit: &mut OneQubitGateSequence, phi: f64| {
+                let phi = mod_2pi(phi);
+                if phi.abs() > inner_atol {
+                    circuit.gates.push((String::from("u1"), vec![phi]));
+                }
+            };
+            let fnx = |circuit: &mut OneQubitGateSequence| {
+                circuit.global_phase += PI / 4.;
+                circuit.gates.push((String::from("rx"), vec![PI / 2.]));
+            };
+            circuit_psx_gen(
+                theta,
+                phi,
+                lam,
+                phase,
+                simplify,
+                atol,
+                fnz,
+                fnx,
+                None::<Box<dyn FnOnce(&mut OneQubitGateSequence)>>,
+            )
+        }
+        "ZSXX" => {
+            let mut inner_atol = match atol {
+                Some(atol) => atol,
+                None => DEFAULT_ATOL,
+            };
+            if !simplify {
+                inner_atol = -1.0;
+            }
+            let fnz = |circuit: &mut OneQubitGateSequence, phi: f64| {
+                let phi = mod_2pi(phi);
+                if phi.abs() > inner_atol {
+                    circuit.gates.push((String::from("rz"), vec![phi]));
+                    circuit.global_phase += phi / 2.;
+                }
+            };
+            let fnx = |circuit: &mut OneQubitGateSequence| {
+                circuit.gates.push((String::from("sx"), Vec::new()));
+            };
+            let fnxpi = |circuit: &mut OneQubitGateSequence| {
+                circuit.gates.push((String::from("x"), Vec::new()));
+            };
+            circuit_psx_gen(
+                theta,
+                phi,
+                lam,
+                phase,
+                simplify,
+                atol,
+                fnz,
+                fnx,
+                Some(fnxpi),
+            )
+        }
+        "RR" => circuit_rr(theta, phi, lam, phase, simplify, atol),
+        other => {
+            return Err(PyTypeError::new_err(format!(
+                "Invalid target basis: {other}"
+            )))
+        }
+    };
+    Ok(res)
+}
+
+#[inline]
+fn angles_from_unitary(unitary: ArrayView2<Complex64>, target_basis: &str) -> [f64; 4] {
+    match target_basis {
+        "U321" => params_u3_inner(unitary),
+        "U3" => params_u3_inner(unitary),
+        "U" => params_u3_inner(unitary),
+        "PSX" => params_u1x_inner(unitary),
+        "ZSX" => params_u1x_inner(unitary),
+        "ZSXX" => params_u1x_inner(unitary),
+        "U1X" => params_u1x_inner(unitary),
+        "RR" => params_zyz_inner(unitary),
+        "ZYZ" => params_zyz_inner(unitary),
+        "ZXZ" => params_zxz_inner(unitary),
+        "XYX" => params_xyx_inner(unitary),
+        "XZX" => params_xzx_inner(unitary),
+        &_ => unreachable!(),
+    }
+}
+
+#[inline]
+fn compare_error_fn(
+    circuit: &OneQubitGateSequence,
+    error_map: &Option<&OneQubitGateErrorMap>,
+    qubit: usize,
+) -> (f64, usize) {
+    match error_map {
+        Some(global_err_map) => {
+            let err_map = &global_err_map.error_map[qubit];
+            let fidelity_product: f64 = circuit
+                .gates
+                .iter()
+                .map(|x| 1. - err_map.get(&x.0).unwrap_or(&0.))
+                .product();
+            (1. - fidelity_product, circuit.gates.len())
+        }
+        None => (circuit.gates.len() as f64, circuit.gates.len()),
+    }
+}
+
+fn compute_error(
+    gates: &[(String, Vec<f64>)],
+    error_map: Option<&OneQubitGateErrorMap>,
+    qubit: usize,
+) -> (f64, usize) {
+    match error_map {
+        Some(err_map) => {
+            let num_gates = gates.len();
+            let gate_fidelities: f64 = gates
+                .iter()
+                .map(|x| 1. - err_map.error_map[qubit].get(&x.0).unwrap_or(&0.))
+                .product();
+            (1. - gate_fidelities, num_gates)
+        }
+        None => (gates.len() as f64, gates.len()),
+    }
+}
+
+#[pyfunction]
+pub fn compute_error_one_qubit_sequence(
+    circuit: &OneQubitGateSequence,
+    qubit: usize,
+    error_map: Option<&OneQubitGateErrorMap>,
+) -> (f64, usize) {
+    compute_error(&circuit.gates, error_map, qubit)
+}
+
+#[pyfunction]
+pub fn compute_error_list(
+    circuit: Vec<(String, Vec<f64>)>,
+    qubit: usize,
+    error_map: Option<&OneQubitGateErrorMap>,
+) -> (f64, usize) {
+    compute_error(&circuit, error_map, qubit)
+}
+
+#[pyfunction]
+pub fn unitary_to_gate_sequence(
+    unitary: PyReadonlyArray2<Complex64>,
+    target_basis_list: Vec<&str>,
+    qubit: usize,
+    error_map: Option<&OneQubitGateErrorMap>,
+) -> PyResult<Option<OneQubitGateSequence>> {
+    const VALID_BASES: [&str; 12] = [
+        "U321", "U3", "U", "PSX", "ZSX", "ZSXX", "U1X", "RR", "ZYZ", "ZXZ", "XYX", "XZX",
+    ];
+    for basis in &target_basis_list {
+        if !VALID_BASES.contains(basis) {
+            return Err(PyTypeError::new_err(format!(
+                "Invalid target basis {basis}"
+            )));
+        }
+    }
+    let unitary_mat = unitary.as_array();
+    let best_result = target_basis_list
+        .iter()
+        .map(|target_basis| {
+            let [theta, phi, lam, phase] = angles_from_unitary(unitary_mat, target_basis);
+            generate_circuit(target_basis, theta, phi, lam, phase, true, None).unwrap()
+        })
+        .min_by(|a, b| {
+            let error_a = compare_error_fn(a, &error_map, qubit);
+            let error_b = compare_error_fn(b, &error_map, qubit);
+            error_a.partial_cmp(&error_b).unwrap_or(Ordering::Equal)
+        });
+    Ok(best_result)
+}
+
 #[inline]
 fn det_one_qubit(mat: ArrayView2<Complex64>) -> Complex64 {
     mat[[0, 0]] * mat[[1, 1]] - mat[[0, 1]] * mat[[1, 0]]
@@ -31,7 +651,10 @@ fn complex_phase(x: Complex64) -> f64 {
 
 #[inline]
 fn mod_2pi(angle: f64) -> f64 {
-    (angle + PI) % (2. * PI) - PI
+    // f64::rem_euclid() isn't exactly the same as Python's % operator, but because
+    // the RHS here is a constant and positive it is effectively equivalent for
+    // this case
+    (angle + PI).rem_euclid(2. * PI) - PI
 }
 
 fn params_zyz_inner(mat: ArrayView2<Complex64>) -> [f64; 4] {
@@ -58,9 +681,36 @@ pub fn params_zyz(unitary: PyReadonlyArray2<Complex64>) -> [f64; 4] {
     params_zyz_inner(mat)
 }
 
+fn params_u3_inner(mat: ArrayView2<Complex64>) -> [f64; 4] {
+    // The determinant of U3 gate depends on its params
+    // via det(u3(theta, phi, lam)) = exp(1j*(phi+lam))
+    // Since the phase is wrt to a SU matrix we must rescale
+    // phase to correct this
+    let [theta, phi, lam, phase] = params_zyz_inner(mat);
+    [theta, phi, lam, phase - 0.5 * (phi + lam)]
+}
+
 #[pyfunction]
-pub fn params_xyx(unitary: PyReadonlyArray2<Complex64>) -> [f64; 4] {
+pub fn params_u3(unitary: PyReadonlyArray2<Complex64>) -> [f64; 4] {
     let mat = unitary.as_array();
+    params_u3_inner(mat)
+}
+
+fn params_u1x_inner(mat: ArrayView2<Complex64>) -> [f64; 4] {
+    // The determinant of this decomposition depends on its params
+    // Since the phase is wrt to a SU matrix we must rescale
+    // phase to correct this
+    let [theta, phi, lam, phase] = params_zyz_inner(mat);
+    [theta, phi, lam, phase - 0.5 * (theta + phi + lam)]
+}
+
+#[pyfunction]
+pub fn params_u1x(unitary: PyReadonlyArray2<Complex64>) -> [f64; 4] {
+    let mat = unitary.as_array();
+    params_u1x_inner(mat)
+}
+
+fn params_xyx_inner(mat: ArrayView2<Complex64>) -> [f64; 4] {
     let mat_zyz = arr2(&[
         [
             0.5 * (mat[[0, 0]] + mat[[0, 1]] + mat[[1, 0]] + mat[[1, 1]]),
@@ -83,8 +733,12 @@ pub fn params_xyx(unitary: PyReadonlyArray2<Complex64>) -> [f64; 4] {
 }
 
 #[pyfunction]
-pub fn params_xzx(unitary: PyReadonlyArray2<Complex64>) -> [f64; 4] {
-    let umat = unitary.as_array();
+pub fn params_xyx(unitary: PyReadonlyArray2<Complex64>) -> [f64; 4] {
+    let mat = unitary.as_array();
+    params_xyx_inner(mat)
+}
+
+fn params_xzx_inner(umat: ArrayView2<Complex64>) -> [f64; 4] {
     let det = det_one_qubit(umat);
     let phase = (Complex64::new(0., -1.) * det.ln()).re / 2.;
     let sqrt_det = det.sqrt();
@@ -102,6 +756,12 @@ pub fn params_xzx(unitary: PyReadonlyArray2<Complex64>) -> [f64; 4] {
     [theta, phi, lam, phase + phase_zxz]
 }
 
+#[pyfunction]
+pub fn params_xzx(unitary: PyReadonlyArray2<Complex64>) -> [f64; 4] {
+    let umat = unitary.as_array();
+    params_xzx_inner(umat)
+}
+
 #[pyfunction]
 pub fn params_zxz(unitary: PyReadonlyArray2<Complex64>) -> [f64; 4] {
     let mat = unitary.as_array();
@@ -114,5 +774,13 @@ pub fn euler_one_qubit_decomposer(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_wrapped(wrap_pyfunction!(params_xyx))?;
     m.add_wrapped(wrap_pyfunction!(params_xzx))?;
     m.add_wrapped(wrap_pyfunction!(params_zxz))?;
+    m.add_wrapped(wrap_pyfunction!(params_u3))?;
+    m.add_wrapped(wrap_pyfunction!(params_u1x))?;
+    m.add_wrapped(wrap_pyfunction!(generate_circuit))?;
+    m.add_wrapped(wrap_pyfunction!(unitary_to_gate_sequence))?;
+    m.add_wrapped(wrap_pyfunction!(compute_error_one_qubit_sequence))?;
+    m.add_wrapped(wrap_pyfunction!(compute_error_list))?;
+    m.add_class::<OneQubitGateSequence>()?;
+    m.add_class::<OneQubitGateErrorMap>()?;
     Ok(())
 }
diff --git a/test/python/transpiler/test_optimize_1q_decomposition.py b/test/python/transpiler/test_optimize_1q_decomposition.py
index ed44d643cb01..23151ea6f0d6 100644
--- a/test/python/transpiler/test_optimize_1q_decomposition.py
+++ b/test/python/transpiler/test_optimize_1q_decomposition.py
@@ -35,11 +35,9 @@
 from qiskit.transpiler import PassManager, Target, InstructionProperties
 from qiskit.transpiler.passes import Optimize1qGatesDecomposition
 from qiskit.transpiler.passes import BasisTranslator
-from qiskit.transpiler.passes.optimization.optimize_1q_decomposition import _error
 from qiskit.circuit.equivalence_library import SessionEquivalenceLibrary as sel
 from qiskit.quantum_info import Operator
 from qiskit.test import QiskitTestCase
-from qiskit.converters import circuit_to_dag
 
 
 θ = Parameter("θ")
@@ -146,36 +144,6 @@ def test_optimize_away_idenity_no_target(self):
         result = passmanager.run(circuit)
         self.assertEqual(QuantumCircuit(1), result)
 
-    def test_optimize_error_over_target_1(self):
-        """XZX is re-written as ZXZ, which is cheaper according to target."""
-        qr = QuantumRegister(1, "qr")
-        circuit = QuantumCircuit(qr)
-        circuit.rx(np.pi / 7, qr[0])
-        circuit.rz(np.pi / 4, qr[0])
-        circuit.rx(np.pi / 3, qr[0])
-
-        target = target_rz_rx
-        passmanager = PassManager()
-        passmanager.append(Optimize1qGatesDecomposition(target=target))
-        result = passmanager.run(circuit)
-        self.assertLess(
-            _error(circuit_to_dag(result), target, 0), _error(circuit_to_dag(circuit), target, 0)
-        )
-
-    def test_optimize_error_over_target_2(self):
-        """U is re-written as ZYZ, which is cheaper according to target."""
-        qr = QuantumRegister(1, "qr")
-        circuit = QuantumCircuit(qr)
-        circuit.u(np.pi / 7, np.pi / 4, np.pi / 3, qr[0])
-
-        target = target_rz_ry_u
-        passmanager = PassManager()
-        passmanager.append(Optimize1qGatesDecomposition(target=target))
-        result = passmanager.run(circuit)
-        self.assertLess(
-            _error(circuit_to_dag(result), target, 0), _error(circuit_to_dag(circuit), target, 0)
-        )
-
     def test_optimize_error_over_target_3(self):
         """U is shorter than RZ-RY-RZ or RY-RZ-RY so use it when no error given."""
         qr = QuantumRegister(1, "qr")