amazon-braket
diff --git a/‎doc/devices/braket_local.rst
+26 b/‎doc/devices/braket_local.rst
+26
diff --git a/‎src/braket/pennylane_plugin/braket_device.py
+89-54 b/‎src/braket/pennylane_plugin/braket_device.py
+89-54
@@ -42,6 +42,32 @@ When executed, the circuit will perform the computation on the local machine.
 >>> circuit(0.2, 0.1, 0.3)
 array([0.97517033, 0.04904283])
 
+Enabling the parallel execution of multiple circuits
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Where supported by the backend of the local simulator, the local device can be used to execute multiple
+quantum circuits in parallel. To unlock this feature, instantiate the device using the ``parallel=True`` argument:
+
+>>> local_device = qml.device('braket.local.qubit', [... ,] parallel=True)
+
+The details of the parallelization scheme depend on the PennyLane version you use, as well as the specific local simulator
+backend you use.
+
+For example, PennyLane 0.13.0 and higher supports the parallel execution of circuits created during the computation of gradients.
+Just by instantiating the remote device with the ``parallel=True`` option, this feature is automatically used and can
+lead to significant speedups of your optimization pipeline.
+
+The maximum number of circuits that can be executed in parallel is specified by the ``max_parallel`` argument.
+
+>>> local_device = qml.device('braket.local.qubit', [... ,] parallel=True, max_parallel=20)
+
+If ``max_parallel`` is not specified, the local simulator backend will use its own default. Each parallel execution
+will use additional memory, so be careful not to set ``max_parallel`` so high that you run out of memory on your local
+device. The exact limit will depend on your device. Additionally, setting ``max_parallel`` much higher than the number of
+CPU cores available (if you are using a CPU-based local simulator) or GPUs/GPU streams (if you are using a GPU-based local
+simulator) will not improve and may even degrade performance as too many parallel workers begin to contend for the same
+scarce resources.
+
 Device options
 ~~~~~~~~~~~~~~
 
 
@@ -51,6 +51,7 @@
 from braket.devices import Device, LocalSimulator
 from braket.simulator import BraketSimulator
 from braket.tasks import GateModelQuantumTaskResult, QuantumTask
+from braket.tasks.local_quantum_task_batch import LocalQuantumTaskBatch
 from pennylane import QuantumFunctionError, QubitDevice
 from pennylane import numpy as np
 from pennylane.gradients import param_shift
@@ -105,6 +106,12 @@ class BraketQubitDevice(QubitDevice):
             execution.
         verbatim (bool): Whether to run tasks in verbatim mode. Note that verbatim mode only
             supports the native gate set of the device. Default False.
+        parallel (bool): Whether to run tasks in parallel if supported by the device backend.
+            Default False.
+        max_parallel (int, optional): Maximum number of tasks to run on AWS in parallel.
+            Batch creation will fail if this value is greater than the maximum allowed concurrent
+            tasks on the device. If unspecified, uses defaults defined in ``AwsDevice``.
+            Ignored if ``parallel=False``.
         parametrize_differentiable (bool): Whether to bind differentiable parameters (parameters
             marked with ``required_grad=True``) on the Braket device rather than in PennyLane.
             Default: True.
@@ -124,6 +131,8 @@ def __init__(
         shots: Union[int, None],
         noise_model: Optional[NoiseModel] = None,
         verbatim: bool = False,
+        parallel: bool = False,
+        max_parallel: Optional[int] = None,
         parametrize_differentiable: bool = True,
         **run_kwargs,
     ):
@@ -139,6 +148,8 @@ def __init__(
 
         super().__init__(wires, shots=shots or None)
         self._device = device
+        self._parallel = parallel
+        self._max_parallel = max_parallel
         self._circuit = None
         self._task = None
         self._noise_model = noise_model
@@ -179,6 +190,49 @@ def task(self) -> QuantumTask:
         """QuantumTask: The task corresponding to the last run circuit."""
         return self._task
 
+    @property
+    def parallel(self) -> bool:
+        """bool: Whether the device supports parallel execution of batches."""
+        return self._parallel
+
+    def batch_execute(self, circuits, **run_kwargs):
+        if not self._parallel:
+            return super().batch_execute(circuits)
+
+        for circuit in circuits:
+            self.check_validity(circuit.operations, circuit.observables)
+        all_trainable = []
+        braket_circuits = []
+        for circuit in circuits:
+            trainable = (
+                BraketQubitDevice._get_trainable_parameters(circuit)
+                if self._parametrize_differentiable
+                else {}
+            )
+            all_trainable.append(trainable)
+            braket_circuits.append(
+                self._pl_to_braket_circuit(
+                    circuit,
+                    trainable_indices=frozenset(trainable.keys()),
+                    **run_kwargs,
+                )
+            )
+
+        batch_shots = 0 if self.analytic else self.shots
+
+        batch_inputs = (
+            [{f"p_{k}": v for k, v in trainable.items()} for trainable in all_trainable]
+            if self._parametrize_differentiable
+            else []
+        )
+
+        braket_results_batch = self._run_task_batch(braket_circuits, batch_shots, batch_inputs)
+
+        return [
+            self._braket_to_pl_result(braket_result, circuit)
+            for braket_result, circuit in zip(braket_results_batch, circuits)
+        ]
+
     def _pl_to_braket_circuit(
         self,
         circuit: QuantumTape,
@@ -242,6 +296,17 @@ def _apply_gradient_result_type(self, circuit, braket_circuit):
         )
         return braket_circuit
 
+    def _update_tracker_for_batch(
+        self, task_batch: Union[AwsQuantumTaskBatch, LocalQuantumTaskBatch], batch_shots: int
+    ):
+        for task in task_batch.tasks:
+            tracking_data = self._tracking_data(task)
+            self.tracker.update(**tracking_data)
+        total_executions = len(task_batch.tasks) - len(task_batch.unsuccessful)
+        total_shots = total_executions * batch_shots
+        self.tracker.update(batches=1, executions=total_executions, shots=total_shots)
+        self.tracker.record()
+
     def statistics(
         self, braket_result: GateModelQuantumTaskResult, measurements: Sequence[MeasurementProcess]
     ) -> list[float]:
@@ -525,10 +590,6 @@ class BraketAwsQubitDevice(BraketQubitDevice):
             interactions with AWS services, to be supplied if extra control
             is desired. Default: None
             Default: False
-        max_parallel (int, optional): Maximum number of tasks to run on AWS in parallel.
-            Batch creation will fail if this value is greater than the maximum allowed concurrent
-            tasks on the device. If unspecified, uses defaults defined in ``AwsDevice``.
-            Ignored if ``parallel=False``.
         max_connections (int): The maximum number of connections in the Boto3 connection pool.
             Also the maximum number of thread pool workers for the batch.
             Ignored if ``parallel=False``.
@@ -553,8 +614,6 @@ def __init__(
         poll_timeout_seconds: float = AwsQuantumTask.DEFAULT_RESULTS_POLL_TIMEOUT,
         poll_interval_seconds: float = AwsQuantumTask.DEFAULT_RESULTS_POLL_INTERVAL,
         aws_session: Optional[AwsSession] = None,
-        parallel: bool = False,
-        max_parallel: Optional[int] = None,
         max_connections: int = AwsQuantumTaskBatch.MAX_CONNECTIONS_DEFAULT,
         max_retries: int = AwsQuantumTaskBatch.MAX_RETRIES,
         **run_kwargs,
@@ -579,8 +638,6 @@ def __init__(
         self._s3_folder = s3_destination_folder
         self._poll_timeout_seconds = poll_timeout_seconds
         self._poll_interval_seconds = poll_interval_seconds
-        self._parallel = parallel
-        self._max_parallel = max_parallel
         self._max_connections = max_connections
         self._max_retries = max_retries
 
@@ -594,50 +651,19 @@ def use_grouping(self) -> bool:
         caps = self.capabilities()
         return not ("provides_jacobian" in caps and caps["provides_jacobian"])
 
-    @property
-    def parallel(self):
-        return self._parallel
-
-    def batch_execute(self, circuits, **run_kwargs):
-        if not self._parallel:
-            return super().batch_execute(circuits)
-
-        for circuit in circuits:
-            self.check_validity(circuit.operations, circuit.observables)
-        all_trainable = []
-        braket_circuits = []
-        for circuit in circuits:
-            trainable = (
-                BraketQubitDevice._get_trainable_parameters(circuit)
-                if self._parametrize_differentiable
-                else {}
-            )
-            all_trainable.append(trainable)
-            braket_circuits.append(
-                self._pl_to_braket_circuit(
-                    circuit,
-                    trainable_indices=frozenset(trainable.keys()),
-                    **run_kwargs,
-                )
-            )
-
-        batch_shots = 0 if self.analytic else self.shots
-
+    def _run_task_batch(self, batch_circuits, batch_shots: int, inputs):
         task_batch = self._device.run_batch(
-            braket_circuits,
+            batch_circuits,
             s3_destination_folder=self._s3_folder,
             shots=batch_shots,
             max_parallel=self._max_parallel,
             max_connections=self._max_connections,
             poll_timeout_seconds=self._poll_timeout_seconds,
             poll_interval_seconds=self._poll_interval_seconds,
-            inputs=(
-                [{f"p_{k}": v for k, v in trainable.items()} for trainable in all_trainable]
-                if self._parametrize_differentiable
-                else []
-            ),
+            inputs=inputs,
             **self._run_kwargs,
         )
+
         # Call results() to retrieve the Braket results in parallel.
         try:
             braket_results_batch = task_batch.results(
@@ -647,18 +673,9 @@ def batch_execute(self, circuits, **run_kwargs):
         # Update the tracker before raising an exception further if some circuits do not complete.
         finally:
             if self.tracker.active:
-                for task in task_batch.tasks:
-                    tracking_data = self._tracking_data(task)
-                    self.tracker.update(**tracking_data)
-                total_executions = len(task_batch.tasks) - len(task_batch.unsuccessful)
-                total_shots = total_executions * batch_shots
-                self.tracker.update(batches=1, executions=total_executions, shots=total_shots)
-                self.tracker.record()
+                self._update_tracker_for_batch(task_batch, batch_shots)
 
-        return [
-            self._braket_to_pl_result(braket_result, circuit)
-            for braket_result, circuit in zip(braket_results_batch, circuits)
-        ]
+        return braket_results_batch
 
     def _run_task(self, circuit, inputs=None):
         return self._device.run(
@@ -1012,6 +1029,24 @@ def __init__(
         device = LocalSimulator(backend)
         super().__init__(wires, device, shots=shots, **run_kwargs)
 
+    def _run_task_batch(self, batch_circuits, batch_shots: int, inputs):
+        task_batch = self._device.run_batch(
+            batch_circuits,
+            shots=batch_shots,
+            max_parallel=self._max_parallel,
+            inputs=inputs,
+            **self._run_kwargs,
+        )
+
+        # Should not need try-except here as this is a local sim.
+        braket_results_batch = task_batch.results()
+
+        # Update the tracker
+        if self.tracker.active:
+            self._update_tracker_for_batch(task_batch, batch_shots)
+
+        return braket_results_batch
+
     def _run_task(self, circuit, inputs=None):
         return self._device.run(
             circuit,