diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 4500beba13e56..2cf9f31bb908c 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -52,6 +52,7 @@ jobs:
           level: warning
           filter_mode: added
           lib: true
+          pyright_version: 1.1.291
       - name: pylint
         uses: dciborow/action-pylint@0.0.7
         with:
diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py
index f733c13b6d085..557e6dcaedabe 100644
--- a/onnxruntime/python/onnxruntime_inference_collection.py
+++ b/onnxruntime/python/onnxruntime_inference_collection.py
@@ -105,7 +105,6 @@ class Session:
     """

     def __init__(self):
-        # self._sess is managed by the derived class and relies on bindings from C.InferenceSession
         self._sess = None
         self._enable_fallback = True
diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py
index b431647313ad4..5ed51b45d0e57 100644
--- a/onnxruntime/python/tools/quantization/calibrate.py
+++ b/onnxruntime/python/tools/quantization/calibrate.py
@@ -633,7 +633,6 @@ def collect_value(self, name_to_arr):
         )

     def merge_histogram(self, old_histogram, data_arr, new_min, new_max, new_threshold):
-
         (old_hist, old_hist_edges, old_min, old_max, old_threshold) = old_histogram

         if new_threshold <= old_threshold:
@@ -849,7 +848,6 @@ def create_calibrator(
     use_external_data_format=False,
     extra_options={},
 ):
-
     calibrator = None
     if calibrate_method == CalibrationMethod.MinMax:
         # default settings for min-max algorithm
diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py
index 3c54748ea9df0..98ba1ce24d359 100644
--- a/onnxruntime/python/tools/quantization/onnx_quantizer.py
+++ b/onnxruntime/python/tools/quantization/onnx_quantizer.py
@@ -49,7 +49,6 @@ def __init__(
         op_types_to_quantize,
         extra_options=None,
     ):
-
         if not model_has_infer_metadata(model):
             model = save_and_reload_model(model)
         self.value_infos = {vi.name: vi for vi in model.graph.value_info}
@@ -808,7 +807,11 @@ def __quantize_inputs(
             initializer = find_by_name(node_input, self.model.initializer())
             if initializer is not None:
                 if self.per_channel and op_level_per_channel:
-                    (q_weight_name, zp_name, scale_name,) = self.quantize_weight_per_channel(
+                    (
+                        q_weight_name,
+                        zp_name,
+                        scale_name,
+                    ) = self.quantize_weight_per_channel(
                         initializer.name,
                         self.weight_qType if initializer_use_weight_qType else self.activation_qType,
                         axis,
diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py
index 7bb23084e1ca9..f76b63c6f1ce1 100644
--- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py
+++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py
@@ -233,7 +233,6 @@ def get_latency_result(runtimes, batch_size):


 def get_ort_session_inputs_and_outputs(name, session, ort_input):
-
     sess_inputs = {}
     sess_outputs = None
@@ -428,7 +427,6 @@ def inference_ort(


 def inference_ort_and_get_prediction(name, session, ort_inputs):
-
     ort_outputs = []
     for ort_input in ort_inputs:
         sess_inputs, sess_outputs = get_ort_session_inputs_and_outputs(name, session, ort_input)
@@ -548,7 +546,6 @@ def generate_onnx_model_random_input(test_times, ref_input):
     inputs = []

     for i in range(test_times):
-
         input_data = []
         for tensor in ref_input:
             shape = tensor.shape
@@ -743,7 +740,6 @@ def update_metrics_map_ori(model_to_metrics, name, ep_to_operator):
 #
 ###################################################################################################
 def update_fail_model_map(model_to_fail_ep, model_name, ep, e_type, e):
-
     if model_name in model_to_fail_ep and ep in model_to_fail_ep[model_name]:
         return
@@ -766,7 +762,6 @@ def update_fail_model_map(model_to_fail_ep, model_name, ep, e_type, e):


 def update_fail_model_map_ori(model_to_fail_ep, fail_results, model_name, ep, e_type, e):
-
     if model_name in model_to_fail_ep and ep in model_to_fail_ep[model_name]:
         return
@@ -785,7 +780,6 @@ def update_fail_model_map_ori(model_to_fail_ep, fail_results, model_name, ep, e_


 def skip_ep(model_name, ep, model_to_fail_ep):
-
     if model_name not in model_to_fail_ep:
         return False
@@ -969,7 +963,6 @@ def find_test_data_directory(path):


 def parse_models_info_from_directory(path, models):
-
     test_data_dir = find_test_data_directory(path)

     if test_data_dir:
@@ -996,7 +989,6 @@ def parse_models_info_from_directory(path, models):


 def parse_models_info_from_file(root_dir, path, models):
-
     # default working directory
     root_working_directory = root_dir + "perf/"
@@ -1004,7 +996,6 @@ def parse_models_info_from_file(root_dir, path, models):
         data = json.load(f)

         for row in data:
-
             if "root_working_directory" in row:
                 root_working_directory = row["root_working_directory"]
                 continue
@@ -1185,7 +1176,6 @@ def output_details(results, csv_filename):


 def output_fail(model_to_fail_ep, csv_filename):
-
     with open(csv_filename, mode="w", newline="") as csv_file:
         column_names = ["model", "ep", "error type", "error message"]
@@ -1220,7 +1210,6 @@ def add_status_dict(status_dict, model_name, ep, status):


 def build_status(status_dict, results, is_fail):
-
     if is_fail:
         for model, model_info in results.items():
             for ep, ep_info in model_info.items():
@@ -1240,7 +1229,6 @@ def build_status(status_dict, results, is_fail):


 def output_status(results, csv_filename):
-
     need_write_header = True
     if os.path.exists(csv_filename):
         need_write_header = False
@@ -1533,7 +1521,6 @@ def output_metrics(model_to_metrics, csv_filename):
     results = []

     for model, ep_info in model_to_metrics.items():
-
         result = {}
         result_fp16 = {}
         result["model_name"] = model
@@ -1663,7 +1650,6 @@ def test_models_eps(args, models):
         ep_results = {"latency": {}, "metrics": {}, "session": {}}

         for exec_provider in ep_list:
-
             # Skip model + EP combinations that have already failed in a previous run.
             if skip_ep(name, exec_provider, model_to_fail_ep):
                 continue
@@ -1752,7 +1738,6 @@ def run_model_on_ep(

     # use float16.py for cuda fp16 only
     if cuda_fp16 == exec_provider:
-
         # handle model
         if "model_path_fp16" in model_info:
             model_path = os.path.normpath(os.path.join(model_work_dir, model_info["model_path_fp16"]))
@@ -1942,7 +1927,6 @@ def benchmark_model_on_ep(
         return

     if result:
-
         ep_results["latency"][exec_provider] = {}
         ep_results["latency"][exec_provider]["average_latency_ms"] = result["average_latency_ms"]
         ep_results["latency"][exec_provider]["latency_90_percentile"] = result["latency_90_percentile"]
diff --git a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py
index 61cac72b271c1..82263d93dc669 100644
--- a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py
+++ b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py
@@ -119,7 +119,6 @@ def pretty_print(pp, json_object):


 def parse_single_file(f):
-
     try:
         data = json.load(f)
     except Exception as e:
diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py
index 23f1be3eeed2f..e347e9ec2d3a3 100644
--- a/onnxruntime/python/tools/transformers/benchmark.py
+++ b/onnxruntime/python/tools/transformers/benchmark.py
@@ -178,7 +178,12 @@ def run_onnxruntime(
                 fusion_options,
             )
         if "tf" in model_source:
-            (onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length,) = export_onnx_model_from_tf(
+            (
+                onnx_model_file,
+                is_valid_onnx_model,
+                vocab_size,
+                max_sequence_length,
+            ) = export_onnx_model_from_tf(
                 model_name,
                 MODELS[model_name][1],
                 MODELS[model_name][2],
diff --git a/onnxruntime/python/tools/transformers/bert_perf_test.py b/onnxruntime/python/tools/transformers/bert_perf_test.py
index 022ee076770be..a0a69786077dd 100644
--- a/onnxruntime/python/tools/transformers/bert_perf_test.py
+++ b/onnxruntime/python/tools/transformers/bert_perf_test.py
@@ -517,7 +517,7 @@ def main():
     with open(summary_file, "w+", newline="") as tsv_file:
         tsv_writer = csv.writer(tsv_file, delimiter="\t", lineterminator="\n")
         headers = None
-        for (key, perf_result) in sorted_results:
+        for key, perf_result in sorted_results:
             params = key.split(",")
             if headers is None:
                 headers = [
diff --git a/onnxruntime/python/tools/transformers/compare_bert_results.py b/onnxruntime/python/tools/transformers/compare_bert_results.py
index 07f5dba88065f..d12348c645bd2 100644
--- a/onnxruntime/python/tools/transformers/compare_bert_results.py
+++ b/onnxruntime/python/tools/transformers/compare_bert_results.py
@@ -90,7 +90,6 @@ def run_test(
     segment_ids_name,
     input_mask_name,
 ):
-
     # Try deduce input names from optimized model.
     input_ids, segment_ids, input_mask = get_bert_inputs(
         optimized_model, input_ids_name, segment_ids_name, input_mask_name
diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py
index 122a574064b47..b7b75847625ad 100644
--- a/onnxruntime/python/tools/transformers/convert_generation.py
+++ b/onnxruntime/python/tools/transformers/convert_generation.py
@@ -1165,7 +1165,6 @@ def generate_gpt2_init_decoder(

     # Try without the Casts before and after the MatMuls
     if logits_matmul_to_residual_add_path is None:
-
         # Normalization Node is : LayerNormalization
         logits_matmul_to_residual_add_path = gpt2_init_decoder_model.match_parent_path(
             logits_matmul_node,
diff --git a/onnxruntime/python/tools/transformers/fusion_embedlayer.py b/onnxruntime/python/tools/transformers/fusion_embedlayer.py
index f4ae184bdf825..ac20c3a5381e3 100644
--- a/onnxruntime/python/tools/transformers/fusion_embedlayer.py
+++ b/onnxruntime/python/tools/transformers/fusion_embedlayer.py
@@ -112,7 +112,12 @@ def check_attention_subgraph(
             logger.debug("No Attention like subgraph in children of LayerNormalization")
             return False
         else:
-            if children_types != ["Add", "MatMul", "MatMul", "MatMul",] and children_types != [
+            if children_types != [
+                "Add",
+                "MatMul",
+                "MatMul",
+                "MatMul",
+            ] and children_types != [
                 "MatMul",
                 "MatMul",
                 "MatMul",
diff --git a/onnxruntime/python/tools/transformers/models/bart/export.py b/onnxruntime/python/tools/transformers/models/bart/export.py
index c1e0f3224a445..df2de055b317a 100644
--- a/onnxruntime/python/tools/transformers/models/bart/export.py
+++ b/onnxruntime/python/tools/transformers/models/bart/export.py
@@ -32,7 +32,6 @@ def print_args(args):


 def user_command():
-
     parent_parser = argparse.ArgumentParser(add_help=False)
     parent_parser.add_argument("--max_length", type=int, default=20, help="default to 20")
     parent_parser.add_argument("--min_length", type=int, default=0, help="default to 0")
@@ -66,7 +65,6 @@ def user_command():


 if __name__ == "__main__":
-
     args = user_command()
     if args.opset_version < 14:
         raise ValueError(f"The minimum supported opset version is 14! The given one was {args.opset_version}.")
diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py
index f8dc0051b2b9e..d37c677055a94 100644
--- a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py
+++ b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py
@@ -89,7 +89,6 @@ def _create_encoder_export(args, config: BartConfig):
     def _prepare_encoder_decoder_kwargs_for_generation(
         self, input_ids: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None
     ) -> Dict[str, Any]:
-
         # retrieve encoder hidden states
         # 1. get encoder
         encoder = self.get_encoder()
@@ -189,7 +188,6 @@ def export_encoder(args):
     config, tokenizer = export_helper.initialize_config(args)

     with torch.no_grad():
-
         model, input_data = export_helper.initialize_model(config, tokenizer, args)
         start_time = time.time()
         model._prepare_encoder_decoder_kwargs_for_generation = _create_encoder_export(args, config).__get__(
diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py
index 7e50c1dbc2aac..07f571cf4f907 100644
--- a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py
+++ b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py
@@ -249,7 +249,6 @@ def export_decoder(args):
     config = decoder_config_update(config)

     with torch.no_grad():
-
         model, input_data = export_helper.initialize_model(config, tokenizer, args)
         start_time = time.time()
diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py b/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py
index 5375fa9aac5f1..c4c8a2dcba697 100644
--- a/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py
+++ b/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py
@@ -32,7 +32,6 @@ def run_inference(args):
     config, tokenizer = export_helper.initialize_config(args)

     with torch.no_grad():
-
         model, input_data = export_helper.initialize_model(config, tokenizer, args)
         batch_num = 3
         input_data = input_data.repeat(batch_num, 1)
diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py
index be303b4e188bf..e85b4e1d6e233 100644
--- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py
+++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py
@@ -130,7 +130,6 @@ def __init__(
         top_k=20,
         top_k_required_order=False,
     ):
-
         self.batch_size = input_ids.shape[0]
         self.input_length = input_ids.shape[1]
         self.n_layer = num_layer
@@ -462,7 +461,10 @@ def test_generation(
                     )
                     Gpt2Helper.auto_increase_buffer_size(output_buffers, output_shapes)

-                    (onnx_io_output, avg_latency_ms,) = Gpt2Helper.onnxruntime_inference_with_binded_io(
+                    (
+                        onnx_io_output,
+                        avg_latency_ms,
+                    ) = Gpt2Helper.onnxruntime_inference_with_binded_io(
                         session,
                         onnx_io_runner.get_inputs(),
                         output_buffers,
diff --git a/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py
index 7427b65a2bf36..451296cc2e7c1 100644
--- a/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py
+++ b/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py
@@ -149,7 +149,6 @@ def parse_arguments():

 # Create a dummy input for ONNX export.
 def get_dummy_inputs(config, export_padding, device):
-
     # When sequence length is multiple of windows size, there is no padding logic in ONNX graph
     sequence_length = config.attention_window[0] + 1 if export_padding else config.attention_window[0]
diff --git a/onnxruntime/python/tools/transformers/models/longformer/generate_test_data.py b/onnxruntime/python/tools/transformers/models/longformer/generate_test_data.py
index 379efce27b27a..774ff48248938 100644
--- a/onnxruntime/python/tools/transformers/models/longformer/generate_test_data.py
+++ b/onnxruntime/python/tools/transformers/models/longformer/generate_test_data.py
@@ -264,7 +264,6 @@ def create_longformer_test_data(
     global_mask_name,
     num_global_tokens,
 ):
-
     input_ids, input_mask, global_mask = get_longformer_inputs(model, input_ids_name, input_mask_name, global_mask_name)
     all_inputs = generate_test_data(
         batch_size,
diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py b/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py
index 8ff5b23cef1bb..7be201995df75 100644
--- a/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py
+++ b/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py
@@ -93,7 +93,6 @@ def __init__(self, decoder, lm_head, config):
         self.config = config

     def forward(self, decoder_input_ids, encoder_attention_mask, encoder_hidden_states, *past):
-
         past_key_values = PastKeyValuesHelper.group_by_layer(past, self.config.num_layers)

         decoder_outputs = self.decoder(
diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py b/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py
index 5f7f4339aea6f..8a16ff97a4cab 100644
--- a/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py
+++ b/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py
@@ -255,7 +255,7 @@ def verify_onnx(
         test_cases = [(4, 11), (1, 2), (3, 1), (8, 5)]
         test_cases_max_diff = []
-        for (batch_size, encode_sequence_length) in test_cases[:max_cases]:
+        for batch_size, encode_sequence_length in test_cases[:max_cases]:
             inputs = T5EncoderDecoderInitInputs.create_dummy(
                 model.config,
                 batch_size,
diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py
index c4dda99496ebe..81cd2f5b98f90 100644
--- a/onnxruntime/python/tools/transformers/onnx_exporter.py
+++ b/onnxruntime/python/tools/transformers/onnx_exporter.py
@@ -461,7 +461,6 @@ def export_onnx_model_from_pt(
     model_fusion_statistics,
     fusion_options,
 ):
-
     config, model = load_pt_model(model_name, model_class, cache_dir, config_modifier)
     # config, model = load_pt_model_from_tf(model_name)
     model.cpu()
diff --git a/onnxruntime/python/tools/transformers/onnx_model_bart.py b/onnxruntime/python/tools/transformers/onnx_model_bart.py
index 33db231c52332..50580cc40d1a9 100644
--- a/onnxruntime/python/tools/transformers/onnx_model_bart.py
+++ b/onnxruntime/python/tools/transformers/onnx_model_bart.py
@@ -159,7 +159,6 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
             return

         if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_v.input[0] == root_input:
-
             mask_nodes = []
             mask_index = None
             attention_last_node = reshape_qkv_2
diff --git a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py
index dc8f6810914a7..5216229fe4648 100644
--- a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py
+++ b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py
@@ -40,7 +40,6 @@ def create_attention_node(
         output: str,
         add_qk_str: str,
     ) -> Union[NodeProto, None]:
-
         assert num_heads > 0
         if hidden_size > 0 and (hidden_size % num_heads) != 0:
             logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}")
diff --git a/onnxruntime/python/tools/transformers/shape_optimizer.py b/onnxruntime/python/tools/transformers/shape_optimizer.py
index 7174af0ac9ba0..43dda700e038a 100644
--- a/onnxruntime/python/tools/transformers/shape_optimizer.py
+++ b/onnxruntime/python/tools/transformers/shape_optimizer.py
@@ -107,7 +107,6 @@ def add_extra_graph_output(self, extra_outputs):
         names_to_evaluate = []
         output_names = [output.name for output in self.model.graph.output]
         for name in extra_outputs:
-
             if self.get_initializer(name) is not None:  # already a constant
                 continue
             names_to_evaluate.append(name)
diff --git a/onnxruntime/test/providers/cpu/rnn/GRU.py b/onnxruntime/test/providers/cpu/rnn/GRU.py
index 3fee29e9928f0..6fd5a29567190 100644
--- a/onnxruntime/test/providers/cpu/rnn/GRU.py
+++ b/onnxruntime/test/providers/cpu/rnn/GRU.py
@@ -70,7 +70,6 @@ def __init__(self, **params):
             self.two = OneDirectionGRU(np.flip(X, 0), Wbw, Rbw, Bbw, H_0bw, LBR)

     def run(self):
-
         if self.direction == "bidirectional":
             f_output = self.one.execute()
             r_output = self.two.execute()
@@ -101,7 +100,6 @@ def run(self):

 class OneDirectionGRU:
     def __init__(self, X, W, R, B, initial_h, LBR):
-
         self.X = X
         # remove num_directions axis for W, R, B, H_0
         self.W = np.squeeze(W, axis=0)
@@ -170,7 +168,6 @@ def execute(self):
 class ONNXRuntimeTestContext:
     @staticmethod
     def OneDirectionWeights():
-
         hidden_size = 2

         W = np.array(
@@ -199,7 +196,18 @@ def OneDirectionWeights():
             ]
         ).astype(np.float32)

-        W_B = np.array([[0.381619, 0.0323954, -0.258721, 0.45056, -0.250755, 0.0967895,]]).astype(  # Wbz # Wbr
+        W_B = np.array(
+            [
+                [
+                    0.381619,
+                    0.0323954,
+                    -0.258721,
+                    0.45056,
+                    -0.250755,
+                    0.0967895,
+                ]
+            ]
+        ).astype(  # Wbz # Wbr
             np.float32
         )  # Wbh
         R_B = np.zeros((1, 3 * hidden_size)).astype(np.float32)
@@ -209,7 +217,6 @@ def OneDirectionWeights():

     @staticmethod
     def BidirectionalWeights():
-
         W1, R1, B1 = ONNXRuntimeTestContext.OneDirectionWeights()

         hidden_size = R1.shape[-1]
@@ -226,7 +233,6 @@ def BidirectionalWeights():
 class GRU_ONNXRuntimeUnitTests:
     @staticmethod
     def ForwardDefaultActivationsSimpleWeightsNoBiasTwoRows():
-
         print(GRU_ONNXRuntimeUnitTests.ForwardDefaultActivationsSimpleWeightsNoBiasTwoRows.__name__)

         seq_length = 2
@@ -246,7 +252,6 @@ def ForwardDefaultActivationsSimpleWeightsNoBiasTwoRows():

     @staticmethod
     def ReverseDefaultActivationsSimpleWeightsNoBiasTwoRows():
-
         print(GRU_ONNXRuntimeUnitTests.ReverseDefaultActivationsSimpleWeightsNoBiasTwoRows.__name__)

         seq_length = 2
@@ -266,7 +271,6 @@ def ReverseDefaultActivationsSimpleWeightsNoBiasTwoRows():

     @staticmethod
     def BidirectionalDefaultActivationsSimpleWeightsNoBias(linear_before_reset=0):
-
         print(
             GRU_ONNXRuntimeUnitTests.BidirectionalDefaultActivationsSimpleWeightsNoBias.__name__
             + ".linear_before_reset="
@@ -302,7 +306,6 @@ def BidirectionalDefaultActivationsSimpleWeightsNoBias(linear_before_reset=0):

     @staticmethod
     def DefaultActivationsSimpleWeightsWithBias(rows=2, direction="forward", linear_before_reset=0):
-
         print(
             GRU_ONNXRuntimeUnitTests.DefaultActivationsSimpleWeightsWithBias.__name__
             + " batch_parallel="
@@ -379,29 +382,24 @@ def ForwardDefaultActivationsSimpleWeightsWithBiasBatchParallel():

     @staticmethod
     def ForwardDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearBeforeReset():
-
         GRU_ONNXRuntimeUnitTests.DefaultActivationsSimpleWeightsWithBias(linear_before_reset=1)

     @staticmethod
     def ReverseDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearBeforeReset():
-
         GRU_ONNXRuntimeUnitTests.DefaultActivationsSimpleWeightsWithBias(direction="reverse", linear_before_reset=1)

     @staticmethod
     def ForwardDefaultActivationsSimpleWeightsWithBiasLinearBeforeReset():
-
         GRU_ONNXRuntimeUnitTests.DefaultActivationsSimpleWeightsWithBias(rows=1, linear_before_reset=1)

     @staticmethod
     def ReverseDefaultActivationsSimpleWeightsWithBiasLinearBeforeReset():
-
         GRU_ONNXRuntimeUnitTests.DefaultActivationsSimpleWeightsWithBias(
             rows=1, direction="reverse", linear_before_reset=1
         )

     @staticmethod
     def Legacy_TestGRUOpForwardBasic():
-
         print(GRU_ONNXRuntimeUnitTests.Legacy_TestGRUOpForwardBasic.__name__)

         input = np.array([[[-0.455351, -0.276391]], [[-0.185934, -0.269585]]]).astype(np.float32)
@@ -424,7 +422,6 @@ def Legacy_TestGRUOpBackwardBasic():

     @staticmethod
     def Legacy_TestGRUOpBidirectionalBasic():
-
         print(GRU_ONNXRuntimeUnitTests.Legacy_TestGRUOpBidirectionalBasic.__name__)

         input = np.array([[[-0.455351, -0.276391]], [[-0.185934, -0.269585]]]).astype(np.float32)
diff --git a/onnxruntime/test/providers/cpu/rnn/LSTM.py b/onnxruntime/test/providers/cpu/rnn/LSTM.py
index 039a419552586..6f07740a90cd7 100644
--- a/onnxruntime/test/providers/cpu/rnn/LSTM.py
+++ b/onnxruntime/test/providers/cpu/rnn/LSTM.py
@@ -32,7 +32,6 @@ def print_results(Y, Y_h, Y_c):

 class LSTM_Helper:
     def __init__(self, **params):  # type: (*Any) -> None
-
         required_inputs = ["X", "W", "R"]
         for i in required_inputs:
             assert i in params, "Missing Required Input: {0}".format(i)
@@ -110,7 +109,6 @@ def __init__(self, **params):  # type: (*Any) -> None
         )

     def run(self):
-
         if self.direction == "bidirectional":
             f_output, f_Y_h, f_Y_c = self.one.execute()
             r_output, r_Y_h, r_Y_c = self.two.execute()
@@ -171,7 +169,6 @@ def __init__(
         input_forget=False,
         clip=9999.0,
     ):
-
         self.X = X
         # remove num_directions axis for W, R, B, P, H_0, C_0
         self.W = np.squeeze(W, axis=0)
@@ -196,7 +193,6 @@ def __init__(
         self.clip = clip

     def execute(self):  # type: () -> Tuple[np.ndarray, np.ndarray]
-
         [p_i, p_o, p_f] = np.split(self.P, 3)

         h_list = []
@@ -253,7 +249,6 @@ def execute(self):  # type: () -> Tuple[np.ndarray, np.ndarray]
 class LSTM:  # Base):
     @staticmethod
     def SimpleWeightsNoBiasTwoRows(direction):  # type: () -> None
-
         print(LSTM.SimpleWeightsNoBiasTwoRows.__name__ + " direction=" + direction)

         seq_length = 2
@@ -285,7 +280,6 @@ def SimpleWeightsNoBiasTwoRows(direction):  # type: () -> None

     @staticmethod
     def LargeBatchWithClip(clip):
-
         print(LSTM.LargeBatchWithClip.__name__ + " clip=" + str(clip))

         seq_length = 2
@@ -343,7 +337,6 @@ def BatchParallelFalseSeqLengthGreaterThanOne():

     @staticmethod
     def export_initial_bias():  # type: () -> None
-
         print(LSTM.export_initial_bias.__name__)

         input = np.array([[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]]).astype(np.float32)
@@ -408,13 +401,11 @@ def export_peepholes():  # type: () -> None


 class ONNXRuntimeTestContext:
-
     hidden_size = 2
     input_size = 2

     @staticmethod
     def OneDirectionWeights():
-
         num_directions = 1
         hidden_size = ONNXRuntimeTestContext.hidden_size
         input_size = ONNXRuntimeTestContext.input_size
@@ -506,7 +497,6 @@ def OneDirectionWeights():

     @staticmethod
     def BidirectionalWeights():
-
         hidden_size = ONNXRuntimeTestContext.hidden_size
         input_size = ONNXRuntimeTestContext.input_size
@@ -609,7 +599,6 @@ def ONNXRuntime_TestLSTMForwardCellState():

     @staticmethod
     def ONNXRuntime_TestLSTMActivation():
-
         print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMActivation.__name__)

         input = ONNXRuntimeTestContext.DefaultInput()
@@ -628,7 +617,6 @@ def ONNXRuntime_TestLSTMActivation():

     @staticmethod
     def ONNXRuntime_TestLSTMBatchReallocation():
-
         print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMBatchReallocation.__name__)
         seq_length = 2
         batch_size = 1
@@ -686,7 +674,6 @@ def ONNXRuntime_TestLSTMBatchReallocation():

     @staticmethod
     def ONNXRuntime_TestLSTMOutputWrite():
-
         print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMOutputWrite.__name__)
         seq_length = 2
         batch_size = 1
diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py
index 89fd90ad3a19e..f453532e6e516 100644
--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@@ -996,7 +996,6 @@ def testRegisterCustomOpsLibrary(self):
         )

     def testOrtValue(self):
-
         numpy_arr_input = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32)
         numpy_arr_output = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32)
diff --git a/onnxruntime/test/python/onnxruntime_test_python_azure.py b/onnxruntime/test/python/onnxruntime_test_python_azure.py
index f10b4ec3e3e78..5dbac603b2407 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_azure.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_azure.py
@@ -7,10 +7,8 @@


 class TestAmlEndpoint(unittest.TestCase):
-
     # test an endpoint of adding floats
     def testAddf(self):
-
         sess_opt = ort.SessionOptions()
         sess_opt.add_session_config_entry("azure.endpoint_type", "triton")
         sess_opt.add_session_config_entry("azure.uri", "https://endpoint-2930.westus2.inference.ml.azure.com")
@@ -35,7 +33,6 @@ def testAddf(self):

     # test an endpoint of adding doubles
     def testAddf8(self):
-
         sess_opt = ort.SessionOptions()
         sess_opt.add_session_config_entry("azure.endpoint_type", "triton")
         sess_opt.add_session_config_entry("azure.uri", "https://endpoint-1364.westus2.inference.ml.azure.com")
@@ -60,7 +57,6 @@ def testAddf8(self):

     # test an endpoint of adding int
     def testAddi4(self):
-
         sess_opt = ort.SessionOptions()
         sess_opt.add_session_config_entry("azure.endpoint_type", "triton")
         sess_opt.add_session_config_entry("azure.uri", "https://endpoint-9879.westus2.inference.ml.azure.com")
@@ -85,7 +81,6 @@ def testAddi4(self):

     # test an endpoint of "And"
     def testAnd(self):
-
         sess_opt = ort.SessionOptions()
         sess_opt.add_session_config_entry("azure.endpoint_type", "triton")
         sess_opt.add_session_config_entry("azure.uri", "https://endpoint-6811.westus2.inference.ml.azure.com")
diff --git a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py
index ff1c0d17fd3ec..323ccb2671944 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py
@@ -67,7 +67,6 @@ def test_bind_input_to_cpu_arr(self):
         self.assertTrue(np.array_equal(self.create_expected_output(), ort_output))

     def test_bind_input_types(self):
-
         opset = onnx_opset_version()
         devices = [
             (
@@ -99,7 +98,6 @@ def test_bind_input_types(self):
             np.bool_,
         ]:
             with self.subTest(dtype=dtype, device=str(device)):
-
                 x = np.arange(8).reshape((-1, 2)).astype(dtype)
                 proto_dtype = NP_TYPE_TO_TENSOR_TYPE[x.dtype]
diff --git a/onnxruntime/test/python/onnxruntime_test_python_keras.py b/onnxruntime/test/python/onnxruntime_test_python_keras.py
index fb94f67757844..bb46e19b751f9 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_keras.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_keras.py
@@ -44,7 +44,6 @@ def custom_activation(scope, operator, container):

 class TestInferenceSessionKeras(unittest.TestCase):
     def testRunModelConv(self):
-
         # keras model
         N, C, H, W = 2, 3, 5, 5
         x = np.random.rand(N, H, W, C).astype(np.float32, copy=False)
diff --git a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py
index fed6892f13f4e..a8434c4b5c0fc 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py
@@ -39,7 +39,6 @@ def unique_element(lst):

 class TestSymbolicShapeInference(unittest.TestCase):
     def test_symbolic_shape_infer(self):
-
         cwd = os.getcwd()
         test_model_dir = os.path.join(cwd, "..", "models")
         for filename in Path(test_model_dir).rglob("*.onnx"):
diff --git a/onnxruntime/test/python/quantization/test_op_where.py b/onnxruntime/test/python/quantization/test_op_where.py
index 43d6fe4fd442a..102716810e74f 100644
--- a/onnxruntime/test/python/quantization/test_op_where.py
+++ b/onnxruntime/test/python/quantization/test_op_where.py
@@ -54,7 +54,6 @@ def construct_model(model_path, input_shape):
         save(model, model_path)

     def quantize_where_test(self, activation_type, weight_type, extra_options={}):
-
         model_fp32_path = "where_fp32.onnx"
         input_shape = [2, 2]
         self.construct_model(model_fp32_path, input_shape)
diff --git a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py
index 5a26cd36115f9..545b47c6080ea 100644
--- a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py
+++ b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py
@@ -258,7 +258,6 @@ def test_create_weight_matching(self):
         )

     def test_create_weight_matching_per_channel(self):
-
         # float model
         #    (input)
         #       |
diff --git a/onnxruntime/test/python/quantization/test_symmetric_flag.py b/onnxruntime/test/python/quantization/test_symmetric_flag.py
index 26f7ba6ce59b3..b9590bd10c77a 100644
--- a/onnxruntime/test/python/quantization/test_symmetric_flag.py
+++ b/onnxruntime/test/python/quantization/test_symmetric_flag.py
@@ -17,7 +17,6 @@ class TestSymmetricFlag(unittest.TestCase):
     def setUp(self):
-
         # Set up symmetrically and asymmetrically disributed values for activations
         self.symmetric_activations = [
             -1 * np.ones([1, 2, 32, 32], dtype="float32"),
@@ -45,7 +44,6 @@ def setUp(self):
         )

     def perform_quantization(self, activations, weight, act_sym, wgt_sym):
-
         # One-layer convolution model
         act = helper.make_tensor_value_info("ACT", TensorProto.FLOAT, activations[0].shape)
         wgt = helper.make_tensor_value_info("WGT", TensorProto.FLOAT, weight.shape)
@@ -86,7 +84,6 @@ def get_next(self):
         return act_zp, act_sc, wgt_zp, wgt_sc

     def test_0(self):
-
         act_zp, act_sc, wgt_zp, wgt_sc = self.perform_quantization(
             self.asymmetric_activations,
             self.asymmetric_weights,
@@ -104,7 +101,6 @@ def test_0(self):
         self.assertEqual(wgt_zp, 0)

     def test_1(self):
-
         act_zp, act_sc, wgt_zp, wgt_sc = self.perform_quantization(
             self.asymmetric_activations,
             self.asymmetric_weights,
@@ -121,7 +117,6 @@ def test_1(self):
         self.assertNotEqual(wgt_zp, 0)

     def test_2(self):
-
         act_zp, act_sc, wgt_zp, wgt_sc = self.perform_quantization(
             self.symmetric_activations,
             self.symmetric_weights,
@@ -138,7 +133,6 @@ def test_2(self):
         self.assertEqual(wgt_zp, 0)

     def test_3(self):
-
         act_zp, act_sc, wgt_zp, wgt_sc = self.perform_quantization(
             self.symmetric_activations,
             self.symmetric_weights,
@@ -156,5 +150,4 @@ def test_3(self):

 if __name__ == "__main__":
-
     unittest.main()
diff --git a/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py b/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py
index 7f613a8674989..9d2b2f60fa3a4 100644
--- a/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py
+++ b/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py
@@ -446,7 +446,6 @@ def generate_test_data(
     test_cases=1,
     output_optimized_model=False,
 ):
-
     for test_case in range(test_cases):
         sequence_length = 3
         input_1 = np.random.randint(dictionary_size, size=(batch_size, 1), dtype=np.int64)
diff --git a/onnxruntime/test/python/transformers/test_generation.py b/onnxruntime/test/python/transformers/test_generation.py
index 5f790c702508c..db50755d94f07 100644
--- a/onnxruntime/test/python/transformers/test_generation.py
+++ b/onnxruntime/test/python/transformers/test_generation.py
@@ -74,7 +74,6 @@ def check_for_init_decoder_attr(self, model_path: str):
         self.assertTrue(init_decoder_found)

     def run_beam_search(self, extra_arguments: str, sentences=None, append_arguments=True, is_greedy=False):
-
         if append_arguments:
             arguments = " ".join(self.default_arguments + [extra_arguments]).split()
         else:
diff --git a/onnxruntime/test/testdata/transform/fusion/gemm_transpose_gen.py b/onnxruntime/test/testdata/transform/fusion/gemm_transpose_gen.py
index 276330c2064a2..c8b478b893a6d 100644
--- a/onnxruntime/test/testdata/transform/fusion/gemm_transpose_gen.py
+++ b/onnxruntime/test/testdata/transform/fusion/gemm_transpose_gen.py
@@ -77,6 +77,7 @@ def gen_gemm_inputs_output_transposed(model_path):
 gen_gemm_output_transposed("gemm_transpose_output_transposed.onnx")
 gen_gemm_inputs_output_transposed("gemm_transpose_inputs_output_transposed.onnx")

+
 # (A'(B')) = BA
 def gen_gemm_inputs_output_transposed_2(model_path):
     nodes = [
diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_scale_gen.py b/onnxruntime/test/testdata/transform/fusion/matmul_scale_gen.py
index 68d4cab6dd6d9..260379d5af9c9 100644
--- a/onnxruntime/test/testdata/transform/fusion/matmul_scale_gen.py
+++ b/onnxruntime/test/testdata/transform/fusion/matmul_scale_gen.py
@@ -191,7 +191,6 @@ def gen_int32(model_path):


 def gen_scale_input(model_path):
-
     nodes = [
         helper.make_node("Mul", ["input_0", "scale"], ["scaled_input_0"], "scale input_0"),
         helper.make_node(
diff --git a/onnxruntime/test/testdata/transform/propagate_cast/gen_propagate_cast.py b/onnxruntime/test/testdata/transform/propagate_cast/gen_propagate_cast.py
index 910ff93a32ead..31753208c0b1e 100644
--- a/onnxruntime/test/testdata/transform/propagate_cast/gen_propagate_cast.py
+++ b/onnxruntime/test/testdata/transform/propagate_cast/gen_propagate_cast.py
@@ -29,9 +29,7 @@ def save(model_path, nodes, inputs, outputs, initializers):


 def gen_fuse_back2back_casts(model_path):
-
-    for (type1, type2) in list(itertools.product([TensorProto.FLOAT, TensorProto.FLOAT16], repeat=2)):
-
+    for type1, type2 in list(itertools.product([TensorProto.FLOAT, TensorProto.FLOAT16], repeat=2)):
         nodes = [
             helper.make_node("MatMul", ["input_0", "input_1"], ["product"], "MatMul_0"),
             helper.make_node("Cast", ["product"], ["product_cast"], "Cast_0", to=type1),
@@ -64,8 +62,7 @@ def gen_fuse_back2back_casts(model_path):


 def gen_fuse_sibling_casts(model_path):
-
-    for (type1, type2) in list(itertools.product([TensorProto.FLOAT, TensorProto.FLOAT16], repeat=2)):
+    for type1, type2 in list(itertools.product([TensorProto.FLOAT, TensorProto.FLOAT16], repeat=2)):
         input_type = (
             type2 if type1 != type2 else (TensorProto.FLOAT16 if type1 == TensorProto.FLOAT else TensorProto.FLOAT)
         )
diff --git a/orttraining/orttraining/eager/test_models/mnist_fc.py b/orttraining/orttraining/eager/test_models/mnist_fc.py
index 0f0b3bb604149..0b7e78a943bde 100644
--- a/orttraining/orttraining/eager/test_models/mnist_fc.py
+++ b/orttraining/orttraining/eager/test_models/mnist_fc.py
@@ -31,7 +31,6 @@ def forward(self, x):
 device = torch_ort.device()

 with torch.no_grad():
-
     model = NeuralNet(input_size, hidden_size, num_classes)
     pred = model(batch)
     print("inference result is: ")
diff --git a/orttraining/orttraining/python/checkpointing_utils.py b/orttraining/orttraining/python/checkpointing_utils.py
index 359f6a8c53552..ea2b220d6753c 100644
--- a/orttraining/orttraining/python/checkpointing_utils.py
+++ b/orttraining/orttraining/python/checkpointing_utils.py
@@ -44,7 +44,6 @@ def _split_state_dict(state_dict):

 class CombineZeroCheckpoint(object):
     def __init__(self, checkpoint_files, clean_state_dict=None):
-
         assert len(checkpoint_files) > 0, "No checkpoint files passed"
         self.checkpoint_files = checkpoint_files
         self.clean_state_dict = clean_state_dict
diff --git a/orttraining/orttraining/python/training/api/lr_scheduler.py b/orttraining/orttraining/python/training/api/lr_scheduler.py
index cff7eaaa14555..5783ee316d203 100644
--- a/orttraining/orttraining/python/training/api/lr_scheduler.py
+++ b/orttraining/orttraining/python/training/api/lr_scheduler.py
@@ -22,7 +22,6 @@ class LinearLRScheduler:
     """

     def __init__(self, optimizer, warmup_step_count, total_step_count, initial_lr) -> None:
-
         self._scheduler = C.LinearLRScheduler(optimizer._optimizer, warmup_step_count, total_step_count, initial_lr)

     def step(self):
diff --git a/orttraining/orttraining/python/training/checkpoint.py b/orttraining/orttraining/python/training/checkpoint.py
index e4a2f1230b7a4..b0331ee8c7367 100644
--- a/orttraining/orttraining/python/training/checkpoint.py
+++ b/orttraining/orttraining/python/training/checkpoint.py
@@ -659,7 +659,6 @@ def _split_state_dict(state_dict):

 class _CombineZeroCheckpoint(object):
     def __init__(self, checkpoint_files, clean_state_dict=None):
-
         assert len(checkpoint_files) > 0, "No checkpoint files passed"
         self.checkpoint_files = checkpoint_files
         self.clean_state_dict = clean_state_dict
diff --git a/orttraining/orttraining/python/training/optim/fused_adam.py b/orttraining/orttraining/python/training/optim/fused_adam.py
index 30ebcf30e4844..9b71d4473c8ad 100644
--- a/orttraining/orttraining/python/training/optim/fused_adam.py
+++ b/orttraining/orttraining/python/training/optim/fused_adam.py
@@ -72,7 +72,6 @@ def __init__(
         weight_decay=0.0,
         set_grad_none=True,
     ):
-
         # The FusedAdam implementation is mathematically equivalent to
         # transformers AdamW. The input arguments also have the same defaults.
diff --git a/orttraining/orttraining/python/training/optim/lr_scheduler.py b/orttraining/orttraining/python/training/optim/lr_scheduler.py index cbe013d32f310..1141e78ca4820 100644 --- a/orttraining/orttraining/python/training/optim/lr_scheduler.py +++ b/orttraining/orttraining/python/training/optim/lr_scheduler.py @@ -273,7 +273,6 @@ def __init__(self, total_steps, lr_end=1e-7, power=1.0, warmup=0.002): self._num_warmup_steps = warmup * total_steps def _warmup_poly(self, train_step_info): - assert ( train_step_info.optimizer_config.lr > self.lr_end ), f"lr_end ({lr_end}) must be be smaller than initial lr ({train_step_info.optimizer_config.lr})" diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function.py index 1c2fce2b1a80e..0de0e4fe4ea9f 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function.py @@ -30,9 +30,9 @@ def state(self, val): custom_autograd_function_enabler = Enabler() + # Legacy API to enable the custom autograd, keep its name with default value for compatibility. def enable_custom_autograd_support(to_enable=True): - import atexit from torch.onnx import register_custom_op_symbolic, unregister_custom_op_symbolic diff --git a/orttraining/orttraining/python/training/ortmodule/_fallback.py b/orttraining/orttraining/python/training/ortmodule/_fallback.py index 7129e522b8c49..71642e3d1fc66 100644 --- a/orttraining/orttraining/python/training/ortmodule/_fallback.py +++ b/orttraining/orttraining/python/training/ortmodule/_fallback.py @@ -69,7 +69,6 @@ class _FallbackManager(object): """ def __init__(self, pytorch_module: torch.nn.Module, policy: _FallbackPolicy, retry: bool): - self._original_module = pytorch_module # Read policy from environment variable for testing purposes @@ -133,7 +132,6 @@ def _set_exception(policy: _FallbackPolicy, exception: Exception, log_level: _lo and type(exception) in self._policy_exception_map[policy.value] ) ): - if log_level <= _logger.LogLevel.INFO: warnings.warn(f"Fallback for policy {policy.name} is pending.", UserWarning) diff --git a/orttraining/orttraining/python/training/ortmodule/_utils.py b/orttraining/orttraining/python/training/ortmodule/_utils.py index a43f0c3e66c7d..1cd70ad4c5fdc 100644 --- a/orttraining/orttraining/python/training/ortmodule/_utils.py +++ b/orttraining/orttraining/python/training/ortmodule/_utils.py @@ -239,7 +239,6 @@ def check_for_name_collisions_and_bind_methods_to_ortmodule(ortmodule: torch.nn. or not inspect.ismethod(torch_module_attributes[attribute_name]) or attribute.__func__ != torch_module_attributes[attribute_name].__func__ ): - # forward is expected to be defined by the user. if attribute_name == "forward": continue @@ -294,7 +293,6 @@ def get_state_after_deletion_of_non_ortmodule_methods(ortmodule, user_module): and inspect.ismethod(ortmodule_attributes[attribute_name]) and attribute.__func__ == ortmodule_attributes[attribute_name].__func__ ): - # forward is expected to be defined by the user. 
if attribute_name == "forward": continue diff --git a/orttraining/orttraining/python/training/ortmodule/ortmodule.py b/orttraining/orttraining/python/training/ortmodule/ortmodule.py index 18000e0462d00..3f8a1467ecea8 100644 --- a/orttraining/orttraining/python/training/ortmodule/ortmodule.py +++ b/orttraining/orttraining/python/training/ortmodule/ortmodule.py @@ -35,7 +35,6 @@ class ORTModule(torch.nn.Module): """ def __init__(self, module, debug_options=None): - # NOTE: torch.nn.Modules that call setattr on their internal attributes regularly # (for example PyTorch Lightning), will trigger regular re-exports. This is # because ORTModule auto detects such setattrs on the original module and @@ -283,13 +282,11 @@ def __getattr__(self, name: str): return super(ORTModule, self).__getattr__(name) def __setattr__(self, name: str, value) -> None: - if name in self.__dict__: # If the name is an attribute of ORTModule, update only ORTModule self.__dict__[name] = value elif "_is_initialized" in self.__dict__ and self.__dict__["_is_initialized"] is True: - assert "_torch_module" in self.__dict__, "ORTModule does not have a reference to the user's model" # If the name is an attribute of user model, or is a new attribute, update there. diff --git a/orttraining/orttraining/python/training/orttrainer.py b/orttraining/orttraining/python/training/orttrainer.py index bdf6a1e9e1ea1..a7552928bd22b 100644 --- a/orttraining/orttraining/python/training/orttrainer.py +++ b/orttraining/orttraining/python/training/orttrainer.py @@ -710,6 +710,7 @@ def _create_ort_training_session(self, optimizer_state_dict={}, session_options= # old ort session may already exists and occupies GPU memory when creating new session, this may cause OOM error. # for example, load_state_dict will be called before returing the function, and it calls _init_session again del self._training_session + # Set provider-specific options if needed def get_providers(provider_options): providers = ort.get_available_providers() diff --git a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py index bd397edf67e7e..b4d5c4e0b310f 100644 --- a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py +++ b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py @@ -101,12 +101,14 @@ def _get_onnx_supported_table() -> Set[str]: return onnx_supported_ops -def _get_support_dictionaries_and_decomposition_tables() -> Tuple[ - Dict[torch._ops.OpOverload, Any], - Dict[str, Any], - Dict[torch._ops.OpOverload, Callable], - Dict[torch._ops.OpOverload, Callable], -]: +def _get_support_dictionaries_and_decomposition_tables() -> ( + Tuple[ + Dict[torch._ops.OpOverload, Any], + Dict[str, Any], + Dict[torch._ops.OpOverload, Callable], + Dict[torch._ops.OpOverload, Callable], + ] +): # The keys of this dictionary are OpOverload's which can be # exported by ONNX exporter. Type of key is torch._ops.OpOverload. # For example, if torch.ops.aten.add.default is a key in support_dict, @@ -263,7 +265,6 @@ def _move_placeholder_to_front(graph_module: torch.fx.GraphModule) -> None: def _replace_to_copy_with_to(fx_module: torch.fx.GraphModule) -> None: # aten._to_copy doesn't have exporter so we replace it with aten.to. 
for node in fx_module.graph.nodes: - if ( isinstance(node.target, torch._ops.OpOverload) and node.target.overloadpacket == torch.ops.aten._to_copy # type: ignore diff --git a/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py b/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py index a087a97da5a54..0fd3148b586c1 100644 --- a/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py +++ b/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py @@ -109,7 +109,6 @@ def bert_model_description(config): def create_pretraining_dataset(input_file, max_pred_length, args): - train_data = pretraining_dataset(input_file=input_file, max_pred_length=max_pred_length) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader( @@ -162,7 +161,6 @@ def __getitem__(self, index): def parse_arguments(): - parser = argparse.ArgumentParser() # batch size test config parameters @@ -348,7 +346,6 @@ def to_sanitized_dict(self) -> Dict[str, Any]: def setup_training(args): - assert torch.cuda.is_available() if args.local_rank == -1: diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 7758603c484fc..e2cb48d5e5f68 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -2161,7 +2161,6 @@ def run_step(model, x): def test_bert_inputs_with_dynamic_shape(): - # create pytorch model with dropout disabled pt_model = _get_bert_for_sequence_classification_model( "cuda", is_training=True, hidden_dropout_prob=0.0, attention_probs_dropout_prob=0.0 @@ -2748,7 +2747,6 @@ def forward(self, x): @pytest.mark.parametrize("device", ["cuda", "cuda:0", "cuda:1", "cuda:2"]) def test_model_with_different_cuda_devices(device): - # Trick to run this test in single GPU machines device_id = _utils.get_device_index(device) if device_id >= torch.cuda.device_count(): @@ -2905,7 +2903,6 @@ def forward(self, input1, input2, input3): @pytest.mark.parametrize("data_device, model_device", (["cuda", "cpu"], ["cpu", "cuda"])) def test_forward_data_and_model_on_different_devices(data_device, model_device): - os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" N, D_in, H, D_out = 64, 784, 500, 10 @@ -3039,7 +3036,6 @@ def test_model_wrapped_inside_torch_no_grad(): def test_model_initializer_requires_grad_changes_from_one_forward_to_next(): - os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" device = "cuda" @@ -3432,7 +3428,6 @@ def train_step(model, x): def test_forward_dynamic_args(): - os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" device = "cuda" @@ -3446,7 +3441,6 @@ def test_forward_dynamic_args(): # Make sure model runs without any exception for i in range(2): - # Test both train and inference mode if i % 2 == 0: model.train() @@ -3478,7 +3472,6 @@ def test_forward_dynamic_args(): def test_forward_dynamic_kwargs(): - os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" one = torch.FloatTensor([1]) @@ -3487,7 +3480,6 @@ def test_forward_dynamic_kwargs(): # Make sure model runs without any exception for i in range(2): - # Test both train and inference mode if i % 2 == 0: model.train() @@ -3641,7 +3633,6 @@ def forward(self, x): def test_forward_call_default_input(): - os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" class UnusedNet(torch.nn.Module): @@ -3767,7 +3758,6 @@ def forward(self, 
input1=None, input2=None): def test_forward_call_lots_None(): - os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" class NoneNet(torch.nn.Module): @@ -3915,7 +3905,6 @@ def forward(self, input1, bool_argument, int_argument, float_argument): @pytest.mark.parametrize("bool_arguments", [(True, False), (False, True)]) def test_changing_bool_input_re_exports_model(bool_arguments): - os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" class PrimitiveTypesInputNet(torch.nn.Module): @@ -4088,7 +4077,6 @@ def forward( @pytest.mark.parametrize("device", ["cuda", "cpu", None]) def test_stateless_model_specified_device(device): - N, D_in, H, D_out = 32, 784, 500, 10 pt_model = StatelessModel().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -4103,7 +4091,6 @@ def test_stateless_model_specified_device(device): def test_stateless_model_unspecified_device(): - N, D_in, H, D_out = 32, 784, 500, 10 pt_model = StatelessModel() ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -4209,7 +4196,6 @@ def test_hf_save_pretrained(): def test_ortmodule_string_inputs_are_ignored(): - pt_model = MyStrNet() ort_model = ORTModule(copy.deepcopy(pt_model)) x = torch.randn(1, 2) @@ -4317,7 +4303,6 @@ def forward(self, batch): @pytest.mark.parametrize("mode", ["training", "inference"]) def test_debug_options_save_onnx_models_os_environment(mode): - device = "cuda" N, D_in, H, D_out = 64, 784, 500, 10 # Create a temporary directory for the onnx_models @@ -4341,7 +4326,6 @@ def test_debug_options_save_onnx_models_os_environment(mode): @pytest.mark.parametrize("mode", ["training", "inference"]) def test_debug_options_save_onnx_models_cwd(mode): - device = "cuda" N, D_in, H, D_out = 64, 784, 500, 10 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) @@ -4366,7 +4350,6 @@ def test_debug_options_save_onnx_models_cwd(mode): def test_debug_options_save_onnx_models_validate_fail_on_non_writable_dir(): - os.environ["ORTMODULE_SAVE_ONNX_PATH"] = "/non/existent/directory" with pytest.raises(Exception) as ex_info: _ = DebugOptions(save_onnx=True, onnx_prefix="my_model") @@ -4764,7 +4747,6 @@ def forward(self, a): def test_ortmodule_setattr_signals_model_changed(): - os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" class UserNet(torch.nn.Module): @@ -4899,7 +4881,6 @@ def test_ortmodule_skip_check_load_from_os_env(policy_str, policy): @pytest.mark.parametrize("is_training,deterministic", list(itertools.product([True, False], repeat=2))) def test_ortmodule_determinism_flag(is_training, deterministic): - torch.use_deterministic_algorithms(deterministic) N, D_in, H, D_out = 64, 784, 500, 10 @@ -5024,7 +5005,6 @@ def __init__(self, module, debug_options=None): def test_ortmodule_fused_adam_optimizer_correctness(): - torch.manual_seed(8888) device = "cuda" @@ -5073,7 +5053,6 @@ def run_optim_step(optimizer): def test_ortmodule_fused_adam_optimizer_correctness_torch(): - torch.manual_seed(8888) device = "cuda" @@ -5295,7 +5274,6 @@ def test_opset_version_change(opset_version): def test_serialize_ortmodule(): - device = "cuda" N, D_in, H, D_out = 64, 784, 500, 10 pt_model = SerializationNet(D_in, H, D_out).to(device) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py index 8f1d57ff138a8..c6a5ea7067705 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py +++ 
b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py @@ -45,7 +45,6 @@ def train(model, optimizer, scheduler, train_dataloader, epoch, device, args): # For each batch of training data... for step, batch in enumerate(train_dataloader): - if step == args.train_steps: break @@ -155,7 +154,6 @@ def test(model, validation_dataloader, device, args): # Telling the model not to compute or store gradients, saving memory and # speeding up validation with torch.no_grad(): - # Forward pass, calculate logit predictions. # This will return the logits rather than the loss because we have # not provided labels. diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py index 42697766c9815..56f0b476ff9cf 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py @@ -51,7 +51,6 @@ def train(model, optimizer, scaler, scheduler, train_dataloader, epoch, device, # For each batch of training data... for step, batch in enumerate(train_dataloader): - if step == args.train_steps: break @@ -158,7 +157,6 @@ def test(model, validation_dataloader, device, args): # Telling the model not to compute or store gradients, saving memory and # speeding up validation with torch.no_grad(): - # Forward pass, calculate logit predictions. # This will return the logits rather than the loss because we have # not provided labels. diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py index e1a7dd591ec36..7372bfb420583 100755 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py @@ -41,7 +41,6 @@ def forward(self, input1): def get_dataloader(args, rank, batch_size): - # Data loading code train_dataset = torchvision.datasets.MNIST( root=args.data_dir, train=True, transform=transforms.ToTensor(), download=True @@ -177,7 +176,6 @@ def test(args, model, device, loss_fn, test_loader): def train(rank: int, args, world_size: int, epochs: int): - # DDP init example dist_init(rank, world_size) torch.backends.cudnn.deterministic = True @@ -242,7 +240,6 @@ def train(rank: int, args, world_size: int, epochs: int): if __name__ == "__main__": - parser = argparse.ArgumentParser( description="Benchmark the optimizer state sharding, on a typical computer vision workload" ) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fallback.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fallback.py index 6cde304a6570b..8f439f87c3fdb 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fallback.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fallback.py @@ -379,7 +379,6 @@ def test_ortmodule_fallback_init__torch_version(is_training, fallback_enabled, m runtime_pytorch_version = version.parse(torch.__version__.split("+")[0]) minimum_runtime_pytorch_version = version.parse(MINIMUM_RUNTIME_PYTORCH_VERSION_STR) if runtime_pytorch_version < minimum_runtime_pytorch_version: - if fallback_enabled: if matching_policy: policy = "FALLBACK_BAD_INITIALIZATION" @@ -443,7 +442,6 @@ def 
test_ortmodule_fallback_init__missing_cpp_extensions( f" It requires PyTorch CPP extensions to be missing" ) else: - if fallback_enabled: if matching_policy: policy = "FALLBACK_BAD_INITIALIZATION" diff --git a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py index 57b5af656eb66..596e6db33a54a 100644 --- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py +++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py @@ -265,7 +265,6 @@ def testDynamicLossScaler(): # Performing 9*2000 updates to cover all branches of LossScaler.update(train_step_info.all_finite=True) loss_scale = float(1 << 16) for cycles in range(1, 10): - # 1999 updates without overflow produces 1999 stable steps for i in range(1, 2000): new_loss_scale = default_scaler.update(train_step_info) @@ -1606,7 +1605,6 @@ def testLossScalerLegacyAndExperimentalFullCycle(): # Performing 9*2000 updates to cover all branches of LossScaler.update(train_step_info.all_finite=True) for cycles in range(1, 10): - # 1999 updates without overflow produces 1999 stable steps for i in range(1, 2000): new_loss_scale = new_ls.update(train_step_info) diff --git a/orttraining/orttraining/test/python/orttraining_transformer_trainer.py b/orttraining/orttraining/test/python/orttraining_transformer_trainer.py index 0185670dac79f..5e50e98df807a 100644 --- a/orttraining/orttraining/test/python/orttraining_transformer_trainer.py +++ b/orttraining/orttraining/test/python/orttraining_transformer_trainer.py @@ -245,7 +245,6 @@ def train(self): for epoch in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args.local_rank not in [-1, 0]) for step, inputs in enumerate(epoch_iterator): - # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 diff --git a/orttraining/orttraining/test/python/perf_log/ort_module_perf_test_tools.py b/orttraining/orttraining/test/python/perf_log/ort_module_perf_test_tools.py index b7b619a92e53b..2e1ce8cf37745 100644 --- a/orttraining/orttraining/test/python/perf_log/ort_module_perf_test_tools.py +++ b/orttraining/orttraining/test/python/perf_log/ort_module_perf_test_tools.py @@ -102,6 +102,7 @@ def get_repo_commit(repo_path): %(RunConfig)s,\ %(Time)s)" + # Obtain connection string information from the portal def connect_to_perf_dashboard_db(mysql_server_name, power_bi_user_name, password, database): config = { diff --git a/orttraining/orttraining/test/python/utils_multiple_choice.py b/orttraining/orttraining/test/python/utils_multiple_choice.py index 562ecbf8c496d..05fd219ebbb5b 100644 --- a/orttraining/orttraining/test/python/utils_multiple_choice.py +++ b/orttraining/orttraining/test/python/utils_multiple_choice.py @@ -116,7 +116,6 @@ def __init__( # and the others will use the cache. 
diff --git a/orttraining/orttraining/test/python/utils_multiple_choice.py b/orttraining/orttraining/test/python/utils_multiple_choice.py
index 562ecbf8c496d..05fd219ebbb5b 100644
--- a/orttraining/orttraining/test/python/utils_multiple_choice.py
+++ b/orttraining/orttraining/test/python/utils_multiple_choice.py
@@ -116,7 +116,6 @@ def __init__(
         # and the others will use the cache.
         lock_path = cached_features_file + ".lock"
         with FileLock(lock_path):
-
             if os.path.exists(cached_features_file) and not overwrite_cache:
                 logger.info(f"Loading features from cached file {cached_features_file}")
                 self.features = torch.load(cached_features_file)
@@ -218,7 +217,7 @@ def convert_examples_to_features(
     label_map = {label: i for i, label in enumerate(label_list)}

     features = []
-    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
+    for ex_index, example in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
         if ex_index % 10000 == 0:
             logger.info("Writing example %d of %d" % (ex_index, len(examples)))
         choices_inputs = []
diff --git a/orttraining/tools/scripts/nv_run_pretraining.py b/orttraining/tools/scripts/nv_run_pretraining.py
index 3e51a8886ecb6..fd35ee7799572 100644
--- a/orttraining/tools/scripts/nv_run_pretraining.py
+++ b/orttraining/tools/scripts/nv_run_pretraining.py
@@ -59,7 +59,6 @@


 def create_pretraining_dataset(input_file, max_pred_length, shared_list, args):
-
     train_data = pretraining_dataset(input_file=input_file, max_pred_length=max_pred_length)
     train_sampler = RandomSampler(train_data)
     train_dataloader = DataLoader(
@@ -90,7 +89,6 @@ def __len__(self):
         return len(self.inputs[0])

     def __getitem__(self, index):
-
         [input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels] = [
             torch.from_numpy(input[index].astype(np.int64))
             if indice < 5
@@ -110,7 +108,6 @@ def __getitem__(self, index):


 def parse_arguments():
-
     parser = argparse.ArgumentParser()

     ## Required parameters
@@ -223,7 +220,6 @@ def parse_arguments():


 def setup_training(args):
-
     assert torch.cuda.is_available()

     if args.local_rank == -1:
@@ -268,7 +264,6 @@ def setup_training(args):


 def prepare_model_and_optimizer(args, device):
-
     # Prepare model
     config = BertConfig.from_json_file(args.config_file)
@@ -314,7 +309,6 @@ def prepare_model_and_optimizer(args, device):
         optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=args.max_steps
     )
     if args.fp16:
-
         if args.loss_scale == 0:
             # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
             model, optimizer = amp.initialize(
@@ -367,7 +361,6 @@ def prepare_model_and_optimizer(args, device):


 def take_optimizer_step(args, optimizer, model, overflow_buf, global_step):
-
     if args.allreduce_post_accumulation:
         # manually allreduce gradients after all accumulation steps
         # check for Inf/NaN
@@ -425,7 +418,6 @@ def take_optimizer_step(args, optimizer, model, overflow_buf, global_step):


 def main():
-
     args = parse_arguments()
     random.seed(args.seed)
     np.random.seed(args.seed)
@@ -511,7 +503,6 @@ def main():
             overflow_buf = torch.cuda.IntTensor([0])

             for f_id in range(f_start_id + 1, len(files)):
-
                 # torch.cuda.synchronize()
                 # f_start = time.time()
                 if torch.distributed.is_initialized() and torch.distributed.get_world_size() > num_files:
diff --git a/orttraining/tools/scripts/opset12_model_transform.py b/orttraining/tools/scripts/opset12_model_transform.py
index c19aceb6216d8..4c308eb96159b 100644
--- a/orttraining/tools/scripts/opset12_model_transform.py
+++ b/orttraining/tools/scripts/opset12_model_transform.py
@@ -27,6 +27,7 @@
 output_model_name = input_model_name[:-5] + "_opset12.onnx"
 model = onnx.load(input_model_name)
+

 # for a given node input, look thru the graph nodes and find the node
 # whose output is matching the input
 def find_input_node(model, arg):
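
The utils_multiple_choice.py hunk above shows a common multi-worker caching pattern: a filelock guard so that only one process builds the feature cache while the others block and then load it. A minimal sketch of the same pattern; the function and path names here are illustrative:

    import os

    import torch
    from filelock import FileLock


    def load_or_build_features(cached_features_file, build_fn, overwrite_cache=False):
        # Only one process may build the cache; the rest block on the lock
        # and then load whatever the first process wrote.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):
            if os.path.exists(cached_features_file) and not overwrite_cache:
                return torch.load(cached_features_file)
            features = build_fn()
            torch.save(features, cached_features_file)
            return features
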
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index d552fb71b6547..4b35655a2a893 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1471,7 +1471,6 @@ def setup_tensorrt_vars(args):


 def setup_migraphx_vars(args):
-
     migraphx_home = None

     if args.use_migraphx:
diff --git a/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py b/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py
index 6d7c70ae953fb..048125f4b977c 100644
--- a/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py
+++ b/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py
@@ -11,7 +11,6 @@


 def _check_binary_size(path, readelf, threshold, os_str, arch, build_config):
-
     print("Checking binary size of {} using {}".format(path, readelf))
     ondisk_size = os.path.getsize(path)
diff --git a/tools/ci_build/op_registration_utils.py b/tools/ci_build/op_registration_utils.py
index 5120552e81330..d59aabf6958d4 100644
--- a/tools/ci_build/op_registration_utils.py
+++ b/tools/ci_build/op_registration_utils.py
@@ -230,7 +230,6 @@ def process_kernel_registration_file(
     offset = 0

     while offset < len(lines):
-
         line = lines[offset]
         stripped = line.strip()
diff --git a/tools/ci_build/op_registration_validator.py b/tools/ci_build/op_registration_validator.py
index 975c06ba1fb25..c52c82f328f77 100644
--- a/tools/ci_build/op_registration_validator.py
+++ b/tools/ci_build/op_registration_validator.py
@@ -97,7 +97,6 @@ def validate_last_registrations(self):


 if __name__ == "__main__":
-
     parser = argparse.ArgumentParser(description="Script to validate operator kernel registrations.")

     parser.add_argument(
diff --git a/tools/nuget/validate_package.py b/tools/nuget/validate_package.py
index 5baa2f603c5d7..7b412cd7f1e97 100644
--- a/tools/nuget/validate_package.py
+++ b/tools/nuget/validate_package.py
@@ -89,7 +89,7 @@ def check_if_dlls_are_present(
     platforms = platforms_supported.strip().split(",")
     if package_type == "tarball":
         file_list_in_package = list()
-        for (dirpath, dirnames, filenames) in os.walk(package_path):
+        for dirpath, dirnames, filenames in os.walk(package_path):
             file_list_in_package += [os.path.join(dirpath, file) for file in filenames]
     else:
         file_list_in_package = zip_file.namelist()
diff --git a/tools/python/create_reduced_build_config.py b/tools/python/create_reduced_build_config.py
index f7bbe5001c685..bda913924b5ae 100644
--- a/tools/python/create_reduced_build_config.py
+++ b/tools/python/create_reduced_build_config.py
@@ -74,7 +74,6 @@ def _extract_ops_from_onnx_model(model_files: typing.Iterable[pathlib.Path]):


 def create_config_from_onnx_models(model_files: typing.Iterable[pathlib.Path], output_file: pathlib.Path):
-
     required_ops = _extract_ops_from_onnx_model(model_files)

     output_file.parent.mkdir(parents=True, exist_ok=True)
diff --git a/tools/python/example_operator_perf_test.py b/tools/python/example_operator_perf_test.py
index 50a3edd5c9b27..226f63a21e63e 100644
--- a/tools/python/example_operator_perf_test.py
+++ b/tools/python/example_operator_perf_test.py
@@ -70,7 +70,6 @@ def create_test_input(n, num_items, k):
 # Example code that tests various combinations of input sizes.
 #
 def run_perf_tests(model_path, num_threads=1):
-
     so = rt.SessionOptions()
     so.intra_op_num_threads = num_threads
     sess = rt.InferenceSession(model_path, sess_options=so)
@@ -128,7 +127,6 @@ def run_test():
 # so that the model can be easily run directly or from a debugger.
 #
 def create_example_test_directory():
-
     # fill in the inputs that we want to use specific values for
     input_data = {}
     input_data["K"] = np.asarray([64]).astype(np.int64)
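
Two hunks in this stretch (utils_multiple_choice.py earlier and validate_package.py above) apply the same mechanical cleanup: parentheses around a for-loop's unpacking target are redundant in Python, so `for (a, b) in xs:` becomes `for a, b in xs:`. The parentheses are purely syntactic and both spellings behave identically; a quick illustration:

    pairs = [("a", 1), ("b", 2)]

    # Before: parenthesized target (legal, but flagged by modern linters).
    for (name, value) in pairs:
        print(name, value)

    # After: same loop, parentheses dropped, no change in behavior.
    for name, value in pairs:
        print(name, value)
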
diff --git a/tools/python/find_optimizer_opset_version_updates_required.py b/tools/python/find_optimizer_opset_version_updates_required.py
index bad3258e93887..6f16d2469eb26 100644
--- a/tools/python/find_optimizer_opset_version_updates_required.py
+++ b/tools/python/find_optimizer_opset_version_updates_required.py
@@ -164,7 +164,6 @@ def get_latest_onnx_op_versions(root_dir):


 def find_potential_issues(root_dir, op_to_opset):
-
     optimizer_dir = os.path.join(root_dir, "onnxruntime/core/optimizer")
     files = glob.glob(optimizer_dir + "/**/*.cc", recursive=True)
diff --git a/tools/python/gen_contrib_doc.py b/tools/python/gen_contrib_doc.py
index 15e7f65d093d9..0927ce92b495a 100644
--- a/tools/python/gen_contrib_doc.py
+++ b/tools/python/gen_contrib_doc.py
@@ -313,7 +313,6 @@ def support_level_str(level):  # type: (OpSchema.SupportType) -> Text


 def main(output_path: str, domain_filter: [str]):
-
     with io.open(output_path, "w", newline="", encoding="utf-8") as fout:
         fout.write("## Contrib Operator Schemas\n")
         fout.write(
diff --git a/tools/python/gen_opkernel_doc.py b/tools/python/gen_opkernel_doc.py
index e399b00c97fcd..4fded44369bf2 100644
--- a/tools/python/gen_opkernel_doc.py
+++ b/tools/python/gen_opkernel_doc.py
@@ -61,7 +61,6 @@ def expand_providers(provider_filter: [str]):


 def main(output_path: pathlib.Path, provider_filter: [str]):
-
     providers = expand_providers(provider_filter)

     with io.open(output_path, "w", newline="", encoding="utf-8") as fout:
diff --git a/tools/python/util/convert_onnx_models_to_ort.py b/tools/python/util/convert_onnx_models_to_ort.py
index 9331c48d0c5f5..58c7ed86132a8 100644
--- a/tools/python/util/convert_onnx_models_to_ort.py
+++ b/tools/python/util/convert_onnx_models_to_ort.py
@@ -83,7 +83,6 @@ def _convert(
     target_platform: str,
     session_options_config_entries: typing.Dict[str, str],
 ) -> typing.List[pathlib.Path]:
-
     model_dir = model_path_or_dir if model_path_or_dir.is_dir() else model_path_or_dir.parent
     output_dir = output_dir or model_dir
@@ -118,7 +117,6 @@ def is_model_file_to_convert(file_path: pathlib.Path):

     for model in models:
         try:
-
             relative_model_path = model.relative_to(model_dir)
             (output_dir / relative_model_path).parent.mkdir(parents=True, exist_ok=True)
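
The convert_onnx_models_to_ort.py hunk above reproduces each model's position under the input directory inside the output directory before writing. A minimal sketch of that relative-path mirroring with pathlib; the function name is illustrative:

    import pathlib


    def mirror_path(model: pathlib.Path, model_dir: pathlib.Path, output_dir: pathlib.Path) -> pathlib.Path:
        # Keep the model's location relative to model_dir, but rooted at
        # output_dir, creating intermediate directories as needed.
        relative = model.relative_to(model_dir)
        target = output_dir / relative
        target.parent.mkdir(parents=True, exist_ok=True)
        return target
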
diff --git a/tools/python/util/mobile_helpers/usability_checker.py b/tools/python/util/mobile_helpers/usability_checker.py
index f286544fa510e..4c31425d2d2e9 100644
--- a/tools/python/util/mobile_helpers/usability_checker.py
+++ b/tools/python/util/mobile_helpers/usability_checker.py
@@ -85,7 +85,6 @@ def __init__(self):
         self.nodes_unsupported_due_to_dynamic_input = -1

     def suitability(self):
-
         # for now add up all the nodes. if there are subgraphs, the percentage of covered nodes will be reduced by all
         # nodes in the subgraphs.
         num_nodes = self.num_nodes + self.num_nodes_in_subgraphs
@@ -465,7 +464,6 @@ def check_shapes(graph: onnx.GraphProto, logger: logging.Logger = None):


 def checker(model_path, logger: logging.Logger):
-
     model = onnx.load(model_path)
     model_with_shape_info = onnx.shape_inference.infer_shapes(model)
diff --git a/tools/python/util/ort_format_model/operator_type_usage_processors.py b/tools/python/util/ort_format_model/operator_type_usage_processors.py
index 8f21298518f87..2968cea21b24e 100644
--- a/tools/python/util/ort_format_model/operator_type_usage_processors.py
+++ b/tools/python/util/ort_format_model/operator_type_usage_processors.py
@@ -589,7 +589,6 @@ def restore_from_config_entry(self, domain: str, optype: str, config_entry: str)
             op_processor.from_config_entry(config_entry)

     def debug_dump(self):
-
         print("C++ code that will be emitted:")
         [print(cpp_line) for cpp_line in self.get_cpp_entries()]
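
One nit this cleanup leaves untouched: debug_dump uses a list comprehension purely for its side effect, which builds and discards a throwaway list of None values. A plain loop expresses the same thing more idiomatically; a minimal sketch, with `entries` standing in for `self.get_cpp_entries()`:

    def debug_dump(entries):
        # A plain for loop avoids materializing a list that is never used.
        print("C++ code that will be emitted:")
        for cpp_line in entries:
            print(cpp_line)
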