Merge branch 'master' into directives

KernelTuner · Dec 17, 2024 · 69ec5ac
2 parents b207e06 + ac05da3
commit 69ec5ac
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 13 deletions.
diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py
@@ -18,6 +18,7 @@
     get_temp_filename,
     delete_temp_file,
     write_file,
+    SkippableFailure,
 )
 
 try:
@@ -260,12 +261,23 @@ def compile(self, kernel_instance):
             if platform.system() == "Darwin":
                 lib_extension = ".dylib"
 
-            subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
-            subprocess.check_call(
+            subprocess.run(
+                [self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True
+            )
+
+            subprocess.run(
                 [self.compiler, filename + ".o"]
                 + compiler_options
                 + ["-shared", "-o", filename + lib_extension]
-                + lib_args
+                + lib_args,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True
             )
 
             self.lib = np.ctypeslib.load_library(filename, ".")
@@ -385,11 +397,17 @@ def refresh_memory(self, arguments, should_sync):
                 self.memcpy_dtoh(arg, self.allocations[i])
 
     def cleanup_lib(self):
-        """Unload the previously loaded shared library"""
+        """Unload the previously loaded shared library."""
+        if self.lib is None:
+            return
+
         if not self.using_openmp and not self.using_openacc:
             # this if statement is necessary because shared libraries that use
             # OpenMP will core dump when unloaded, this is a well-known issue with OpenMP
             logging.debug("unloading shared library")
-            _ctypes.dlclose(self.lib._handle)
+            try:
+                _ctypes.dlclose(self.lib._handle)
+            finally:
+                self.lib = None
 
     units = {}
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
@@ -619,9 +619,13 @@ def compile_kernel(self, instance, verbose):
             shared_mem_error_messages = [
                 "uses too much shared data",
                 "local memory limit exceeded",
+                r"local memory \(\d+\) exceeds limit \(\d+\)",
             ]
-            if any(msg in str(e) for msg in shared_mem_error_messages):
-                logging.debug("compile_kernel failed due to kernel using too much shared memory")
+            error_message = str(e.stderr) if hasattr(e, "stderr") else str(e)
+            if any(re.search(msg, error_message) for msg in shared_mem_error_messages):
+                logging.debug(
+                    "compile_kernel failed due to kernel using too much shared memory"
+                )
                 if verbose:
                     print(
                         f"skipping config {util.get_instance_string(instance.params)} reason: too much shared memory used"
@@ -683,7 +687,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose)
         )
 
         # check for templated kernel
-        if kernel_source.lang in ["CUDA", "NVCUDA"] and "<" in name and ">" in name:
+        if kernel_source.lang in ["CUDA", "NVCUDA", "HIP"] and "<" in name and ">" in name:
             kernel_string, name = wrap_templated_kernel(kernel_string, name)
 
         # Preprocess GPU arguments. Require for handling `Tunable` arguments

diff --git a/kernel_tuner/observers/pmt.py b/kernel_tuner/observers/pmt.py
@@ -125,9 +125,7 @@ def after_finish(self):
 
     def get_results(self):
         average_kernel_execution_time_ms = self.results["time"]
-
-        averages = {key: np.average(values) for key, values in self.results.items()}
-        self.parent.initialize_results(self.parent.pm_names)
+        averages = self.parent.get_results()
 
         # correct energy measurement, because current _energy number is collected over the entire duration
         # we estimate energy as the average power over the continuous duration times the kernel execution time

diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py
@@ -188,11 +188,11 @@ def test_compile_detects_device_code(npct, subprocess):
     cfunc = CompilerFunctions()
     cfunc.compile(kernel_instance)
 
-    print(subprocess.check_call.call_args_list)
+    print(subprocess.run.call_args_list)
 
     # assert the filename suffix used for source compilation is .cu
     dot_cu_used = False
-    for call in subprocess.check_call.call_args_list:
+    for call in subprocess.run.call_args_list:
         args, kwargs = call
         args = args[0]
         print(args)