Skip to content

Commit

Permalink
Merge branch 'master' into directives
Browse files Browse the repository at this point in the history
  • Loading branch information
isazi committed Dec 17, 2024
2 parents b207e06 + ac05da3 commit 69ec5ac
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 13 deletions.
28 changes: 23 additions & 5 deletions kernel_tuner/backends/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
get_temp_filename,
delete_temp_file,
write_file,
SkippableFailure,
)

try:
Expand Down Expand Up @@ -260,12 +261,23 @@ def compile(self, kernel_instance):
if platform.system() == "Darwin":
lib_extension = ".dylib"

subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
subprocess.check_call(
subprocess.run(
[self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True
)

subprocess.run(
[self.compiler, filename + ".o"]
+ compiler_options
+ ["-shared", "-o", filename + lib_extension]
+ lib_args
+ lib_args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True
)

self.lib = np.ctypeslib.load_library(filename, ".")
Expand Down Expand Up @@ -385,11 +397,17 @@ def refresh_memory(self, arguments, should_sync):
self.memcpy_dtoh(arg, self.allocations[i])

def cleanup_lib(self):
"""Unload the previously loaded shared library"""
"""unload the previously loaded shared library"""
if self.lib is None:
return

if not self.using_openmp and not self.using_openacc:
# this if statement is necessary because shared libraries that use
# OpenMP will core dump when unloaded; this is a well-known issue with OpenMP
logging.debug("unloading shared library")
_ctypes.dlclose(self.lib._handle)
try:
_ctypes.dlclose(self.lib._handle)
finally:
self.lib = None

units = {}
10 changes: 7 additions & 3 deletions kernel_tuner/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,9 +619,13 @@ def compile_kernel(self, instance, verbose):
shared_mem_error_messages = [
"uses too much shared data",
"local memory limit exceeded",
r"local memory \(\d+\) exceeds limit \(\d+\)",
]
if any(msg in str(e) for msg in shared_mem_error_messages):
logging.debug("compile_kernel failed due to kernel using too much shared memory")
error_message = str(e.stderr) if hasattr(e, "stderr") else str(e)
if any(re.search(msg, error_message) for msg in shared_mem_error_messages):
logging.debug(
"compile_kernel failed due to kernel using too much shared memory"
)
if verbose:
print(
f"skipping config {util.get_instance_string(instance.params)} reason: too much shared memory used"
Expand Down Expand Up @@ -683,7 +687,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose)
)

# check for templated kernel
if kernel_source.lang in ["CUDA", "NVCUDA"] and "<" in name and ">" in name:
if kernel_source.lang in ["CUDA", "NVCUDA", "HIP"] and "<" in name and ">" in name:
kernel_string, name = wrap_templated_kernel(kernel_string, name)

# Preprocess GPU arguments. Required for handling `Tunable` arguments
Expand Down
4 changes: 1 addition & 3 deletions kernel_tuner/observers/pmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,7 @@ def after_finish(self):

def get_results(self):
average_kernel_execution_time_ms = self.results["time"]

averages = {key: np.average(values) for key, values in self.results.items()}
self.parent.initialize_results(self.parent.pm_names)
averages = self.parent.get_results()

# correct energy measurement, because current _energy number is collected over the entire duration
# we estimate energy as the average power over the continuous duration times the kernel execution time
Expand Down
4 changes: 2 additions & 2 deletions test/test_compiler_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,11 +188,11 @@ def test_compile_detects_device_code(npct, subprocess):
cfunc = CompilerFunctions()
cfunc.compile(kernel_instance)

print(subprocess.check_call.call_args_list)
print(subprocess.run.call_args_list)

# assert the filename suffix used for source compilation is .cu
dot_cu_used = False
for call in subprocess.check_call.call_args_list:
for call in subprocess.run.call_args_list:
args, kwargs = call
args = args[0]
print(args)
Expand Down

0 comments on commit 69ec5ac

Please sign in to comment.